o
    p’×ií  ã                   @   sÀ  d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
  mZ d dlm
Z
 ddlmZ ddlmZmZ ddlmZ 					
dwdejdededededefdd„Z		dxdedeeeeef f dedededejfdd„Z	dydd„Zdd „ Zd!d"„ ZG d#d$„ d$e
jƒZG d%d&„ d&e
jƒZ G d'd(„ d(e
jƒZ!dzd*d+„Z"d{d,d-„Z#dxd.d/„Z$	0					)d|d1ed2eejef d3efd4d5„Z%	)	6d}d7ejd8eejeej f d9ed:edeejejf f
d;d<„Z&G d=d>„ d>e
jƒZ'G d?d@„ d@e
jƒZ(G dAdB„ dBe
jƒZ)G dCdD„ dDe
jƒZ*G dEdF„ dFe
jƒZ+G dGdH„ dHe
jƒZ,G dIdJ„ dJe
jƒZ-G dKdL„ dLe
jƒZ.G dMdN„ dNe
jƒZ/G dOdP„ dPe
jƒZ0G dQdR„ dRe
jƒZ1G dSdT„ dTe
jƒZ2G dUdV„ dVe
jƒZ3G dWdX„ dXe
jƒZ4G dYdZ„ dZe
jƒZ5G d[d\„ d\e
jƒZ6G d]d^„ d^e
jƒZ7G d_d`„ d`e
jƒZ8G dadb„ dbe
jƒZ9G dcdd„ dde
jƒZ:G dedf„ dfe
jƒZ;dgdh„ Z<G didj„ dje
jƒZ=G dkdl„ dle
jƒZ>G dmdn„ dne
jƒZ?G dodp„ dpe
jƒZ@G dqdr„ dre
jƒZAG dsdt„ dte
jƒZBG dudv„ dve
jƒZCdS )~é    N)ÚListÚOptionalÚTupleÚUnion)Únné   )Ú	deprecateé   )ÚFP32SiLUÚget_activation)Ú	AttentionFé'  Ú	timestepsÚembedding_dimÚflip_sin_to_cosÚdownscale_freq_shiftÚscaleÚ
max_periodc           	      C   sö   t | jƒdksJ dƒ‚|d }t |¡ tjd|tj| jd }|||  }t |¡}| dd…df  	¡ |ddd…f  }|| }tj
t |¡t |¡gdd}|rktj
|dd…|d…f |dd…d|…f gdd}|d dkrytjj |d	¡}|S )
a&  
    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.

    Args
        timesteps (torch.Tensor):
            a 1-D Tensor of N indices, one per batch element. These may be fractional.
        embedding_dim (int):
            the dimension of the output.
        flip_sin_to_cos (bool):
            Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
        downscale_freq_shift (float):
            Controls the delta between frequencies between dimensions
        scale (float):
            Scaling factor applied to the embeddings.
        max_period (int):
            Controls the maximum frequency of the embeddings
    Returns
        torch.Tensor: an [N x dim] Tensor of positional embeddings.
    r	   zTimesteps should be a 1d-arrayr   r   )ÚstartÚendÚdtypeÚdeviceNéÿÿÿÿ©Údim)r   r	   r   r   )ÚlenÚshapeÚmathÚlogÚtorchÚarangeÚfloat32r   ÚexpÚfloatÚcatÚsinÚcosr   Ú
functionalÚpad)	r   r   r   r   r   r   Úhalf_dimÚexponentÚemb© r,   úY/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/embeddings.pyÚget_timestep_embedding   s   ÿ
$2r.   ç      ð?Ú	embed_dimÚspatial_sizeÚtemporal_sizeÚspatial_interpolation_scaleÚtemporal_interpolation_scaleÚreturnc                 C   s8  | d dkr
t dƒ‚t|tƒr||f}d|  d }| d }tj|d tjd| }tj|d tjd| }t ||¡}	tj|	dd}	|	 dd|d |d g¡}	t	||	ƒ}
tj|tjd| }t
||ƒ}|
tjd	d	…d	d	…f }
tj|
|dd}
|d	d	…tjd	d	…f }tj||d |d  dd}tj||
gd
d}|S )zý
    Args:
        embed_dim (`int`):
        spatial_size (`int` or `Tuple[int, int]`):
        temporal_size (`int`):
        spatial_interpolation_scale (`float`, defaults to 1.0):
        temporal_interpolation_scale (`float`, defaults to 1.0):
    é   r   z"`embed_dim` must be divisible by 4é   r	   ©r   ©Úaxisr   Nr   )Ú
ValueErrorÚ
isinstanceÚintÚnpr    r!   ÚmeshgridÚstackÚreshapeÚ!get_2d_sincos_pos_embed_from_gridÚ!get_1d_sincos_pos_embed_from_gridÚnewaxisÚrepeatÚconcatenate)r0   r1   r2   r3   r4   Úembed_dim_spatialÚembed_dim_temporalÚgrid_hÚgrid_wÚgridÚpos_embed_spatialÚgrid_tÚpos_embed_temporalÚ	pos_embedr,   r,   r-   Úget_3d_sincos_pos_embedQ   s(   


rP   é   c           
      C   sÄ   t |tƒr	||f}tj|d tjd|d |  | }tj|d tjd|d |  | }t ||¡}tj|dd}| dd|d |d g¡}t| |ƒ}	|r`|dkr`tj	t 
|| g¡|	gdd}	|	S )z©
    grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
    [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    r   r8   r	   r9   r   )r<   r=   r>   r    r!   r?   r@   rA   rB   rF   Úzeros)
r0   Ú	grid_sizeÚ	cls_tokenÚextra_tokensÚinterpolation_scaleÚ	base_sizerI   rJ   rK   rO   r,   r,   r-   Úget_2d_sincos_pos_embed€   s   
$$
rX   c                 C   sN   | d dkr
t dƒ‚t| d |d ƒ}t| d |d ƒ}tj||gdd}|S )Nr   r   ú embed_dim must be divisible by 2r	   r9   )r;   rC   r>   rF   )r0   rK   Úemb_hÚemb_wr+   r,   r,   r-   rB   –   s   rB   c                 C   s‚   | d dkr
t dƒ‚tj| d tjd}|| d  }dd|  }| d¡}t d	||¡}t |¡}t |¡}tj||gd
d}|S )zu
    embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
    r   r   rY   r8   g       @r/   r   r   zm,d->mdr	   r9   )	r;   r>   r    Úfloat64rA   Úeinsumr%   r&   rF   )r0   ÚposÚomegaÚoutÚemb_sinÚemb_cosr+   r,   r,   r-   rC   ¢   s   


rC   c                       sH   e Zd ZdZ												
d‡ fdd„	Zdd„ Zdd„ Z‡  ZS )Ú
PatchEmbedz:2D Image to Patch Embedding with support for SD3 cropping.éà   rQ   r7   é   FTr	   ÚsincosNc                    s  t ƒ  ¡  || ||  }|| _|| _|| _tj||||f||d| _|r/tj|ddd| _	nd | _	|| _
|| || | _| _|| | _|	| _|rM|}nt|d ƒ}|
d u r\d | _d S |
dkr‚t||| j| jd}|rndnd}| jd	t |¡ ¡  d
¡|d d S td|
› ƒ‚)N©Úkernel_sizeÚstrideÚbiasFgíµ ÷Æ°>)Úelementwise_affineÚepsç      à?rf   )rW   rV   TrO   r   )Ú
persistentzUnsupported pos_embed_type: )ÚsuperÚ__init__ÚflattenÚ
layer_normÚpos_embed_max_sizer   ÚConv2dÚprojÚ	LayerNormÚnormÚ
patch_sizeÚheightÚwidthrW   rV   r=   rO   rX   Úregister_bufferr   Ú
from_numpyr#   Ú	unsqueezer;   )Úselfry   rz   rx   Úin_channelsr0   rr   rq   rj   rV   Úpos_embed_typers   Únum_patchesrS   rO   rn   ©Ú	__class__r,   r-   rp   º   s6   
ÿ

ÿ$zPatchEmbed.__init__c                 C   sÜ   | j du r	tdƒ‚|| j }|| j }|| j kr$td|› d| j › dƒ‚|| j kr5td|› d| j › dƒ‚| j | d }| j | d }| j d| j | j d	¡}|dd…||| …||| …dd…f }| dd	|jd	 ¡}|S )
z2Crops positional embeddings for SD3 compatibility.Nz.`pos_embed_max_size` must be set for cropping.zHeight (z/) cannot be greater than `pos_embed_max_size`: Ú.zWidth (r   r	   r   )rs   r;   rx   rO   rA   r   )r~   ry   rz   ÚtopÚleftÚspatial_pos_embedr,   r,   r-   Úcropped_pos_embedí   s$   



ÿ
ÿ(zPatchEmbed.cropped_pos_embedc                 C   s  | j d ur|jdd … \}}n|jd | j |jd | j }}|  |¡}| jr1| d¡ dd¡}| jr9|  |¡}| jd u rD| 	|j
¡S | j rN|  ||¡}n,| j|ksX| j|krwt| jjd ||f| j| jd}t |¡ ¡  d¡ 	|j¡}n| j}||  	|j
¡S )Néþÿÿÿr   r   r	   )r0   rS   rW   rV   r   )rs   r   rx   ru   rq   Ú	transposerr   rw   rO   Útor   rˆ   ry   rz   rX   rW   rV   r   r|   r#   r}   r   )r~   Úlatentry   rz   rO   r,   r,   r-   Úforward  s,   
"



üzPatchEmbed.forward)rd   rd   rQ   r7   re   FTTr	   rf   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rp   rˆ   r   Ú__classcell__r,   r,   r‚   r-   rc   ·   s     ô3rc   c                       s*   e Zd ZdZd
‡ fdd„	Zdd	„ Z‡  ZS )ÚLuminaPatchEmbedz72D Image to Patch Embedding with support for Lumina-T2Xr   r6   re   Tc                    s.   t ƒ  ¡  || _tj|| | ||d| _d S )N©Úin_featuresÚout_featuresrj   )ro   rp   rx   r   ÚLinearru   )r~   rx   r   r0   rj   r‚   r,   r-   rp   &  s   

ýzLuminaPatchEmbed.__init__c                 C   sÖ   |  |d j¡}| j }}| ¡ \}}}}|| || }	}
| |||	||
|¡ dddddd¡}| d¡}|  |¡}| dd¡}tj	|j
d |j
d tj|jd}||||fg| |d|	…d|
…f  dd¡ d¡fS )	aÉ  
        Patchifies and embeds the input tensor(s).

        Args:
            x (List[torch.Tensor] | torch.Tensor): The input tensor(s) to be patchified and embedded.

        Returns:
            Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], torch.Tensor]: A tuple containing the patchified
            and embedded tensor(s), the mask indicating the valid patches, the original image size(s), and the
            frequency tensor(s).
        r   r   r6   r	   r7   é   )r   r   N)r‹   r   rx   ÚsizeÚviewÚpermuterq   ru   r   Úonesr   Úint32r}   )r~   ÚxÚ	freqs_cisÚpatch_heightÚpatch_widthÚ
batch_sizeÚchannelry   rz   Úheight_tokensÚwidth_tokensÚmaskr,   r,   r-   r   /  s    
ÿ

" üzLuminaPatchEmbed.forward)r   r6   re   T©rŽ   r   r   r‘   rp   r   r’   r,   r,   r‚   r-   r“   #  s    	r“   c                       sX   e Zd Z					ddededed	ed
eddf‡ fdd„Zdejdejfdd„Z‡  Z	S )ÚCogVideoXPatchEmbedr   rQ   é€  é   Trx   r   r0   Útext_embed_dimrj   r5   Nc                    s<   t ƒ  ¡  || _tj||||f||d| _t ||¡| _d S )Nrg   )ro   rp   rx   r   rt   ru   r—   Ú	text_proj)r~   rx   r   r0   r«   rj   r‚   r,   r-   rp   R  s   
ÿzCogVideoXPatchEmbed.__init__Útext_embedsÚimage_embedsc           	      C   sŠ   |   |¡}|j\}}}}}| d|||¡}|  |¡}|j||g|jdd… ¢R Ž }| d¡ dd¡}| dd¡}tj||gdd 	¡ }|S )a7  
        Args:
            text_embeds (`torch.Tensor`):
                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
            image_embeds (`torch.Tensor`):
                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
        r   r	   Nr7   r   r   )
r¬   r   rA   ru   rš   rq   rŠ   r   r$   Ú
contiguous)	r~   r­   r®   ÚbatchÚ
num_framesÚchannelsry   rz   Úembedsr,   r,   r-   r   b  s   

ÿþzCogVideoXPatchEmbed.forward)r   rQ   r©   rª   T)
rŽ   r   r   r=   Úboolrp   r   ÚTensorr   r’   r,   r,   r‚   r-   r¨   Q  s(    úþýüûúùr¨   Tc           
      C   s”   |\}}t j|d |d |d dt jd}t j|d |d |d dt jd}t  ||¡}t j|dd}| ddg|jdd… ¢¡}t| ||d}	|	S )	a  
    RoPE for image tokens with 2d structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size
    crops_coords (`Tuple[int]`)
        The top-left and bottom-right coordinates of the crop.
    grid_size (`Tuple[int]`):
        The grid size of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r   F)Úendpointr   r	   r9   r   N©Úuse_real)r>   Úlinspacer!   r?   r@   rA   r   Ú!get_2d_rotary_pos_embed_from_grid)
r0   Úcrops_coordsrS   r¸   r   ÚstoprI   rJ   rK   rO   r,   r,   r-   Úget_2d_rotary_pos_embedy  s   ""r½   c                 C   sž   | d dksJ ‚t | d |d  d¡|d}t | d |d  d¡|d}|rDtj|d |d gdd}tj|d |d gdd}||fS tj||gdd}|S )Nr6   r   r   r   r·   r	   r   )Úget_1d_rotary_pos_embedrA   r   r$   )r0   rK   r¸   rZ   r[   r&   r%   r+   r,   r,   r-   rº   •  s   ÿÿrº   c                 C   s”   | d dksJ ‚t | d |||d}t | d |||d}| |d| d d¡ d|dd¡}| d|| d d¡ |ddd¡}tj||gdd d¡}|S )Nr6   r   r   )Úlinear_factorÚ
ntk_factorr	   r   r   )r¾   rš   rE   r   r$   rq   )r0   Úlen_hÚlen_wr¿   rÀ   rZ   r[   r+   r,   r,   r-   Úget_2d_rotary_pos_embed_lumina©  s   ÿÿ  rÃ   ç     ˆÃ@r   r^   Úthetac                 C   s  | d dksJ ‚t |tƒrt |¡}|| }d|t d| d¡d| d …  ¡ |    | }t |¡ |j¡}t 	||¡ ¡ }|rX|rX| 
¡ jddd}	| ¡ jddd}
|	|
fS |rxtj| 
¡ | 
¡ gdd}	tj| ¡ | ¡ gdd}
|	|
fS t t |¡|¡}|S )a  
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end
    index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
    data type.

    Args:
        dim (`int`): Dimension of the frequency tensor.
        pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
        theta (`float`, *optional*, defaults to 10000.0):
            Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (`bool`, *optional*):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        linear_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the context extrapolation. Defaults to 1.0.
        ntk_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
        repeat_interleave_real (`bool`, *optional*, defaults to `True`):
            If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
            Otherwise, they are concateanted with themselves.
    Returns:
        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
    r   r   r/   Nr	   r   r   )r<   r=   r>   r    r   r#   r|   r‹   r   Úouterr&   Úrepeat_interleaver%   r$   ÚpolarÚ	ones_like)r   r^   rÅ   r¸   r¿   rÀ   Úrepeat_interleave_realÚfreqsÚtÚ	freqs_cosÚ	freqs_sinrŸ   r,   r,   r-   r¾   ¹  s"   !

.r¾   r   rž   rŸ   r¸   Úuse_real_unbind_dimc                 C   sX  |r‚|\}}|d }|d }|  | j¡|  | j¡}}|dkrC| jg | jdd… ¢d‘d‘R Ž  d¡\}}tj| |gdd d¡}n-|dkrh| jg | jdd… ¢d‘d‘R Ž  d¡\}}tj| |gdd}nt	d|› d	ƒ‚|  
¡ | | 
¡ |    | j¡}	|	S t |  
¡ jg | jdd… ¢d‘d‘R Ž ¡}| d¡}t || ¡ d¡}
|
 | ¡S )
a3  
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    )NNr   Nr   r   r7   r‰   z`use_real_unbind_dim=z` but should be -1 or -2.)r‹   r   rA   r   Úunbindr   r@   rq   r$   r;   r#   r   Úview_as_complexr}   Úview_as_realÚtype_as)rž   rŸ   r¸   rÏ   r&   r%   Úx_realÚx_imagÚ	x_rotatedr`   Úx_outr,   r,   r-   Úapply_rotary_embï  s$   ,, ,

rØ   c                       sL   e Zd Z					ddededededee f
‡ fd	d
„Zddd„Z‡  ZS )ÚTimestepEmbeddingÚsiluNTr   Útime_embed_dimÚact_fnÚout_dimÚpost_act_fnc           	         sˆ   t ƒ  ¡  t |||¡| _|d urtj||dd| _nd | _t|ƒ| _|d ur*|}n|}t |||¡| _|d u r=d | _	d S t|ƒ| _	d S )NF©rj   )
ro   rp   r   r—   Úlinear_1Ú	cond_projr   ÚactÚlinear_2Úpost_act)	r~   r   rÛ   rÜ   rÝ   rÞ   Úcond_proj_dimÚsample_proj_biasÚtime_embed_dim_outr‚   r,   r-   rp      s   



zTimestepEmbedding.__init__c                 C   sV   |d ur||   |¡ }|  |¡}| jd ur|  |¡}|  |¡}| jd ur)|  |¡}|S ©N)rá   rà   râ   rã   rä   )r~   ÚsampleÚ	conditionr,   r,   r-   r   @  s   





zTimestepEmbedding.forward)rÚ   NNNTrè   )	rŽ   r   r   r=   Ústrr   rp   r   r’   r,   r,   r‚   r-   rÙ     s$    øþýüûú rÙ   c                	       s8   e Zd Zd
dedededef‡ fdd„Zdd	„ Z‡  ZS )Ú	Timestepsr	   Únum_channelsr   r   r   c                    s&   t ƒ  ¡  || _|| _|| _|| _d S rè   )ro   rp   rí   r   r   r   )r~   rí   r   r   r   r‚   r,   r-   rp   P  s
   

zTimesteps.__init__c                 C   s   t || j| j| j| jd}|S )N)r   r   r   )r.   rí   r   r   r   )r~   r   Út_embr,   r,   r-   r   W  s   ûzTimesteps.forward)r	   )	rŽ   r   r   r=   r´   r#   rp   r   r’   r,   r,   r‚   r-   rì   O  s     rì   c                       s6   e Zd ZdZ	ddedef‡ fdd	„Zd
d„ Z‡  ZS )ÚGaussianFourierProjectionz-Gaussian Fourier embeddings for noise levels.é   r/   TFÚembedding_sizer   c                    sf   t ƒ  ¡  tjt |¡| dd| _|| _|| _|r1| `tjt |¡| dd| _	| j	| _| `	d S d S )NF)Úrequires_grad)
ro   rp   r   Ú	Parameterr   ÚrandnÚweightr   r   ÚW)r~   rñ   r   Úset_W_to_weightr   r   r‚   r,   r-   rp   e  s   
ûz"GaussianFourierProjection.__init__c                 C   s†   | j rt  |¡}|d d …d f | jd d d …f  d tj }| jr2tjt |¡t |¡gdd}|S tjt |¡t |¡gdd}|S )Nr   r   r   )	r   r   rõ   r>   Úpir   r$   r&   r%   )r~   rž   Úx_projr`   r,   r,   r-   r   t  s   
,ÿz!GaussianFourierProjection.forward)rð   r/   TTF)	rŽ   r   r   r‘   r=   r#   rp   r   r’   r,   r,   r‚   r-   rï   b  s    ÿÿÿrï   c                       s4   e Zd ZdZd	dedef‡ fdd„Zdd„ Z‡  ZS )
ÚSinusoidalPositionalEmbeddinga[  Apply positional information to a sequence of embeddings.

    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
    them

    Args:
        embed_dim: (int): Dimension of the positional embedding.
        max_seq_length: Maximum sequence length to apply positional embeddings

    é    r0   Úmax_seq_lengthc                    s    t ƒ  ¡  t |¡ d¡}t t d|d¡t d¡ |  ¡}t d||¡}t 	|| ¡|dd d …dd d…f< t 
|| ¡|dd d …dd d…f< |  d|¡ d S )Nr	   r   r   rÄ   Úpe)ro   rp   r   r    r}   r"   r   r   rR   r%   r&   r{   )r~   r0   rü   ÚpositionÚdiv_termrý   r‚   r,   r-   rp     s   
$""z&SinusoidalPositionalEmbedding.__init__c                 C   s*   |j \}}}|| jd d …d |…f  }|S rè   )r   rý   )r~   rž   Ú_Ú
seq_lengthr,   r,   r-   r   –  s   z%SinusoidalPositionalEmbedding.forward)rû   ©rŽ   r   r   r‘   r=   rp   r   r’   r,   r,   r‚   r-   rú     s    	rú   c                       s:   e Zd ZdZdedededef‡ fdd„Zdd	„ Z‡  ZS )
ÚImagePositionalEmbeddingsa  
    Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
    height and width of the latent space.

    For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092

    For VQ-diffusion:

    Output vector embeddings are used as input for the transformer.

    Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.

    Args:
        num_embed (`int`):
            Number of embeddings for the latent pixels embeddings.
        height (`int`):
            Height of the latent image i.e. the number of height embeddings.
        width (`int`):
            Width of the latent image i.e. the number of width embeddings.
        embed_dim (`int`):
            Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
    Ú	num_embedry   rz   r0   c                    sV   t ƒ  ¡  || _|| _|| _|| _t | j|¡| _t | j|¡| _	t | j|¡| _
d S rè   )ro   rp   ry   rz   r  r0   r   Ú	Embeddingr+   Ú
height_embÚ	width_emb)r~   r  ry   rz   r0   r‚   r,   r-   rp   ´  s   
z"ImagePositionalEmbeddings.__init__c                 C   s¨   |   |¡}|  tj| j|jd d| j¡¡}| d¡}|  tj| j	|jd d| j	¡¡}| d¡}|| }| d| j| j	 d¡}||d d …d |j
d …d d …f  }|S )N©r   r	   r   r   )r+   r  r   r    ry   r   rš   r}   r  rz   r   )r~   Úindexr+   r  r  Úpos_embr,   r,   r-   r   Æ  s   
"
"
$z!ImagePositionalEmbeddings.forwardr  r,   r,   r‚   r-   r  œ  s    þýüûr  c                       s<   e Zd ZdZ‡ fdd„Zd
dd„Zd
dejfdd	„Z‡  Z	S )ÚLabelEmbeddinga7  
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.

    Args:
        num_classes (`int`): The number of classes.
        hidden_size (`int`): The size of the vector embeddings.
        dropout_prob (`float`): The probability of dropping a label.
    c                    s4   t ƒ  ¡  |dk}t || |¡| _|| _|| _d S ©Nr   )ro   rp   r   r  Úembedding_tableÚnum_classesÚdropout_prob)r~   r  Úhidden_sizer  Úuse_cfg_embeddingr‚   r,   r-   rp   ç  s
   

zLabelEmbedding.__init__Nc                 C   sH   |du rt j|jd |jd| jk }nt  |dk¡}t  || j|¡}|S )zB
        Drops labels to enable classifier-free guidance.
        Nr   r  r	   )r   Úrandr   r   r  ÚtensorÚwherer  )r~   ÚlabelsÚforce_drop_idsÚdrop_idsr,   r,   r-   Ú
token_dropî  s
   zLabelEmbedding.token_dropr  c                 C   s6   | j dk}| jr
|s|d ur|  ||¡}|  |¡}|S r  )r  Útrainingr  r  )r~   r  r  Úuse_dropoutÚ
embeddingsr,   r,   r-   r   ù  s
   

zLabelEmbedding.forwardrè   )
rŽ   r   r   r‘   rp   r  r   Ú
LongTensorr   r’   r,   r,   r‚   r-   r  Ý  s
    	
r  c                	       sN   e Zd Z				ddedededef‡ fdd	„Zd
ejdejfdd„Z‡  ZS )ÚTextImageProjectioné   re   é
   r«   Úimage_embed_dimÚcross_attention_dimÚnum_image_text_embedsc                    s6   t ƒ  ¡  || _t || j| ¡| _t ||¡| _d S rè   )ro   rp   r"  r   r—   r®   r¬   )r~   r«   r   r!  r"  r‚   r,   r-   rp     s   
zTextImageProjection.__init__r­   r®   c                 C   s@   |j d }|  |¡}| || jd¡}|  |¡}tj||gddS )Nr   r   r	   r   )r   r®   rA   r"  r¬   r   r$   )r~   r­   r®   r¢   Úimage_text_embedsr,   r,   r-   r     s
   


zTextImageProjection.forward)r  re   re   r  ©	rŽ   r   r   r=   rp   r   rµ   r   r’   r,   r,   r‚   r-   r    s    ûþýüûr  c                       sB   e Zd Z			ddededef‡ fdd„Zdejfd	d
„Z‡  ZS )ÚImageProjectionre   rû   r   r!  r"  c                    s4   t ƒ  ¡  || _t || j| ¡| _t |¡| _d S rè   )ro   rp   r"  r   r—   r®   rv   rw   )r~   r   r!  r"  r‚   r,   r-   rp     s   
zImageProjection.__init__r®   c                 C   s2   |j d }|  |¡}| || jd¡}|  |¡}|S )Nr   r   )r   r®   rA   r"  rw   )r~   r®   r¢   r,   r,   r-   r   )  s
   


zImageProjection.forward)re   re   rû   r$  r,   r,   r‚   r-   r%    s    üþýür%  c                       s.   e Zd Zd‡ fdd„	Zdejfdd„Z‡  ZS )ÚIPAdapterFullImageProjectionr  c                    s8   t ƒ  ¡  ddlm} |||ddd| _t |¡| _d S ©Nr	   ©ÚFeedForwardÚgelu)ÚmultÚactivation_fn)ro   rp   Ú	attentionr)  Úffr   rv   rw   )r~   r   r!  r)  r‚   r,   r-   rp   4  s   
z%IPAdapterFullImageProjection.__init__r®   c                 C   s   |   |  |¡¡S rè   )rw   r.  )r~   r®   r,   r,   r-   r   ;  s   z$IPAdapterFullImageProjection.forward)r  r  ©rŽ   r   r   rp   r   rµ   r   r’   r,   r,   r‚   r-   r&  3  s    r&  c                       s.   e Zd Zd‡ fdd„	Zdejfdd„Z‡  ZS )	ÚIPAdapterFaceIDImageProjectionr  r	   c                    sH   t ƒ  ¡  ddlm} || _|| _|||| |dd| _t |¡| _	d S r'  )
ro   rp   r-  r)  Ú
num_tokensr!  r.  r   rv   rw   )r~   r   r!  r+  r1  r)  r‚   r,   r-   rp   @  s   
z'IPAdapterFaceIDImageProjection.__init__r®   c                 C   s&   |   |¡}| d| j| j¡}|  |¡S )Nr   )r.  rA   r1  r!  rw   )r~   r®   rž   r,   r,   r-   r   I  s   

z&IPAdapterFaceIDImageProjection.forward)r  r  r	   r	   r/  r,   r,   r‚   r-   r0  ?  s    	r0  c                       s(   e Zd Zd‡ fdd„	Zddd„Z‡  ZS )	ÚCombinedTimestepLabelEmbeddingsçš™™™™™¹?c                    s:   t ƒ  ¡  tdddd| _td|d| _t|||ƒ| _d S )Nrð   Tr	   ©rí   r   r   ©r   rÛ   )ro   rp   rì   Ú	time_projrÙ   Útimestep_embedderr  Úclass_embedder)r~   r  r   Úclass_dropout_probr‚   r,   r-   rp   P  s   
z(CombinedTimestepLabelEmbeddings.__init__Nc                 C   s2   |   |¡}|  |j|d¡}|  |¡}|| }|S ©Nr8   )r6  r7  r‹   r8  )r~   ÚtimestepÚclass_labelsÚhidden_dtypeÚtimesteps_projÚtimesteps_embÚconditioningr,   r,   r-   r   W  s
   

z'CombinedTimestepLabelEmbeddings.forward)r3  rè   ©rŽ   r   r   rp   r   r’   r,   r,   r‚   r-   r2  O  s    r2  c                       ó$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )Ú"CombinedTimestepTextProjEmbeddingsc                    s<   t ƒ  ¡  tdddd| _td|d| _t||dd| _d S ©Nrð   Tr   r4  r5  rÚ   )rÜ   )ro   rp   rì   r6  rÙ   r7  ÚPixArtAlphaTextProjectionÚtext_embedder©r~   r   Úpooled_projection_dimr‚   r,   r-   rp   c  s   
z+CombinedTimestepTextProjEmbeddings.__init__c                 C   s4   |   |¡}|  |j|jd¡}|  |¡}|| }|S r:  )r6  r7  r‹   r   rF  )r~   r;  Úpooled_projectionr>  r?  Úpooled_projectionsr@  r,   r,   r-   r   j  s
   

z*CombinedTimestepTextProjEmbeddings.forwardrA  r,   r,   r‚   r-   rC  b  s    rC  c                       rB  )Ú*CombinedTimestepGuidanceTextProjEmbeddingsc                    sJ   t ƒ  ¡  tdddd| _td|d| _td|d| _t||dd| _d S rD  )	ro   rp   rì   r6  rÙ   r7  Úguidance_embedderrE  rF  rG  r‚   r,   r-   rp   v  s
   
z3CombinedTimestepGuidanceTextProjEmbeddings.__init__c                 C   sZ   |   |¡}|  |j|jd¡}|   |¡}|  |j|jd¡}|| }|  |¡}	||	 }
|
S r:  )r6  r7  r‹   r   rL  rF  )r~   r;  ÚguidancerI  r>  r?  Úguidance_projÚguidance_embÚtime_guidance_embrJ  r@  r,   r,   r-   r   ~  s   


z2CombinedTimestepGuidanceTextProjEmbeddings.forwardrA  r,   r,   r‚   r-   rK  u  s    rK  c                	       s8   e Zd Zd
dedededef‡ fdd„Zdd	„ Z‡  ZS )ÚHunyuanDiTAttentionPoolNÚspacial_dimr0   Ú	num_headsÚ
output_dimc                    sp   t ƒ  ¡  t t |d |¡|d  ¡| _t ||¡| _t ||¡| _	t ||¡| _
t ||p0|¡| _|| _d S )Nr	   rm   )ro   rp   r   ró   r   rô   Úpositional_embeddingr—   Úk_projÚq_projÚv_projÚc_projrS  )r~   rR  r0   rS  rT  r‚   r,   r-   rp     s   
 
z HunyuanDiTAttentionPool.__init__c              	   C   s  |  ddd¡}tj|jddd|gdd}|| jd d …d d d …f  |j¡ }tjdi d|d d… “d|“d	|“d
|j	d “d| j
“d| jj“d| jj“d| jj“dd “dt | jj| jj| jjg¡“dd “dd “dd“dd“d| jj“d| jj“dd“d| j“dd“Ž\}}| d¡S )Nr	   r   r   T©r   Úkeepdimr   ÚqueryÚkeyÚvalueÚembed_dim_to_checkr   rS  Úq_proj_weightÚk_proj_weightÚv_proj_weightÚin_proj_weightÚin_proj_biasÚbias_kÚbias_vÚadd_zero_attnFÚ	dropout_pÚout_proj_weightÚout_proj_biasÚuse_separate_proj_weightr  Úneed_weightsr,   )r›   r   r$   ÚmeanrU  r‹   r   ÚFÚmulti_head_attention_forwardr   rS  rW  rõ   rV  rX  rj   rY  r  Úsqueeze)r~   rž   r   r,   r,   r-   r   ™  sV   $
ÿþý
üûúùø	÷
öõôóòñðïîí
zHunyuanDiTAttentionPool.forwardrè   ©rŽ   r   r   r=   rp   r   r’   r,   r,   r‚   r-   rQ    s     	rQ  c                       s0   e Zd Z				d
‡ fdd„	Zddd	„Z‡  ZS )Ú-HunyuanCombinedTimestepTextSizeStyleEmbeddingr  rð   é   Tc                    sŽ   t ƒ  ¡  tdddd| _td|d| _tdddd| _t||d|d| _|| _	|r8t
 d|¡| _d	| | }n|}t||d
 |dd| _d S )Nrð   Tr   r4  r5  é   )rS  rT  r	   é   r6   Ú	silu_fp32)r•   r  r–   rÜ   )ro   rp   rì   r6  rÙ   r7  Ú	size_projrQ  ÚpoolerÚ"use_style_cond_and_image_meta_sizer   r  Ústyle_embedderrE  Úextra_embedder)r~   r   rH  Úseq_lenr!  ry  Úextra_in_dimr‚   r,   r-   rp   ¶  s$   
ÿüz6HunyuanCombinedTimestepTextSizeStyleEmbedding.__init__Nc                 C   s–   |   |¡}|  |j|d¡}|  |¡}| jr:|  | d¡¡}|j|d}| dd¡}|  |¡}	tj	|||	gdd}
ntj	|gdd}
||  
|
¡ }|S )Nr8   r   ru  r	   r   )r6  r7  r‹   rx  ry  rw  rš   rz  r   r$   r{  )r~   r;  Úencoder_hidden_statesÚimage_meta_sizeÚstyler=  r>  r?  rJ  Ústyle_embeddingÚ
extra_condr@  r,   r,   r-   r   Ø  s   


z5HunyuanCombinedTimestepTextSizeStyleEmbedding.forward)r  rð   rs  Trè   rA  r,   r,   r‚   r-   rr  µ  s    ú"rr  c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )	Ú&LuminaCombinedTimestepCaptionEmbeddingrª   rs  rð   c                    sL   t ƒ  ¡  t|ddd| _t||d| _t t |¡tj	||dd¡| _
d S )NTg        r4  r5  rß   )ro   rp   rì   r6  rÙ   r7  r   Ú
Sequentialrv   r—   Úcaption_embedder)r~   r  r!  Úfrequency_embedding_sizer‚   r,   r-   rp   ó  s   
ÿý
þz/LuminaCombinedTimestepCaptionEmbedding.__init__c           
      C   sn   |   |¡}|  |j| jjjjd¡}| ¡  d¡}|| jdd|jdd }| |¡}|  	|¡}|| }	|	S ©Nr8   r   r	   r   )
r6  r7  r‹   rà   rõ   r   r#   r}   Úsumr…  )
r~   r;  Úcaption_featÚcaption_maskÚ	time_freqÚ
time_embedÚcaption_mask_floatÚcaption_feats_poolÚcaption_embedr@  r,   r,   r-   r     s   


z.LuminaCombinedTimestepCaptionEmbedding.forward)rª   rs  rð   rA  r,   r,   r‚   r-   rƒ  ò  s    rƒ  c                       s4   e Zd Zd	dededef‡ fdd„Zdd„ Z‡  ZS )
ÚTextTimeEmbeddingé@   Úencoder_dimrÛ   rS  c                    s@   t ƒ  ¡  t |¡| _t||ƒ| _t ||¡| _t |¡| _	d S rè   )
ro   rp   r   rv   Únorm1ÚAttentionPoolingÚpoolr—   ru   Únorm2)r~   r’  rÛ   rS  r‚   r,   r-   rp     s
   
zTextTimeEmbedding.__init__c                 C   s,   |   |¡}|  |¡}|  |¡}|  |¡}|S rè   )r“  r•  ru   r–  )r~   Úhidden_statesr,   r,   r-   r     s
   



zTextTimeEmbedding.forward)r‘  rq  r,   r,   r‚   r-   r    s    r  c                       sB   e Zd Zddededef‡ fdd„Zdejd	ejfd
d„Z‡  ZS )ÚTextImageTimeEmbeddingre   ru  r«   r   rÛ   c                    s6   t ƒ  ¡  t ||¡| _t |¡| _t ||¡| _d S rè   )ro   rp   r   r—   r¬   rv   Ú	text_normÚ
image_proj)r~   r«   r   rÛ   r‚   r,   r-   rp   %  s   
zTextImageTimeEmbedding.__init__r­   r®   c                 C   s&   |   |¡}|  |¡}|  |¡}|| S rè   )r¬   r™  rš  )r~   r­   r®   Útime_text_embedsÚtime_image_embedsr,   r,   r-   r   +  s   


zTextImageTimeEmbedding.forward)re   re   ru  r$  r,   r,   r‚   r-   r˜  $  s    r˜  c                       s8   e Zd Zd
dedef‡ fdd„Zdejfdd	„Z‡  ZS )ÚImageTimeEmbeddingre   ru  r   rÛ   c                    s(   t ƒ  ¡  t ||¡| _t |¡| _d S rè   )ro   rp   r   r—   rš  rv   Ú
image_norm©r~   r   rÛ   r‚   r,   r-   rp   7  s   
zImageTimeEmbedding.__init__r®   c                 C   s   |   |¡}|  |¡}|S rè   )rš  rž  )r~   r®   rœ  r,   r,   r-   r   <  s   

zImageTimeEmbedding.forward©re   ru  r$  r,   r,   r‚   r-   r  6  s    r  c                       s>   e Zd Zddedef‡ fdd„Zdejdejfd	d
„Z‡  ZS )ÚImageHintTimeEmbeddingre   ru  r   rÛ   c                    sâ   t ƒ  ¡  t ||¡| _t |¡| _t tjdddddt 	¡ tjdddddt 	¡ tjddddddt 	¡ tjdddddt 	¡ tjddddddt 	¡ tjdddddt 	¡ tjdd	ddddt 	¡ tjd	d
ddd¡| _
d S )Nr7   rQ   r	   )Úpaddingrû   r   )r¢  ri   é`   rð   r6   )ro   rp   r   r—   rš  rv   rž  r„  rt   ÚSiLUÚinput_hint_blockrŸ  r‚   r,   r-   rp   D  s(   

ñzImageHintTimeEmbedding.__init__r®   Úhintc                 C   s&   |   |¡}|  |¡}|  |¡}||fS rè   )rš  rž  r¥  )r~   r®   r¦  rœ  r,   r,   r-   r   Z  s   


zImageHintTimeEmbedding.forwardr   r$  r,   r,   r‚   r-   r¡  C  s    r¡  c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )r”  Nc                    s~   t ƒ  ¡  || _t t d|¡|d  ¡| _tj||| jd| _	tj||| jd| _
tj||| jd| _|| _|| j | _d S )Nr	   rm   r8   )ro   rp   r   r   ró   r   rô   rU  r—   rV  rW  rX  rS  Údim_per_head)r~   rS  r0   r   r‚   r,   r-   rp   e  s   
zAttentionPooling.__init__c                    sô   |  ¡ \‰ }}‡ ‡fdd„}|jdddˆj |j¡ }tj||gdd}|ˆ |¡ƒ}|ˆ |¡ƒ}|ˆ 	|¡ƒ}dt
 t
 ˆj¡¡ }	t d||	 ||	 ¡}
tj|
 ¡ dd |
j¡}
t d	|
|¡}| ˆ dd¡ dd
¡}|d d …dd d …f S )Nc                    sF   |   ˆ dˆjˆj¡} |  dd¡} |  ˆ ˆj dˆj¡} |  dd¡} | S )Nr   r	   r   )rš   rS  r§  rŠ   rA   )rž   ©Úbsr~   r,   r-   r   r  s
   z'AttentionPooling.forward.<locals>.shaper	   TrZ  r   zbct,bcs->btsr   zbts,bcs->bctr   r   )r™   rm  rU  r‹   r   r   r$   rW  rV  rX  r   Úsqrtr§  r]   Úsoftmaxr#   ÚtyperA   rŠ   )r~   rž   Úlengthrz   r   Úclass_tokenÚqÚkÚvr   rõ   Úar,   r¨  r-   r   o  s   zAttentionPooling.forwardrè   rA  r,   r,   r‚   r-   r”  b  s    
r”  c                 C   sŠ   |j dd… \}}dt | ¡|   }|d j|j|jd}|| d¡ }tj| ¡ | 	¡ fdd}| 
dd	d
dd¡ ||| d d ¡}|S )zÉ
    Args:
        embed_dim: int
        box: a 3-D tensor [B x N x 4] representing the bounding boxes for GLIGEN pipeline
    Returns:
        [B x N x embed_dim] tensor of positional embeddings
    Nr   éd   )NNN)r   r   r   r   r   r	   r7   r6   )r   r   r    r‹   r   r   r}   r@   r%   r&   r›   rA   )r0   Úboxr¢   Ú	num_boxesr+   r,   r,   r-   Ú#get_fourier_embeds_from_boundingbox”  s   	$r¶  c                       s2   e Zd Zd‡ fdd„	Z					d	dd„Z‡  ZS )
ÚGLIGENTextBoundingboxProjectionú	text-onlyrt  c              
      sf  t ƒ  ¡  || _|| _|| _|d d | _t|tƒr|d }|dkrMt 	t 
| j| j d¡t ¡ t 
dd¡t ¡ t 
d|¡¡| _tj t | jg¡¡| _nX|dkr¥t 	t 
| j| j d¡t ¡ t 
dd¡t ¡ t 
d|¡¡| _t 	t 
| j| j d¡t ¡ t 
dd¡t ¡ t 
d|¡¡| _tj t | jg¡¡| _tj t | jg¡¡| _tj t | jg¡¡| _d S )Nr   r6   r   r¸  é   z
text-image)ro   rp   Úpositive_lenrÝ   Úfourier_embedder_dimÚposition_dimr<   Útupler   r„  r—   r¤  Úlinearsr   ró   rR   Únull_positive_featureÚlinears_textÚlinears_imageÚnull_text_featureÚnull_image_featureÚnull_position_feature)r~   rº  rÝ   Úfeature_typeÚfourier_freqsr‚   r,   r-   rp   ª  sD   



û

û

ûz(GLIGENTextBoundingboxProjection.__init__Nc                 C   s$  |  d¡}t| j|ƒ}| j ddd¡}	|| d| |	  }|d urA| j ddd¡}
|| d| |
  }|  tj||gdd¡}|S |  d¡}|  d¡}| j	 ddd¡}| j
 ddd¡}|| d| |  }|| d| |  }|  tj||gdd¡}|  tj||gdd¡}tj||gdd}|S )Nr   r	   r   )r}   r¶  r»  rÄ  rš   r¿  r¾  r   r$   rÂ  rÃ  rÀ  rÁ  )r~   ÚboxesÚmasksÚpositive_embeddingsÚphrases_masksÚimage_masksÚphrases_embeddingsÚimage_embeddingsÚxyxy_embeddingÚ	xyxy_nullÚpositive_nullÚobjsÚ	text_nullÚ
image_nullÚ	objs_textÚ
objs_imager,   r,   r-   r   Ó  s&   


ñ
z'GLIGENTextBoundingboxProjection.forward)r¸  rt  )NNNNNrA  r,   r,   r‚   r-   r·  ©  s    -ør·  c                       s0   e Zd ZdZddef‡ fdd„Zdd„ Z‡  ZS )	Ú)PixArtAlphaCombinedTimestepSizeEmbeddingszº
    For PixArt-Alpha.

    Reference:
    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
    FÚuse_additional_conditionsc                    sl   t ƒ  ¡  || _tdddd| _td|d| _|| _|r4tdddd| _td|d| _	td|d| _
d S d S )Nrð   Tr   r4  r5  )ro   rp   Úoutdimrì   r6  rÙ   r7  r×  Úadditional_condition_projÚresolution_embedderÚaspect_ratio_embedder)r~   r   Úsize_emb_dimr×  r‚   r,   r-   rp     s   
ýz2PixArtAlphaCombinedTimestepSizeEmbeddings.__init__c                 C   s   |   |¡}|  |j|d¡}| jrD|  | ¡ ¡ |¡}|  |¡ |d¡}|  | ¡ ¡ |¡}	|  |	¡ |d¡}	|t	j
||	gdd }
|
S |}
|
S r‡  )r6  r7  r‹   r×  rÙ  rq   rÚ  rA   rÛ  r   r$   )r~   r;  Ú
resolutionÚaspect_ratior¢   r=  r>  r?  Úresolution_embÚaspect_ratio_embr@  r,   r,   r-   r     s   
þz1PixArtAlphaCombinedTimestepSizeEmbeddings.forward©F)rŽ   r   r   r‘   r´   rp   r   r’   r,   r,   r‚   r-   rÖ    s    rÖ  c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )	rE  zÊ
    Projects caption embeddings. Also handles dropout for classifier-free guidance.

    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    NÚ	gelu_tanhc                    sŠ   t ƒ  ¡  |d u r|}tj||dd| _|dkr tjdd| _n|dkr*t ¡ | _n|dkr3tƒ | _nt	d|› ƒ‚tj||dd| _
d S )	NTr”   râ  Útanh)ÚapproximaterÚ   rv  zUnknown activation function: )ro   rp   r   r—   rà   ÚGELUÚact_1r¤  r
   r;   rã   )r~   r•   r  r–   rÜ   r‚   r,   r-   rp   2  s   

z"PixArtAlphaTextProjection.__init__c                 C   s"   |   |¡}|  |¡}|  |¡}|S rè   )rà   ræ  rã   )r~   Úcaptionr—  r,   r,   r-   r   A  s   


z!PixArtAlphaTextProjection.forward)Nrâ  r§   r,   r,   r‚   r-   rE  +  s    rE  c                       sD   e Zd Z				ddedededed	d
f
‡ fdd„Zdd„ Z‡  ZS )Ú!IPAdapterPlusImageProjectionBlockre   r‘  rQ   r6   Ú
embed_dimsÚdim_headÚheadsÚ	ffn_ratior5   Nc              
      sf   t ƒ  ¡  ddlm} t |¡| _t |¡| _t|||dd| _	t 
t |¡|||d|dd¡| _d S )Nr	   r(  F)Ú	query_dimrê  rë  Úout_biasr*  )r,  r+  rj   )ro   rp   r-  r)  r   rv   Úln0Úln1r   Úattnr„  r.  )r~   ré  rê  rë  rì  r)  r‚   r,   r-   rp   I  s   
ü
þz*IPAdapterPlusImageProjectionBlock.__init__c                 C   sH   |   |¡}|  |¡}tj||gdd}|  ||¡| }|  |¡| }|S )Nr‰   r   )rï  rð  r   r$   rñ  r.  )r~   rž   ÚlatentsÚresidualr~  r,   r,   r-   r   `  s   

z)IPAdapterPlusImageProjectionBlock.forward)re   r‘  rQ   r6   )rŽ   r   r   r=   r#   rp   r   r’   r,   r,   r‚   r-   rè  H  s"    ûþýüûúrè  c                       sn   e Zd ZdZ								dd	ed
edededededededdf‡ fdd„Zdejdejfdd„Z	‡  Z
S )ÚIPAdapterPlusImageProjectionaä  Resampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_queries (int):
            The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio
        of feedforward network hidden
            layer channels. Defaults to 4.
    re   r  é   r6   r‘  rQ   rt  ré  Úoutput_dimsÚhidden_dimsÚdepthrê  rë  Únum_queriesrì  r5   Nc	           	         sx   t ƒ  ¡  t t d|ˆ¡ˆd  ¡| _t |ˆ¡| _t ˆ|¡| _	t 
|¡| _t ‡ ‡‡‡fdd„t|ƒD ƒ¡| _d S )Nr	   rm   c                    s   g | ]	}t ˆˆ ˆˆƒ‘qS r,   ©rè  ©Ú.0r   ©rê  rì  rë  r÷  r,   r-   Ú
<listcomp>  ó    z9IPAdapterPlusImageProjection.__init__.<locals>.<listcomp>)ro   rp   r   ró   r   rô   rò  r—   Úproj_inÚproj_outrv   Únorm_outÚ
ModuleListÚrangeÚlayers)	r~   ré  rö  r÷  rø  rê  rë  rù  rì  r‚   rý  r-   rp   y  s   

ÿz%IPAdapterPlusImageProjection.__init__rž   c                 C   sP   | j  | d¡dd¡}|  |¡}| jD ]
}|}||||ƒ}q|  |¡}|  |¡S )z‹Forward pass.

        Args:
            x (torch.Tensor): Input Tensor.
        Returns:
            torch.Tensor: Output Tensor.
        r   r	   )rò  rE   r™   r   r  r  r  )r~   rž   rò  Úblockró  r,   r,   r-   r     s   



z$IPAdapterPlusImageProjection.forward)re   r  rõ  r6   r‘  rQ   rt  r6   ©rŽ   r   r   r‘   r=   r#   rp   r   rµ   r   r’   r,   r,   r‚   r-   rô  i  s<    ÷þýüûúùø	÷
örô  c                       s€   e Zd ZdZ												dd
edededededededededededdf‡ fdd„Zdejdejfdd„Z	‡  Z
S )Ú"IPAdapterFaceIDPlusImageProjectiona—  FacePerceiverResampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_tokens (int): Number of tokens num_queries (int): The number of queries. Defaults to 8.
        ffn_ratio (float): The expansion ratio of feedforward network hidden
            layer channels. Defaults to 4.
        ffproj_ratio (float): The expansion ratio of feedforward network hidden
            layer channels (for ID embeddings). Defaults to 4.
    re   rõ  r¹  r6   r‘  rQ   rt  r   ré  rö  r÷  Úid_embeddings_dimrø  rê  rë  r1  rù  rì  Úffproj_ratior5   Nc                    s¦   t ƒ  ¡  ddlm} || _ˆ| _d | _d| _d| _||ˆ| d|d| _	t
 ˆ¡| _t
 |ˆ¡| _t
 ˆ|¡| _t
 |¡| _t
 ‡ ‡‡‡fdd„t|ƒD ƒ¡| _d S )	Nr	   r(  Fr/   r*  )r,  r+  c                    s   g | ]	}t ˆˆ ˆˆƒ‘qS r,   rú  rû  ©rê  ré  rì  rë  r,   r-   rþ  Õ  rÿ  z?IPAdapterFaceIDPlusImageProjection.__init__.<locals>.<listcomp>)ro   rp   r-  r)  r1  r0   Úclip_embedsÚshortcutÚshortcut_scaleru   r   rv   rw   r—   r   r  r  r  r  r  )r~   ré  rö  r÷  r	  rø  rê  rë  r1  rù  rì  r
  r)  r‚   r  r-   rp   µ  s   

ÿz+IPAdapterFaceIDPlusImageProjection.__init__Ú	id_embedsc                 C   s¦   |  | jj¡}|  |¡}| d| j| j¡}|  |¡}|}|  | j¡}| d|j	d |j	d ¡}| j
D ]
}|}||||ƒ}q2|  |¡}|  |¡}| jrQ|| j|  }|S )zŸForward pass.

        Args:
            id_embeds (torch.Tensor): Input Tensor (ID embeds).
        Returns:
            torch.Tensor: Output Tensor.
        r   r   r7   )r‹   r  r   ru   rA   r1  r0   rw   r   r   r  r  r  r  r  )r~   r  rò  r  rž   r  ró  r`   r,   r,   r-   r   Ø  s   




z*IPAdapterFaceIDPlusImageProjection.forward)re   re   rõ  r¹  r6   r‘  rQ   r6   rt  r6   r   r  r,   r,   r‚   r-   r  ¤  sN    ôþýüûúùø	÷
öõôó#r  c                       sJ   e Zd Zdeeej eej f f‡ fdd„Zdee	j
 fdd„Z‡  ZS )ÚMultiIPAdapterImageProjectionÚIPAdapterImageProjectionLayersc                    s   t ƒ  ¡  t |¡| _d S rè   )ro   rp   r   r  Úimage_projection_layers)r~   r  r‚   r,   r-   rp   õ  s   
z&MultiIPAdapterImageProjection.__init__r®   c                 C   sÚ   g }t |tƒsd}tdd|dd | d¡g}t|ƒt| jƒkr/tdt|ƒ› dt| jƒ› ƒ‚t|| jƒD ]5\}}|jd	 |jd }}| 	|| f|jd
d …  ¡}||ƒ}| 	||f|jdd …  ¡}| 
|¡ q5|S )NzÓYou have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning.zimage_embeds not a listz1.0.0F)Ústandard_warnr	   zGimage_embeds must have the same length as image_projection_layers, got z and r   r   )r<   Úlistr   r}   r   r  r;   Úzipr   rA   Úappend)r~   r®   Úprojected_image_embedsÚdeprecation_messageÚimage_embedÚimage_projection_layerr¢   Ú
num_imagesr,   r,   r-   r   ù  s"   
ÿÿz%MultiIPAdapterImageProjection.forward)rŽ   r   r   r   r   r   ÚModuler   rp   r   rµ   r   r’   r,   r,   r‚   r-   r  ô  s    &r  )Fr	   r	   r   )r/   r/   )Fr   r/   rQ   )Trá  )rÄ   Fr/   r/   T)Tr   )Dr   Útypingr   r   r   r   Únumpyr>   r   Útorch.nn.functionalr   r'   rn  Úutilsr   Úactivationsr
   r   Úattention_processorr   rµ   r=   r´   r#   r.   ÚndarrayrP   rX   rB   rC   r  rc   r“   r¨   r½   rº   rÃ   r¾   rØ   rÙ   rì   rï   rú   r  r  r  r%  r&  r0  r2  rC  rK  rQ  rr  rƒ  r  r˜  r  r¡  r”  r¶  r·  rÖ  rE  rè  rô  r  r  r,   r,   r,   r-   Ú<module>   sÒ   úÿþýüû
ú:ûÿþýüû
ú0
ÿl.
(

ùÿþ
ý9üÿþýü
û00A$(="2]%!;P