o
    GÆÏið– ã                   @   sn  d dl Z d dlZd dlZd dlm  mZ d dlmZ ddlm	Z	 ddl
mZmZ ddlmZ 					dd
ejdedededededejfdd„Z				dždedeeeef B dedededejdB dedejfdd„Z		dŸdedeeeef B dedededejfdd„Z		 		 		d dejdB defd!d"„Zd¡d#d$„Zd¢d%d&„Z	 d£d'd(„Zd)d*„ Zd+d,„ ZG d-d.„ d.ej ƒZ!G d/d0„ d0ej ƒZ"G d1d2„ d2ej ƒZ#G d3d4„ d4ej ƒZ$			5	6		d¤d7ed8ed9ed:eeef dB dejdB dejeejejf B fd;d<„Z%	=			d¥d>eeeef d7edejdB dejeejejf B fd?d@„Z&	d¦dejdB defdAdB„Z'd§dCdD„Z(d¨dEdF„Z)dŸdGdH„Z*dIdddd5ej+fdJedKejeB d7efdLdM„Z,	5	N	d©dOejdPejeej B d8edQedRedeejejf fdSdT„Z-dOejfdUdV„Z.G dWdX„ dXej ƒZ/G dYdZ„ dZej ƒZ0G d[d\„ d\ej ƒZ1G d]d^„ d^ej ƒZ2G d_d`„ d`ej ƒZ3G dadb„ dbej ƒZ4G dcdd„ ddej ƒZ5G dedf„ dfej ƒZ6G dgdh„ dhej ƒZ7G didj„ djej ƒZ8G dkdl„ dlej ƒZ9G dmdn„ dnej ƒZ:G dodp„ dpej ƒZ;G dqdr„ drej ƒZ<G dsdt„ dtej ƒZ=G dudv„ dvej ƒZ>G dwdx„ dxej ƒZ?G dydz„ dzej ƒZ@G d{d|„ d|ej ƒZAG d}d~„ d~ej ƒZBG dd€„ d€ej ƒZCG dd‚„ d‚ej ƒZDG dƒd„„ d„ej ƒZEG d…d†„ d†ej ƒZFd‡dˆ„ ZGG d‰dŠ„ dŠej ƒZHG d‹dŒ„ dŒej ƒZIG ddŽ„ dŽej ƒZJG dd„ dej ƒZKG d‘d’„ d’ej ƒZLG d“d”„ d”ej ƒZMG d•d–„ d–ej ƒZNG d—d˜„ d˜ej ƒZOG d™dš„ dšej ƒZPG d›dœ„ dœej ƒZQdS )ªé    N)Únné   )Ú	deprecateé   )ÚFP32SiLUÚget_activation)Ú	AttentionFé'  Ú	timestepsÚembedding_dimÚflip_sin_to_cosÚdownscale_freq_shiftÚscaleÚ
max_periodÚreturnc           	      C   sö   t | jƒdksJ dƒ‚|d }t |¡ tjd|tj| jd }|||  }t |¡}| dd…df  	¡ |ddd…f  }|| }tj
t |¡t |¡gdd}|rktj
|dd…|d…f |dd…d|…f gdd}|d dkrytjj |d	¡}|S )
a&  
    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.

    Args
        timesteps (torch.Tensor):
            a 1-D Tensor of N indices, one per batch element. These may be fractional.
        embedding_dim (int):
            the dimension of the output.
        flip_sin_to_cos (bool):
            Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
        downscale_freq_shift (float):
            Controls the delta between frequencies between dimensions
        scale (float):
            Scaling factor applied to the embeddings.
        max_period (int):
            Controls the maximum frequency of the embeddings
    Returns
        torch.Tensor: an [N x dim] Tensor of positional embeddings.
    r   zTimesteps should be a 1d-arrayr   r   )ÚstartÚendÚdtypeÚdeviceNéÿÿÿÿ©Údim)r   r   r   r   )ÚlenÚshapeÚmathÚlogÚtorchÚarangeÚfloat32r   ÚexpÚfloatÚcatÚsinÚcosr   Ú
functionalÚpad)	r
   r   r   r   r   r   Úhalf_dimÚexponentÚemb© r)   úO/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/embeddings.pyÚget_timestep_embedding   s   ÿ
$2r+   ç      ð?ÚnpÚ	embed_dimÚspatial_sizeÚtemporal_sizeÚspatial_interpolation_scaleÚtemporal_interpolation_scaler   Úoutput_typec                 C   sh  |dkrt | ||||dS | d dkrtdƒ‚t|tƒr ||f}d|  d }| d }tj|d |tjd| }	tj|d |tjd| }
tj|
|	d	d
}tj|dd}| 	dd|d |d g¡}t
||dd}tj||tjd| }t||dd}|ddd…dd…f }|j|d|jd | d}|dd…ddd…f }|j|d |d  dd}tj||gdd}|S )a˜  
    Creates 3D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension of inputs. It must be divisible by 16.
        spatial_size (`int` or `tuple[int, int]`):
            The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
            spatial dimensions (height and width).
        temporal_size (`int`):
            The temporal dimension of positional embeddings (number of frames).
        spatial_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for spatial grid interpolation.
        temporal_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for temporal grid interpolation.

    Returns:
        `torch.Tensor`:
            The 3D sinusoidal positional embeddings of shape `[temporal_size, spatial_size[0] * spatial_size[1],
            embed_dim]`.
    r-   )r.   r/   r0   r1   r2   é   r   ú"`embed_dim` must be divisible by 4é   r   ©r   r   Úxy©Úindexingr   r   Úpt©r3   N©r   Úoutput_sizer   )Ú_get_3d_sincos_pos_embed_npÚ
ValueErrorÚ
isinstanceÚintr   r   r   ÚmeshgridÚstackÚreshapeÚ!get_2d_sincos_pos_embed_from_gridÚ!get_1d_sincos_pos_embed_from_gridÚrepeat_interleaver   Úconcat)r.   r/   r0   r1   r2   r   r3   Úembed_dim_spatialÚembed_dim_temporalÚgrid_hÚgrid_wÚgridÚpos_embed_spatialÚgrid_tÚpos_embed_temporalÚ	pos_embedr)   r)   r*   Úget_3d_sincos_pos_embedP   s@   û
ÿÿrS   c                 C   sL  d}t dd|dd | d dkrtdƒ‚t|tƒr||f}d	|  d }| d }tj|d
 tjd| }tj|d tjd| }	t |	|¡}
tj|
dd}
|
 	dd
|d
 |d g¡}
t
||
ƒ}tj|tjd| }t||ƒ}|tjdd…dd…f }tj||dd}|dd…tjdd…f }tj||d |d
  d
d}tj||gdd}|S )a–  
    Creates 3D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension of inputs. It must be divisible by 16.
        spatial_size (`int` or `tuple[int, int]`):
            The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
            spatial dimensions (height and width).
        temporal_size (`int`):
            The temporal dimension of positional embeddings (number of frames).
        spatial_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for spatial grid interpolation.
        temporal_interpolation_scale (`float`, defaults to 1.0):
            Scale factor for temporal grid interpolation.

    Returns:
        `np.ndarray`:
            The 3D sinusoidal positional embeddings of shape `[temporal_size, spatial_size[0] * spatial_size[1],
            embed_dim]`.
    z•`get_3d_sincos_pos_embed` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.úoutput_type=='np'ú0.33.0F©Ústandard_warnr4   r   r5   r6   r   ©r   ©Úaxisr   Nr   )r   r@   rA   rB   r-   r   r   rC   rD   rE   rF   rG   ÚnewaxisÚrepeatÚconcatenate)r.   r/   r0   r1   r2   Údeprecation_messagerJ   rK   rL   rM   rN   rO   rP   rQ   rR   r)   r)   r*   r?   š   s.   ÿ


r?   é   c                 C   s   |dkrd}t dd|dd t| |||||dS t|tƒr!||f}tj|d |tjd	|d |  | }	tj|d
 |tjd	|d
 |  | }
tj|
|	dd}tj|dd}| 	dd
|d
 |d g¡}t
| ||d}|r~|dkr~tjt || g¡|gdd}|S )aÒ  
    Creates 2D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension.
        grid_size (`int`):
            The size of the grid height and width.
        cls_token (`bool`, defaults to `False`):
            Whether or not to add a classification token.
        extra_tokens (`int`, defaults to `0`):
            The number of extra tokens to add.
        interpolation_scale (`float`, defaults to `1.0`):
            The scale of the interpolation.

    Returns:
        pos_embed (`torch.Tensor`):
            Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size,
            embed_dim]` if using cls_token
    r-   ú•`get_2d_sincos_pos_embed` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.rT   rU   FrV   )r.   Ú	grid_sizeÚ	cls_tokenÚextra_tokensÚinterpolation_scaleÚ	base_sizer   r7   r   r8   r9   r   r   r<   )r   Úget_2d_sincos_pos_embed_nprA   rB   r   r   r   rC   rD   rE   rF   rI   Úzeros)r.   ra   rb   rc   rd   re   r   r3   r^   rL   rM   rN   rR   r)   r)   r*   Úget_2d_sincos_pos_embedÜ   sB   ÿú

ÿþÿ
ÿþÿrh   c                 C   s~   |dkrd}t dd|dd t| |dS | d d	krtd
ƒ‚t| d |d	 |d}t| d |d |d}tj||gdd}|S )aG  
    This function generates 2D sinusoidal positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension.
        grid (`torch.Tensor`): Grid of positions with shape `(H * W,)`.

    Returns:
        `torch.Tensor`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)`
    r-   zŸ`get_2d_sincos_pos_embed_from_grid` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.rT   rU   FrV   )r.   rN   r   r   ú embed_dim must be divisible by 2r<   r   r   )r   Ú$get_2d_sincos_pos_embed_from_grid_npr@   rG   r   rI   )r.   rN   r3   r^   Úemb_hÚemb_wr(   r)   r)   r*   rF      s   ÿþrF   c                 C   s  |dkrd}t dd|dd t| |dS | d d	krtd
ƒ‚|du r.|jjdkr+tjntj}tj| d |j|d}|| d  }dd|  }| 	d¡}t 
||¡}t |¡}t |¡}	tj||	gdd}
|r‚tj|
dd…| d d…f |
dd…d| d …f gdd}
|
S )aâ  
    This function generates 1D positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension `D`
        pos (`torch.Tensor`): 1D tensor of positions with shape `(M,)`
        output_type (`str`, *optional*, defaults to `"np"`): Output type. Use `"pt"` for PyTorch tensors.
        flip_sin_to_cos (`bool`, *optional*, defaults to `False`): Whether to flip sine and cosine embeddings.
        dtype (`torch.dtype`, *optional*): Data type for frequency calculations. If `None`, defaults to
            `torch.float32` on MPS devices (which don't support `torch.float64`) and `torch.float64` on other devices.

    Returns:
        `torch.Tensor`: Sinusoidal positional embeddings of shape `(M, D)`.
    r-   zŸ`get_1d_sincos_pos_embed_from_grid` uses `torch` and supports `device`. `from_numpy` is no longer required.  Pass `output_type='pt' to use the new version now.rT   z0.34.0FrV   )r.   Úposr   r   ri   NÚmpsr7   ç       @r,   r	   r   r   r   )r   Ú$get_1d_sincos_pos_embed_from_grid_npr@   r   Útyper   r   Úfloat64r   rE   Úouterr"   r#   rI   r!   )r.   rm   r3   r   r   r^   ÚomegaÚoutÚemb_sinÚemb_cosr(   r)   r)   r*   rG   A  s(   ÿ


:rG   c           
      C   sÄ   t |tƒr	||f}tj|d tjd|d |  | }tj|d tjd|d |  | }t ||¡}tj|dd}| dd|d |d g¡}t| |ƒ}	|r`|dkr`tj	t 
|| g¡|	gdd}	|	S )aÐ  
    Creates 2D sinusoidal positional embeddings.

    Args:
        embed_dim (`int`):
            The embedding dimension.
        grid_size (`int`):
            The size of the grid height and width.
        cls_token (`bool`, defaults to `False`):
            Whether or not to add a classification token.
        extra_tokens (`int`, defaults to `0`):
            The number of extra tokens to add.
        interpolation_scale (`float`, defaults to `1.0`):
            The scale of the interpolation.

    Returns:
        pos_embed (`np.ndarray`):
            Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size,
            embed_dim]` if using cls_token
    r   rX   r   rY   r   )rA   rB   r-   r   r   rC   rD   rE   rj   r]   rg   )
r.   ra   rb   rc   rd   re   rL   rM   rN   rR   r)   r)   r*   rf   r  s   
$$
rf   c                 C   sN   | d dkr
t dƒ‚t| d |d ƒ}t| d |d ƒ}tj||gdd}|S )aC  
    This function generates 2D sinusoidal positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension.
        grid (`np.ndarray`): Grid of positions with shape `(H * W,)`.

    Returns:
        `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)`
    r   r   ri   r   rY   )r@   rp   r-   r]   )r.   rN   rk   rl   r(   r)   r)   r*   rj   ˜  s   rj   c                 C   s‚   | d dkr
t dƒ‚tj| d tjd}|| d  }dd|  }| d¡}t d	||¡}t |¡}t |¡}tj||gd
d}|S )a,  
    This function generates 1D positional embeddings from a grid.

    Args:
        embed_dim (`int`): The embedding dimension `D`
        pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)`

    Returns:
        `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`.
    r   r   ri   rX   ro   r,   r	   r   zm,d->mdr   rY   )	r@   r-   r   rr   rE   Úeinsumr"   r#   r]   )r.   rm   rt   ru   rv   rw   r(   r)   r)   r*   rp   ®  s   


rp   c                       sH   e Zd ZdZ												
d‡ fdd„	Zdd„ Zdd„ Z‡  ZS )Ú
PatchEmbedaÍ  
    2D Image to Patch Embedding with support for SD3 cropping.

    Args:
        height (`int`, defaults to `224`): The height of the image.
        width (`int`, defaults to `224`): The width of the image.
        patch_size (`int`, defaults to `16`): The size of the patches.
        in_channels (`int`, defaults to `3`): The number of input channels.
        embed_dim (`int`, defaults to `768`): The output dimension of the embedding.
        layer_norm (`bool`, defaults to `False`): Whether or not to use layer normalization.
        flatten (`bool`, defaults to `True`): Whether or not to flatten the output.
        bias (`bool`, defaults to `True`): Whether or not to use bias.
        interpolation_scale (`float`, defaults to `1`): The scale of the interpolation.
        pos_embed_type (`str`, defaults to `"sincos"`): The type of positional embedding.
        pos_embed_max_size (`int`, defaults to `None`): The maximum size of the positional embedding.
    éà   r_   r6   é   FTr   ÚsincosNc                    s  t ƒ  ¡  || ||  }|| _|| _|| _tj||||f||d| _|r/tj|ddd| _	nd | _	|| _
|| || | _| _|| | _|	| _|rM|}nt|d ƒ}|
d u r\d | _d S |
dkr€t||| j| jdd}|rod	nd}| jd
| ¡  d¡|d d S td|
› ƒ‚)N©Úkernel_sizeÚstrideÚbiasFgíµ ÷Æ°>)Úelementwise_affineÚepsç      à?r|   r;   )re   rd   r3   TrR   r   ©Ú
persistentzUnsupported pos_embed_type: )ÚsuperÚ__init__ÚflattenÚ
layer_normÚpos_embed_max_sizer   ÚConv2dÚprojÚ	LayerNormÚnormÚ
patch_sizeÚheightÚwidthre   rd   rB   rR   rh   Úregister_bufferr    Ú	unsqueezer@   )Úselfr   r‘   r   Úin_channelsr.   r‰   rˆ   r€   rd   Úpos_embed_typerŠ   Únum_patchesra   rR   r…   ©Ú	__class__r)   r*   r‡   Ü  s>   
ÿ

ûzPatchEmbed.__init__c                 C   sÜ   | j du r	tdƒ‚|| j }|| j }|| j kr$td|› d| j › dƒ‚|| j kr5td|› d| j › dƒ‚| j | d }| j | d }| j d| j | j d	¡}|dd…||| …||| …dd…f }| dd	|jd	 ¡}|S )
z2Crops positional embeddings for SD3 compatibility.Nz.`pos_embed_max_size` must be set for cropping.zHeight (z/) cannot be greater than `pos_embed_max_size`: Ú.zWidth (r   r   r   )rŠ   r@   r   rR   rE   r   )r”   r   r‘   ÚtopÚleftÚspatial_pos_embedr)   r)   r*   Úcropped_pos_embed  s$   



ÿ
ÿ(zPatchEmbed.cropped_pos_embedc                 C   sü   | j d ur|jdd … \}}n|jd | j |jd | j }}|  |¡}| jr1| d¡ dd¡}| jr9|  |¡}| jd u rD| 	|j
¡S | j rN|  ||¡}n(| j|ksX| j|krst| jjd ||f| j| j|jdd}| ¡  d¡}n| j}||  	|j
¡S )Néþÿÿÿr   r   r   r;   )r.   ra   re   rd   r   r3   r   )rŠ   r   r   rŒ   rˆ   Ú	transposer‰   rŽ   rR   Útor   rž   r   r‘   rh   re   rd   r   r    r“   )r”   Úlatentr   r‘   rR   r)   r)   r*   Úforward*  s0   
"



úzPatchEmbed.forward)rz   rz   r_   r6   r{   FTTr   r|   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r‡   rž   r£   Ú__classcell__r)   r)   r˜   r*   ry   Ê  s     ô7ry   c                       s*   e Zd ZdZd
‡ fdd„	Zdd	„ Z‡  ZS )ÚLuminaPatchEmbedaz  
    2D Image to Patch Embedding with support for Lumina-T2X

    Args:
        patch_size (`int`, defaults to `2`): The size of the patches.
        in_channels (`int`, defaults to `4`): The number of input channels.
        embed_dim (`int`, defaults to `768`): The output dimension of the embedding.
        bias (`bool`, defaults to `True`): Whether or not to use bias.
    r   r4   r{   Tc                    s.   t ƒ  ¡  || _tj|| | ||d| _d S )N©Úin_featuresÚout_featuresr€   )r†   r‡   r   r   ÚLinearrŒ   )r”   r   r•   r.   r€   r˜   r)   r*   r‡   U  s   

ýzLuminaPatchEmbed.__init__c                 C   sÖ   |  |d j¡}| j }}| ¡ \}}}}|| || }	}
| |||	||
|¡ dddddd¡}| d¡}|  |¡}| dd¡}tj	|j
d |j
d tj|jd}||||fg| |d|	…d|
…f  dd¡ d¡fS )	aÉ  
        Patchifies and embeds the input tensor(s).

        Args:
            x (list[torch.Tensor] | torch.Tensor): The input tensor(s) to be patchified and embedded.

        Returns:
            tuple[torch.Tensor, torch.Tensor, list[tuple[int, int]], torch.Tensor]: A tuple containing the patchified
            and embedded tensor(s), the mask indicating the valid patches, the original image size(s), and the
            frequency tensor(s).
        r   r   r4   r   r6   é   ©r   r   N)r¡   r   r   ÚsizeÚviewÚpermuterˆ   rŒ   r   Úonesr   Úint32r“   )r”   ÚxÚ	freqs_cisÚpatch_heightÚpatch_widthÚ
batch_sizeÚchannelr   r‘   Úheight_tokensÚwidth_tokensÚmaskr)   r)   r*   r£   ^  s    
ÿ

" üzLuminaPatchEmbed.forward)r   r4   r{   T©r¤   r¥   r¦   r§   r‡   r£   r¨   r)   r)   r˜   r*   r©   J  s    
	r©   c                !       sÂ   e Zd Z											
					d'dededB dedededededededededededededdf ‡ fdd„Z	d(dededed ejdB dej	f
d!d"„Z
d#ej	d$ej	fd%d&„Z‡  ZS ))ÚCogVideoXPatchEmbedr   Nr_   é€  é   TéZ   é<   é1   r4   éâ   ç      þ?r,   r   Úpatch_size_tr•   r.   Útext_embed_dimr€   Úsample_widthÚsample_heightÚsample_framesÚtemporal_compression_ratioÚmax_text_seq_lengthr1   r2   Úuse_positional_embeddingsÚ!use_learned_positional_embeddingsr   c                    sÐ   t ƒ  ¡  || _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _|| _|d u r;tj||||f||d| _nt || | | |¡| _t ||¡| _|sS|rf|}|  |||	¡}| jd||d d S d S )Nr}   Úpos_embeddingr„   )r†   r‡   r   rÇ   r.   rÊ   rÉ   rË   rÌ   rÍ   r1   r2   rÎ   rÏ   r   r‹   rŒ   r­   Ú	text_projÚ_get_positional_embeddingsr’   )r”   r   rÇ   r•   r.   rÈ   r€   rÉ   rÊ   rË   rÌ   rÍ   r1   r2   rÎ   rÏ   r…   rÐ   r˜   r)   r*   r‡     s0   

ÿýzCogVideoXPatchEmbed.__init__r   c              	   C   sš   || j  }|| j  }|d | j d }|| | }t| j||f|| j| j|dd}	|	 dd¡}	|	jd| j| | jdd}
|
j	d d …| jd …f  
|	¡ |
S )Nr   r;   )r   r3   r   F©Úrequires_grad)r   rÌ   rS   r.   r1   r2   rˆ   Ú	new_zerosrÍ   ÚdataÚcopy_)r”   rÊ   rÉ   rË   r   Úpost_patch_heightÚpost_patch_widthÚpost_time_compression_framesr—   rÐ   Újoint_pos_embeddingr)   r)   r*   rÒ   ²  s&   

ù	ÿz.CogVideoXPatchEmbed._get_positional_embeddingsÚtext_embedsÚimage_embedsc              
   C   sš  |   |¡}|j\}}}}}| jdu r>| d|||¡}|  |¡}|j||g|jdd… ¢R Ž }| d¡ dd¡}| dd¡}n:| j}| j}	| 	ddddd¡}| |||	 |	|| ||| ||¡}| 	ddddd	ddd
¡ dd	¡ dd¡}|  |¡}t
j||gdd ¡ }
| js‰| jrË| jrš| j|ks–| j|krštdƒ‚|d | j d }| j|ks²| j|ks²| j|kr½| j||||
jd}n| j}|j|
jd}|
| }
|
S )a7  
        Args:
            text_embeds (`torch.Tensor`):
                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
            image_embeds (`torch.Tensor`):
                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
        Nr   r   r6   r   r   r4   r®   é   é   r   a   It is currently not possible to generate videos at a different resolution that the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'.If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues.©r   rX   )rÑ   r   rÇ   rE   rŒ   r±   rˆ   r    r   r²   r   r!   Ú
contiguousrÎ   rÏ   rÉ   rÊ   r@   rÌ   rË   rÒ   r   rÐ   r¡   r   )r”   rÜ   rÝ   r¹   Ú
num_framesÚchannelsr   r‘   ÚpÚp_tÚembedsÚpre_time_compression_framesrÐ   r)   r)   r*   r£   Ë  sJ   


ÿ(
ÿþÿ



ÿzCogVideoXPatchEmbed.forward)r   Nr_   rÀ   rÁ   TrÂ   rÃ   rÄ   r4   rÅ   rÆ   r,   TT©N)r¤   r¥   r¦   rB   Úboolr    r‡   r   r   ÚTensorrÒ   r£   r¨   r)   r)   r˜   r*   r¿   €  s|    ðþýüûúùø	÷
öõôóòñðï2ÿÿÿÿÿ
þr¿   c                       sZ   e Zd Z					ddededed	ed
ef
‡ fdd„Zdejdejdejfdd„Z‡  ZS )ÚCogView3PlusPatchEmbedr_   é 
  r   rÁ   é€   r•   Úhidden_sizer   Útext_hidden_sizerŠ   c                    s‚   t ƒ  ¡  || _|| _|| _|| _|| _t ||d  |¡| _	t ||¡| _
t|||dd}| |||¡}| jd| ¡ dd d S )Nr   r;   )re   r3   rR   Fr„   )r†   r‡   r•   rî   r   rï   rŠ   r   r­   rŒ   rÑ   rh   rE   r’   r    )r”   r•   rî   r   rï   rŠ   rR   r˜   r)   r*   r‡     s   
ÿzCogView3PlusPatchEmbed.__init__Úhidden_statesÚencoder_hidden_statesr   c                 C   s&  |j \}}}}|| j dks|| j dkrtdƒ‚|| j }|| j }| |||| j|| j¡}| dddddd¡ ¡ }| ||| || j | j ¡}|  |¡}|  |¡}tj	||gdd}|j d }| j
d |…d |…f  || d	¡}tj|| jf|j|jd
}	tj	|	|gddd }
||
  |j¡S )Nr   z0Height and width must be divisible by patch sizer   r4   r   r6   r®   r   r   r¯   )N.)r   r   r@   r±   r²   rá   rŒ   rÑ   r   r!   rR   rE   rg   rî   r   r   r¡   )r”   rð   rñ   r¹   rº   r   r‘   Útext_lengthÚimage_pos_embedÚtext_pos_embedrR   r)   r)   r*   r£   !  s$   




"ÿzCogView3PlusPatchEmbed.forward)r_   rì   r   rÁ   rí   ©	r¤   r¥   r¦   rB   r‡   r   rê   r£   r¨   r)   r)   r˜   r*   rë     s$    úþýüûú$rë   TÚlinspaceÚthetaÚuse_realÚ	grid_typeÚmax_sizec	                    sø  |durt dƒ‚|dkrY|\}	}
|\‰ ‰tj|	d |
d ˆ d  ˆ  ˆ |tjd}tj|	d |
d ˆd  ˆ ˆ|tjd}tjˆ|tjd}tjdˆˆd  ˆ ˆ|tjd}n,|dkr|\}}|\‰ ‰tj||tjd}tj||tjd}tjˆ|tjd}nt dƒ‚| d	 }| d
 d }| d
 d }t|||dd}t|||dd}t|||dd}‡ ‡‡fdd„}|\}}|\}}|\}}|dkrì|dˆ… |dˆ… }}|dˆ … |dˆ … }}|dˆ… |dˆ… }}||||ƒ}||||ƒ}||fS )aÏ  
    RoPE for video tokens with 3D structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size, corresponding to hidden_size_head.
    crops_coords (`tuple[int]`):
        The top-left and bottom-right coordinates of the crop.
    grid_size (`tuple[int]`):
        The grid size of the spatial positional embedding (height, width).
    temporal_size (`int`):
        The size of the temporal dimension.
    theta (`float`):
        Scaling factor for frequency computation.
    grid_type (`str`):
        Whether to use "linspace" or "slice" to compute grids.

    Returns:
        `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`.
    TzJ `use_real = False` is not currently supported for get_3d_rotary_pos_embedrö   r   r   r7   Úslicez%Invalid value passed for `grid_type`.r4   é   r6   )r÷   rø   c                    s˜   | d d …d d d d …f   dˆ ˆd¡} |d d d …d d d …f   ˆdˆd¡}|d d d d …d d …f   ˆˆ dd¡}tj| ||gdd}| ˆˆ  ˆ d¡}|S )Nr   r   )Úexpandr   r!   r±   )Úfreqs_tÚfreqs_hÚfreqs_wÚfreqs©Úgrid_size_hÚgrid_size_wr0   r)   r*   Úcombine_time_height_width‚  s    ÿÿÿ
ÿÿz:get_3d_rotary_pos_embed.<locals>.combine_time_height_widthN)r@   r   rö   r   r   Úget_1d_rotary_pos_embed)r.   Úcrops_coordsra   r0   r÷   rø   rù   rú   r   r   ÚstoprL   rM   rP   Úmax_hÚmax_wÚdim_tÚdim_hÚdim_wrþ   rÿ   r   r  Út_cosÚt_sinÚh_cosÚh_sinÚw_cosÚw_sinr#   r"   r)   r  r*   Úget_3d_rotary_pos_embed>  sN    ÿ ÿÿr  ©r,   r,   r,   rd   c                 C   sú   |\}}|\}	}
|\}}}t jd||d  | ||t jd}t j|d |d |	d  |	 |	|t jd}t j|d |d |
d  |
 |
|t jd}| d }| d }| d }t||| |ddd}t||| |ddd}t||| |ddd}||||||fS )Nr   r   r7   r6   TF)r÷   rø   Úrepeat_interleave_real)r   rö   r   r  )r.   r  ra   r0   rd   r÷   r   r   r  r  r  Úinterpolation_scale_tÚinterpolation_scale_hÚinterpolation_scale_wrP   rL   rM   r  r  r  rþ   rÿ   r   r)   r)   r*   Úget_3d_rotary_pos_embed_allegro£  s2   

ÿ ÿ ÿÿÿÿr  c                 C   sì   |dkrd}t dd|dd t| |||dS |\}}tj|d |d |d d	  |d  |d |tjd
}	tj|d	 |d	 |d	 d	  |d	  |d	 |tjd
}
tj|
|	dd}tj|dd}| dd	g|jd	d… ¢¡}t	| ||d}|S )ak  
    RoPE for image tokens with 2d structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size
    crops_coords (`tuple[int]`)
        The top-left and bottom-right coordinates of the crop.
    grid_size (`tuple[int]`):
        The grid size of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.
    device: (`torch.device`, **optional**):
        The device used to create tensors.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r-   r`   rT   rU   FrV   )r.   r  ra   rø   r   r   r7   r8   r9   r   r   N©rø   )
r   Ú_get_2d_rotary_pos_embed_npr   rö   r   rC   rD   rE   r   Ú!get_2d_rotary_pos_embed_from_grid)r.   r  ra   rø   r   r3   r^   r   r  rL   rM   rN   rR   r)   r)   r*   Úget_2d_rotary_pos_embedÎ  s,   ÿü,ÿ,ÿr  c           
      C   s”   |\}}t j|d |d |d dt jd}t j|d |d |d dt jd}t  ||¡}t j|dd}| ddg|jdd… ¢¡}t| ||d}	|	S )	a  
    RoPE for image tokens with 2d structure.

    Args:
    embed_dim: (`int`):
        The embedding dimension size
    crops_coords (`tuple[int]`)
        The top-left and bottom-right coordinates of the crop.
    grid_size (`tuple[int]`):
        The grid size of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r   F)Úendpointr   r   rY   r   Nr  )r-   rö   r   rC   rD   rE   r   r  )
r.   r  ra   rø   r   r  rL   rM   rN   rR   r)   r)   r*   r     s   ""r  c                 C   sž   | d dksJ ‚t | d |d  d¡|d}t | d |d  d¡|d}|rDtj|d |d gdd}tj|d |d gdd}||fS tj||gdd}|S )a½  
    Get 2D RoPE from grid.

    Args:
    embed_dim: (`int`):
        The embedding dimension size, corresponding to hidden_size_head.
    grid (`np.ndarray`):
        The grid of the positional embedding.
    use_real (`bool`):
        If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r4   r   r   r   r  r   r   )r  rE   r   r!   )r.   rN   rø   rk   rl   r#   r"   r(   r)   r)   r*   r    s   ÿÿr  c                 C   s”   | d dksJ ‚t | d |||d}t | d |||d}| |d| d d¡ d|dd¡}| d|| d d¡ |ddd¡}tj||gdd d¡}|S )as  
    Get 2D RoPE from grid.

    Args:
    embed_dim: (`int`):
        The embedding dimension size, corresponding to hidden_size_head.
    grid (`np.ndarray`):
        The grid of the positional embedding.
    linear_factor (`float`):
        The linear factor of the positional embedding, which is used to scale the positional embedding in the linear
        layer.
    ntk_factor (`float`):
        The ntk factor of the positional embedding, which is used to scale the positional embedding in the ntk layer.

    Returns:
        `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`.
    r4   r   r   )Úlinear_factorÚ
ntk_factorr   r   r   )r  r±   r\   r   r!   rˆ   )r.   Úlen_hÚlen_wr   r!  rk   rl   r(   r)   r)   r*   Úget_2d_rotary_pos_embed_lumina>  s   ÿÿ  r$  ç     ˆÃ@r   rm   c              	   C   s>  | d dksJ ‚t |tƒrt |¡}t |tjƒrt |¡}|| }d|tjd| d||jd|    | }t ||¡}|jj	dk}	|	rF| 
¡ }|rp|rp| ¡ jdd|jd d d 
¡ }
| ¡ jdd|jd d d 
¡ }|
|fS |r”tj| ¡ | ¡ gdd	 
¡ }
tj| ¡ | ¡ gdd	 
¡ }|
|fS t t |¡|¡}|S )
a  
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.

    This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end
    index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
    data type.

    Args:
        dim (`int`): Dimension of the frequency tensor.
        pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
        theta (`float`, *optional*, defaults to 10000.0):
            Scaling factor for frequency computation. Defaults to 10000.0.
        use_real (`bool`, *optional*):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        linear_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the context extrapolation. Defaults to 1.0.
        ntk_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
        repeat_interleave_real (`bool`, *optional*, defaults to `True`):
            If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
            Otherwise, they are concateanted with themselves.
        freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
            the dtype of the frequency tensor.
    Returns:
        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
    r   r   r,   r¯   Únpur   r=   r   r   )rA   rB   r   r   r-   ÚndarrayÚ
from_numpyr   rs   rq   r    r#   rH   r   r"   r!   ÚpolarÚ	ones_like)r   rm   r÷   rø   r   r!  r  Úfreqs_dtyper  Úis_npuÚ	freqs_cosÚ	freqs_sinr¶   r)   r)   r*   r  _  s,   $


$ÿ""r  r   rµ   r¶   Úuse_real_unbind_dimÚsequence_dimc                 C   sÌ  |r¼|\}}|dkr#|dddd…dd…f }|dddd…dd…f }n%|dkr@|ddd…ddd…f }|ddd…ddd…f }nt d|› dƒ‚| | j¡| | j¡}}|dkr}| jg | jdd… ¢d‘d‘R Ž  d¡\}}tj| |gdd d¡}	n-|d	kr¢| jg | jdd… ¢d‘d‘R Ž  d	¡\}}tj	| |gdd}	nt d
|› dƒ‚|  
¡ | |	 
¡ |   | j¡}
|
S t |  
¡ jg | jdd… ¢d‘d‘R Ž ¡}	| d¡}t |	| ¡ d¡}| | ¡S )a3  
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        tuple[torch.Tensor, torch.Tensor]: tuple of modified query tensor and key tensor with rotary embeddings.
    r   Nr   z`sequence_dim=z` but should be 1 or 2.r   r   r6   rŸ   z`use_real_unbind_dim=z` but should be -1 or -2.)r@   r¡   r   rE   r   Úunbindr   rD   rˆ   r!   r    r   Úview_as_complexr“   Úview_as_realÚtype_as)rµ   r¶   rø   r/  r0  r#   r"   Úx_realÚx_imagÚ	x_rotatedru   Úx_outr)   r)   r*   Úapply_rotary_emb¢  s.   ,, ,

r9  c                 C   s€   dd„ }|\\}}\}}\}}	| j ddd\}
}}||
|d ||ƒ}
|||d ||ƒ}|||d ||	ƒ}tj|
||gdd} | S )	Nc                 S   s²   t  ||¡d d …d d d …d d …f }t  ||¡d d …d d d …d d …f }| dd | jd d …f | d| jd d d …f }}tj| |fdd}|  ¡ | | ¡ |   | j¡S )N.r   r   r   )ÚFÚ	embeddingr   r   r!   r    r¡   r   )Útokensrm   r#   r"   Úx1Úx2Útokens_rotatedr)   r)   r*   Úapply_1d_ropeÝ  s
   $$6 z/apply_rotary_emb_allegro.<locals>.apply_1d_roper6   r   r   r   r   r   )Úchunkr   r!   )rµ   r¶   Ú	positionsr@  r  r  r  r  r  r  ÚtÚhÚwr)   r)   r*   Úapply_rotary_emb_allegroÛ  s   rF  c                       sL   e Zd Z					ddedededededB f
‡ fd	d
„Zddd„Z‡  ZS )ÚTimestepEmbeddingÚsiluNTr•   Útime_embed_dimÚact_fnÚout_dimÚpost_act_fnc           	         sˆ   t ƒ  ¡  t |||¡| _|d urtj||dd| _nd | _t|ƒ| _|d ur*|}n|}t |||¡| _|d u r=d | _	d S t|ƒ| _	d S )NF©r€   )
r†   r‡   r   r­   Úlinear_1Ú	cond_projr   ÚactÚlinear_2Úpost_act)	r”   r•   rI  rJ  rK  rL  Úcond_proj_dimÚsample_proj_biasÚtime_embed_dim_outr˜   r)   r*   r‡   î  s   



zTimestepEmbedding.__init__c                 C   sV   |d ur||   |¡ }|  |¡}| jd ur|  |¡}|  |¡}| jd ur)|  |¡}|S rè   )rO  rN  rP  rQ  rR  )r”   ÚsampleÚ	conditionr)   r)   r*   r£     s   





zTimestepEmbedding.forward)rH  NNNTrè   )r¤   r¥   r¦   rB   Ústrr‡   r£   r¨   r)   r)   r˜   r*   rG  í  s$    øþýüûú rG  c                	       sF   e Zd Zddedededef‡ fdd„Zdejd	ejfd
d„Z	‡  Z
S )Ú	Timestepsr   Únum_channelsr   r   r   c                    s&   t ƒ  ¡  || _|| _|| _|| _d S rè   )r†   r‡   rZ  r   r   r   )r”   rZ  r   r   r   r˜   r)   r*   r‡     s
   

zTimesteps.__init__r
   r   c                 C   s   t || j| j| j| jd}|S )N)r   r   r   )r+   rZ  r   r   r   )r”   r
   Út_embr)   r)   r*   r£   %  s   ûzTimesteps.forward)r   )r¤   r¥   r¦   rB   ré   r    r‡   r   rê   r£   r¨   r)   r)   r˜   r*   rY    s     rY  c                       s6   e Zd ZdZ	ddedef‡ fdd	„Zd
d„ Z‡  ZS )ÚGaussianFourierProjectionz-Gaussian Fourier embeddings for noise levels.é   r,   TFÚembedding_sizer   c                    sf   t ƒ  ¡  tjt |¡| dd| _|| _|| _|r1| `tjt |¡| dd| _	| j	| _| `	d S d S )NFrÓ   )
r†   r‡   r   Ú	Parameterr   ÚrandnÚweightr   r   ÚW)r”   r^  r   Úset_W_to_weightr   r   r˜   r)   r*   r‡   3  s   
ûz"GaussianFourierProjection.__init__c                 C   s†   | j rt  |¡}|d d …d f | jd d d …f  d tj }| jr2tjt |¡t |¡gdd}|S tjt |¡t |¡gdd}|S )Nr   r   r   )	r   r   ra  r-   Úpir   r!   r#   r"   )r”   rµ   Úx_projru   r)   r)   r*   r£   B  s   
,ÿz!GaussianFourierProjection.forward)r]  r,   TTF)	r¤   r¥   r¦   r§   rB   r    r‡   r£   r¨   r)   r)   r˜   r*   r\  0  s    ÿÿÿr\  c                       s4   e Zd ZdZd	dedef‡ fdd„Zdd„ Z‡  ZS )
ÚSinusoidalPositionalEmbeddinga[  Apply positional information to a sequence of embeddings.

    Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
    them

    Args:
        embed_dim: (int): Dimension of the positional embedding.
        max_seq_length: Maximum sequence length to apply positional embeddings

    é    r.   Úmax_seq_lengthc                    s    t ƒ  ¡  t |¡ d¡}t t d|d¡t d¡ |  ¡}t d||¡}t 	|| ¡|dd d …dd d…f< t 
|| ¡|dd d …dd d…f< |  d|¡ d S )Nr   r   r   r%  Úpe)r†   r‡   r   r   r“   r   r   r   rg   r"   r#   r’   )r”   r.   rh  ÚpositionÚdiv_termri  r˜   r)   r*   r‡   [  s   
$""z&SinusoidalPositionalEmbedding.__init__c                 C   s*   |j \}}}|| jd d …d |…f  }|S rè   )r   ri  )r”   rµ   Ú_Ú
seq_lengthr)   r)   r*   r£   d  s   z%SinusoidalPositionalEmbedding.forward)rg  ©r¤   r¥   r¦   r§   rB   r‡   r£   r¨   r)   r)   r˜   r*   rf  O  s    	rf  c                       s:   e Zd ZdZdedededef‡ fdd„Zdd	„ Z‡  ZS )
ÚImagePositionalEmbeddingsa•  
    Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
    height and width of the latent space.

    For more details, see figure 10 of the dall-e paper: https://huggingface.co/papers/2102.12092

    For VQ-diffusion:

    Output vector embeddings are used as input for the transformer.

    Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.

    Args:
        num_embed (`int`):
            Number of embeddings for the latent pixels embeddings.
        height (`int`):
            Height of the latent image i.e. the number of height embeddings.
        width (`int`):
            Width of the latent image i.e. the number of width embeddings.
        embed_dim (`int`):
            Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
    Ú	num_embedr   r‘   r.   c                    sV   t ƒ  ¡  || _|| _|| _|| _t | j|¡| _t | j|¡| _	t | j|¡| _
d S rè   )r†   r‡   r   r‘   rp  r.   r   Ú	Embeddingr(   Ú
height_embÚ	width_emb)r”   rp  r   r‘   r.   r˜   r)   r*   r‡   ‚  s   
z"ImagePositionalEmbeddings.__init__c                 C   s¨   |   |¡}|  tj| j|jd d| j¡¡}| d¡}|  tj| j	|jd d| j	¡¡}| d¡}|| }| d| j| j	 d¡}||d d …d |j
d …d d …f  }|S )Nrà   r   r   r   )r(   rr  r   r   r   r   r±   r“   rs  r‘   r   )r”   Úindexr(   rr  rs  Úpos_embr)   r)   r*   r£   ”  s   
"
"
$z!ImagePositionalEmbeddings.forwardrn  r)   r)   r˜   r*   ro  j  s    þýüûro  c                       s<   e Zd ZdZ‡ fdd„Zd
dd„Zd
dejfdd	„Z‡  Z	S )ÚLabelEmbeddinga7  
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.

    Args:
        num_classes (`int`): The number of classes.
        hidden_size (`int`): The size of the vector embeddings.
        dropout_prob (`float`): The probability of dropping a label.
    c                    s4   t ƒ  ¡  |dk}t || |¡| _|| _|| _d S ©Nr   )r†   r‡   r   rq  Úembedding_tableÚnum_classesÚdropout_prob)r”   ry  rî   rz  Úuse_cfg_embeddingr˜   r)   r*   r‡   µ  s
   

zLabelEmbedding.__init__Nc                 C   sH   |du rt j|jd |jd| jk }nt  |dk¡}t  || j|¡}|S )zB
        Drops labels to enable classifier-free guidance.
        Nr   rà   r   )r   Úrandr   r   rz  ÚtensorÚwherery  )r”   ÚlabelsÚforce_drop_idsÚdrop_idsr)   r)   r*   Ú
token_drop¼  s
   zLabelEmbedding.token_dropr  c                 C   s6   | j dk}| jr
|s|d ur|  ||¡}|  |¡}|S rw  )rz  Útrainingr‚  rx  )r”   r  r€  Úuse_dropoutÚ
embeddingsr)   r)   r*   r£   Ç  s
   

zLabelEmbedding.forwardrè   )
r¤   r¥   r¦   r§   r‡   r‚  r   Ú
LongTensorr£   r¨   r)   r)   r˜   r*   rv  «  s
    	
rv  c                	       sN   e Zd Z				ddedededef‡ fdd	„Zd
ejdejfdd„Z‡  ZS )ÚTextImageProjectioné   r{   é
   rÈ   Úimage_embed_dimÚcross_attention_dimÚnum_image_text_embedsc                    s6   t ƒ  ¡  || _t || j| ¡| _t ||¡| _d S rè   )r†   r‡   rŒ  r   r­   rÝ   rÑ   )r”   rÈ   rŠ  r‹  rŒ  r˜   r)   r*   r‡   Ð  s   
zTextImageProjection.__init__rÜ   rÝ   c                 C   s@   |j d }|  |¡}| || jd¡}|  |¡}tj||gddS )Nr   r   r   r   )r   rÝ   rE   rŒ  rÑ   r   r!   )r”   rÜ   rÝ   r¹   Úimage_text_embedsr)   r)   r*   r£   Ý  s
   


zTextImageProjection.forward)rˆ  r{   r{   r‰  rõ   r)   r)   r˜   r*   r‡  Ï  s    ûþýüûr‡  c                       sB   e Zd Z			ddededef‡ fdd„Zdejfd	d
„Z‡  ZS )ÚImageProjectionr{   rg  rŠ  r‹  rŒ  c                    s4   t ƒ  ¡  || _t || j| ¡| _t |¡| _d S rè   )r†   r‡   rŒ  r   r­   rÝ   r   rŽ   )r”   rŠ  r‹  rŒ  r˜   r)   r*   r‡   ë  s   
zImageProjection.__init__rÝ   c                 C   s>   |j d }|  | | jjj¡¡}| || jd¡}|  |¡}|S )Nr   r   )r   rÝ   r¡   ra  r   rE   rŒ  rŽ   )r”   rÝ   r¹   r)   r)   r*   r£   ÷  s
   

zImageProjection.forward)r{   r{   rg  rõ   r)   r)   r˜   r*   rŽ  ê  s    üþýürŽ  c                       s.   e Zd Zd‡ fdd„	Zdejfdd„Z‡  ZS )ÚIPAdapterFullImageProjectionrˆ  c                    s8   t ƒ  ¡  ddlm} |||ddd| _t |¡| _d S ©Nr   ©ÚFeedForwardÚgelu)ÚmultÚactivation_fn)r†   r‡   Ú	attentionr’  Úffr   r   rŽ   )r”   rŠ  r‹  r’  r˜   r)   r*   r‡     s   
z%IPAdapterFullImageProjection.__init__rÝ   c                 C   s   |   |  |¡¡S rè   )rŽ   r—  )r”   rÝ   r)   r)   r*   r£   	  s   z$IPAdapterFullImageProjection.forward)rˆ  rˆ  ©r¤   r¥   r¦   r‡   r   rê   r£   r¨   r)   r)   r˜   r*   r    s    r  c                       s.   e Zd Zd‡ fdd„	Zdejfdd„Z‡  ZS )	ÚIPAdapterFaceIDImageProjectionrˆ  r   c                    sH   t ƒ  ¡  ddlm} || _|| _|||| |dd| _t |¡| _	d S r  )
r†   r‡   r–  r’  Ú
num_tokensr‹  r—  r   r   rŽ   )r”   rŠ  r‹  r”  rš  r’  r˜   r)   r*   r‡     s   
z'IPAdapterFaceIDImageProjection.__init__rÝ   c                 C   s&   |   |¡}| d| j| j¡}|  |¡S )Nr   )r—  rE   rš  r‹  rŽ   )r”   rÝ   rµ   r)   r)   r*   r£     s   

z&IPAdapterFaceIDImageProjection.forward)rˆ  rˆ  r   r   r˜  r)   r)   r˜   r*   r™    s    	r™  c                       s(   e Zd Zd‡ fdd„	Zddd„Z‡  ZS )	ÚCombinedTimestepLabelEmbeddingsçš™™™™™¹?c                    s:   t ƒ  ¡  tdddd| _td|d| _t|||ƒ| _d S )Nr]  Tr   ©rZ  r   r   ©r•   rI  )r†   r‡   rY  Ú	time_projrG  Útimestep_embedderrv  Úclass_embedder)r”   ry  r   Úclass_dropout_probr˜   r)   r*   r‡     s   
z(CombinedTimestepLabelEmbeddings.__init__Nc                 C   s2   |   |¡}|  |j|d¡}|  |¡}|| }|S ©NrX   )rŸ  r   r¡   r¡  )r”   ÚtimestepÚclass_labelsÚhidden_dtypeÚtimesteps_projÚtimesteps_embÚconditioningr)   r)   r*   r£   %  s
   

z'CombinedTimestepLabelEmbeddings.forward)rœ  rè   ©r¤   r¥   r¦   r‡   r£   r¨   r)   r)   r˜   r*   r›    s    r›  c                       ó$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )Ú"CombinedTimestepTextProjEmbeddingsc                    s<   t ƒ  ¡  tdddd| _td|d| _t||dd| _d S ©Nr]  Tr   r  rž  rH  ©rJ  )r†   r‡   rY  rŸ  rG  r   ÚPixArtAlphaTextProjectionÚtext_embedder©r”   r   Úpooled_projection_dimr˜   r)   r*   r‡   1  s   
z+CombinedTimestepTextProjEmbeddings.__init__c                 C   s4   |   |¡}|  |j|jd¡}|  |¡}|| }|S r£  )rŸ  r   r¡   r   r°  )r”   r¤  Úpooled_projectionr§  r¨  Úpooled_projectionsr©  r)   r)   r*   r£   8  s
   

z*CombinedTimestepTextProjEmbeddings.forwardrª  r)   r)   r˜   r*   r¬  0  s    r¬  c                       r«  )Ú*CombinedTimestepGuidanceTextProjEmbeddingsc                    sJ   t ƒ  ¡  tdddd| _td|d| _td|d| _t||dd| _d S r­  )	r†   r‡   rY  rŸ  rG  r   Úguidance_embedderr¯  r°  r±  r˜   r)   r*   r‡   D  s
   
z3CombinedTimestepGuidanceTextProjEmbeddings.__init__c                 C   sZ   |   |¡}|  |j|jd¡}|   |¡}|  |j|jd¡}|| }|  |¡}	||	 }
|
S r£  )rŸ  r   r¡   r   r¶  r°  )r”   r¤  Úguidancer³  r§  r¨  Úguidance_projÚguidance_embÚtime_guidance_embr´  r©  r)   r)   r*   r£   L  s   


z2CombinedTimestepGuidanceTextProjEmbeddings.forwardrª  r)   r)   r˜   r*   rµ  C  s    rµ  c                       s^   e Zd Zddedededef‡ fdd„Zdejd	ejd
ejdejdejdejfdd„Z‡  Z	S )Ú&CogView3CombinedTimestepSizeEmbeddingsr]  r   Úcondition_dimr²  Útimesteps_dimc                    sL   t ƒ  ¡  t|ddd| _t|ddd| _t||d| _t||dd| _d S )NTr   r  rž  rH  r®  )	r†   r‡   rY  rŸ  Úcondition_projrG  r   r¯  Úcondition_embedder)r”   r   r¼  r²  r½  r˜   r)   r*   r‡   \  s
   
z/CogView3CombinedTimestepSizeEmbeddings.__init__r¤  Úoriginal_sizeÚtarget_sizeÚcrop_coordsr¦  r   c                 C   s¢   |   |¡}|  | ¡ ¡ | d¡d¡}|  | ¡ ¡ | d¡d¡}|  | ¡ ¡ | d¡d¡}	tj|||	gdd}
|  |j|d¡}|  	|
j|d¡}|| }|S )Nr   r   r   r   rX   )
rŸ  r¾  rˆ   r±   r°   r   r!   r   r¡   r¿  )r”   r¤  rÀ  rÁ  rÂ  r¦  r§  Úoriginal_size_projÚcrop_coords_projÚtarget_size_projr¾  r¨  Úcondition_embr©  r)   r)   r*   r£   d  s   
z.CogView3CombinedTimestepSizeEmbeddings.forward)r]  )
r¤   r¥   r¦   rB   r‡   r   rê   r   r£   r¨   r)   r)   r˜   r*   r»  [  s     þýüûúùr»  c                	       s8   e Zd Zd
dedededef‡ fdd„Zdd	„ Z‡  ZS )ÚHunyuanDiTAttentionPoolNÚspacial_dimr.   Ú	num_headsÚ
output_dimc                    sp   t ƒ  ¡  t t |d |¡|d  ¡| _t ||¡| _t ||¡| _	t ||¡| _
t ||p0|¡| _|| _d S )Nr   rƒ   )r†   r‡   r   r_  r   r`  Úpositional_embeddingr­   Úk_projÚq_projÚv_projÚc_projrÉ  )r”   rÈ  r.   rÉ  rÊ  r˜   r)   r*   r‡     s   
 
z HunyuanDiTAttentionPool.__init__c              	   C   s  |  ddd¡}tj|jddd|gdd}|| jd d …d d d …f  |j¡ }tjdi d|d d… “d|“d	|“d
|j	d “d| j
“d| jj“d| jj“d| jj“dd “dt | jj| jj| jjg¡“dd “dd “dd“dd“d| jj“d| jj“dd“d| j“dd“Ž\}}| d¡S )Nr   r   r   T©r   Úkeepdimr   ÚqueryÚkeyÚvalueÚembed_dim_to_checkr   rÉ  Úq_proj_weightÚk_proj_weightÚv_proj_weightÚin_proj_weightÚin_proj_biasÚbias_kÚbias_vÚadd_zero_attnFÚ	dropout_pÚout_proj_weightÚout_proj_biasÚuse_separate_proj_weightrƒ  Úneed_weightsr)   )r²   r   r!   ÚmeanrË  r¡   r   r:  Úmulti_head_attention_forwardr   rÉ  rÍ  ra  rÌ  rÎ  r€   rÏ  rƒ  Úsqueeze)r”   rµ   rl  r)   r)   r*   r£   ˆ  sV   $
ÿþý
üûúùø	÷
öõôóòñðïîí
zHunyuanDiTAttentionPool.forwardrè   ©r¤   r¥   r¦   rB   r‡   r£   r¨   r)   r)   r˜   r*   rÇ  |  s     	rÇ  c                       s0   e Zd Z				d
‡ fdd„	Zddd	„Z‡  ZS )Ú-HunyuanCombinedTimestepTextSizeStyleEmbeddingrˆ  r]  é   Tc                    sŽ   t ƒ  ¡  tdddd| _td|d| _tdddd| _t||d|d| _|| _	|r8t
 d|¡| _d	| | }n|}t||d
 |dd| _d S )Nr]  Tr   r  rž  rü   )rÉ  rÊ  r   é   r4   Ú	silu_fp32)r«   rî   r¬   rJ  )r†   r‡   rY  rŸ  rG  r   Ú	size_projrÇ  ÚpoolerÚ"use_style_cond_and_image_meta_sizer   rq  Ústyle_embedderr¯  Úextra_embedder)r”   r   r²  Úseq_lenr‹  rí  Úextra_in_dimr˜   r)   r*   r‡   ¥  s$   
ÿüz6HunyuanCombinedTimestepTextSizeStyleEmbedding.__init__Nc                 C   s–   |   |¡}|  |j|d¡}|  |¡}| jr:|  | d¡¡}|j|d}| dd¡}|  |¡}	tj	|||	gdd}
ntj	|gdd}
||  
|
¡ }|S )NrX   r   ré  r   r   )rŸ  r   r¡   rì  rí  rë  r±   rî  r   r!   rï  )r”   r¤  rñ   Úimage_meta_sizeÚstyler¦  r§  r¨  r´  Ústyle_embeddingÚ
extra_condr©  r)   r)   r*   r£   Ç  s   


z5HunyuanCombinedTimestepTextSizeStyleEmbedding.forward)rˆ  r]  rè  Trè   rª  r)   r)   r˜   r*   rç  ¤  s    ú"rç  c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )	Ú&LuminaCombinedTimestepCaptionEmbeddingrÁ   rè  r]  c                    sL   t ƒ  ¡  t|ddd| _t||d| _t t |¡tj	||dd¡| _
d S )NTç        r  rž  rM  )r†   r‡   rY  rŸ  rG  r   r   Ú
Sequentialr   r­   Úcaption_embedder)r”   rî   r‹  Úfrequency_embedding_sizer˜   r)   r*   r‡   â  s   
ÿý
þz/LuminaCombinedTimestepCaptionEmbedding.__init__c           
      C   sh   |   |¡}|  |j|jd¡}| ¡  d¡}|| jdd|jdd }| |¡}|  |¡}|| }	|	S ©NrX   r   r   r   )rŸ  r   r¡   r   r    r“   Úsumrù  )
r”   r¤  Úcaption_featÚcaption_maskÚ	time_freqÚ
time_embedÚcaption_mask_floatÚcaption_feats_poolÚcaption_embedr©  r)   r)   r*   r£   ó  s   


z.LuminaCombinedTimestepCaptionEmbedding.forward)rÁ   rè  r]  rª  r)   r)   r˜   r*   rö  á  s    rö  c                       sf   e Zd Z		ddedededededd	f‡ fd
d„Z		ddejdejdejdejd	B fdd„Z	‡  Z
S )Ú%MochiCombinedTimestepCaptionEmbeddingr]  rü   r   r²  rÈ   rI  Únum_attention_headsr   Nc                    sJ   t ƒ  ¡  t|ddd| _t||d| _t|||d| _t 	||¡| _
d S )NTr÷  r  rž  )r  r.   rÊ  )r†   r‡   rY  rŸ  rG  r   ÚMochiAttentionPoolrì  r   r­   Úcaption_proj)r”   r   r²  rÈ   rI  r  r˜   r)   r*   r‡     s   
ÿz.MochiCombinedTimestepCaptionEmbedding.__init__r¤  rñ   Úencoder_attention_maskr¦  c           
      C   sB   |   |¡}|  |j|d¡}|  ||¡}|  |¡}|| }	|	|fS r£  )rŸ  r   r¡   rì  r  )
r”   r¤  rñ   r  r¦  rŸ  Útime_embr´  r  r©  r)   r)   r*   r£     s   

z-MochiCombinedTimestepCaptionEmbedding.forward)r]  rü   rè   )r¤   r¥   r¦   rB   r‡   r   r†  rê   r   r£   r¨   r)   r)   r˜   r*   r    s4    úþýüûúùûþýüûr  c                       s4   e Zd Zd	dededef‡ fdd„Zdd„ Z‡  ZS )
ÚTextTimeEmbeddingé@   Úencoder_dimrI  rÉ  c                    s@   t ƒ  ¡  t |¡| _t||ƒ| _t ||¡| _t |¡| _	d S rè   )
r†   r‡   r   r   Únorm1ÚAttentionPoolingÚpoolr­   rŒ   Únorm2)r”   r  rI  rÉ  r˜   r)   r*   r‡   '  s
   
zTextTimeEmbedding.__init__c                 C   s,   |   |¡}|  |¡}|  |¡}|  |¡}|S rè   )r  r  rŒ   r  )r”   rð   r)   r)   r*   r£   .  s
   



zTextTimeEmbedding.forward)r  ræ  r)   r)   r˜   r*   r
  &  s    r
  c                       sB   e Zd Zddededef‡ fdd„Zdejd	ejfd
d„Z‡  ZS )ÚTextImageTimeEmbeddingr{   ré  rÈ   rŠ  rI  c                    s6   t ƒ  ¡  t ||¡| _t |¡| _t ||¡| _d S rè   )r†   r‡   r   r­   rÑ   r   Ú	text_normÚ
image_proj)r”   rÈ   rŠ  rI  r˜   r)   r*   r‡   7  s   
zTextImageTimeEmbedding.__init__rÜ   rÝ   c                 C   s&   |   |¡}|  |¡}|  |¡}|| S rè   )rÑ   r  r  )r”   rÜ   rÝ   Útime_text_embedsÚtime_image_embedsr)   r)   r*   r£   =  s   


zTextImageTimeEmbedding.forward)r{   r{   ré  rõ   r)   r)   r˜   r*   r  6  s    r  c                       s8   e Zd Zd
dedef‡ fdd„Zdejfdd	„Z‡  ZS )ÚImageTimeEmbeddingr{   ré  rŠ  rI  c                    s(   t ƒ  ¡  t ||¡| _t |¡| _d S rè   )r†   r‡   r   r­   r  r   Ú
image_norm©r”   rŠ  rI  r˜   r)   r*   r‡   I  s   
zImageTimeEmbedding.__init__rÝ   c                 C   s   |   |¡}|  |¡}|S rè   )r  r  )r”   rÝ   r  r)   r)   r*   r£   N  s   

zImageTimeEmbedding.forward©r{   ré  rõ   r)   r)   r˜   r*   r  H  s    r  c                       s>   e Zd Zddedef‡ fdd„Zdejdejfd	d
„Z‡  ZS )ÚImageHintTimeEmbeddingr{   ré  rŠ  rI  c                    sâ   t ƒ  ¡  t ||¡| _t |¡| _t tjdddddt 	¡ tjdddddt 	¡ tjddddddt 	¡ tjdddddt 	¡ tjddddddt 	¡ tjdddddt 	¡ tjdd	ddddt 	¡ tjd	d
ddd¡| _
d S )Nr6   r_   r   )Úpaddingrg  r   )r  r   é`   r]  r4   )r†   r‡   r   r­   r  r   r  rø  r‹   ÚSiLUÚinput_hint_blockr  r˜   r)   r*   r‡   V  s(   

ñzImageHintTimeEmbedding.__init__rÝ   Úhintc                 C   s&   |   |¡}|  |¡}|  |¡}||fS rè   )r  r  r  )r”   rÝ   r  r  r)   r)   r*   r£   l  s   


zImageHintTimeEmbedding.forwardr  rõ   r)   r)   r˜   r*   r  U  s    r  c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )r  Nc                    s~   t ƒ  ¡  || _t t d|¡|d  ¡| _tj||| jd| _	tj||| jd| _
tj||| jd| _|| _|| j | _d S )Nr   rƒ   rX   )r†   r‡   r   r   r_  r   r`  rË  r­   rÌ  rÍ  rÎ  rÉ  Údim_per_head)r”   rÉ  r.   r   r˜   r)   r*   r‡   w  s   
zAttentionPooling.__init__c                    sô   |  ¡ \‰ }}‡ ‡fdd„}|jdddˆj |j¡ }tj||gdd}|ˆ |¡ƒ}|ˆ |¡ƒ}|ˆ 	|¡ƒ}dt
 t
 ˆj¡¡ }	t d||	 ||	 ¡}
tj|
 ¡ dd |
j¡}
t d	|
|¡}| ˆ dd¡ dd
¡}|d d …dd d …f S )Nc                    sF   |   ˆ dˆjˆj¡} |  dd¡} |  ˆ ˆj dˆj¡} |  dd¡} | S )Nr   r   r   )r±   rÉ  r   r    rE   )rµ   ©Úbsr”   r)   r*   r   „  s
   z'AttentionPooling.forward.<locals>.shaper   TrÐ  r   zbct,bcs->btsr   zbts,bcs->bctr   r   )r°   rã  rË  r¡   r   r   r!   rÍ  rÌ  rÎ  r   Úsqrtr   rx   Úsoftmaxr    rq   rE   r    )r”   rµ   Úlengthr‘   r   Úclass_tokenÚqÚkÚvr   ra  Úar)   r!  r*   r£     s   zAttentionPooling.forwardrè   rª  r)   r)   r˜   r*   r  t  s    
r  c                	       sx   e Zd Z	ddedededB ddf‡ fdd„Zedd	œd
ejdejdejfdd„ƒZd
ejdej	dejfdd„Z
‡  ZS )r  Nr  r.   rÊ  r   c                    sN   t ƒ  ¡  |p|| _|| _t |d| ¡| _t ||¡| _t || j¡| _d S )Nr   )	r†   r‡   rÊ  r  r   r­   Úto_kvÚto_qÚto_out)r”   r  r.   rÊ  r˜   r)   r*   r‡   §  s   

zMochiAttentionPool.__init__F©rÑ  rµ   r½   c                C   s€   |   d¡|  d¡ksJ ‚|   d¡|  d¡ksJ ‚|dd…dd…df j| jd}||jdddjdd }| | jd|d}|S )a6  
        Pool tokens in x using mask.

        NOTE: We assume x does not require gradients.

        Args:
            x: (B, L, D) tensor of tokens.
            mask: (B, L) boolean tensor indicating which tokens are not padding.

        Returns:
            pooled: (B, D) tensor of pooled tokens.
        r   r   NrX   TrÐ  )Úmin)r°   r¡   r   rü  Úclamp)rµ   r½   rÑ  Úpooledr)   r)   r*   Úpool_tokens¶  s    zMochiAttentionPool.pool_tokensc                 C   s   |  d¡}|dd…dddd…f  ¡ }tj|ddd}| j||dd}tj||gdd}|  |¡}|  |dd…d	f ¡}|| j	 }| 
dd| j	|f¡}| dd
¡}| d¡\}	}
| 
d| j	|f¡}| d¡}tj||	|
|dd}| d¡ dd¡}|  |¡}|S )aP  
        Args:
            x (`torch.Tensor`):
                Tensor of shape `(B, S, D)` of input tokens.
            mask (`torch.Tensor`):
                Boolean ensor of shape `(B, S)` indicating which tokens are not padding.

        Returns:
            `torch.Tensor`:
                `(B, D)` tensor of pooled tokens.
        r   N)r   r   T)rÔ  r.  r   r   r   r6   r÷  )Ú	attn_maskrÞ  )r°   ré   r:  r%   r2  r   r!   r+  r,  r  Ú	unflattenr    r1  r“   Úscaled_dot_product_attentionrå  rˆ   r-  )r”   rµ   r½   ÚDr3  Úx_poolÚkvr'  Úhead_dimr(  r)  r)   r)   r*   r£   Ë  s"   




zMochiAttentionPool.forwardrè   )r¤   r¥   r¦   rB   r‡   Ústaticmethodr   rê   r2  Ú
BoolTensorr£   r¨   r)   r)   r˜   r*   r  ¦  s    üþýüû$$r  c                 C   sŠ   |j dd… \}}dt | ¡|   }|d j|j|jd}|| d¡ }tj| ¡ | 	¡ fdd}| 
dd	d
dd¡ ||| d d ¡}|S )zÉ
    Args:
        embed_dim: int
        box: a 3-D tensor [B x N x 4] representing the bounding boxes for GLIGEN pipeline
    Returns:
        [B x N x embed_dim] tensor of positional embeddings
    Nr   éd   )NNNr7   r   r   r   r   r6   r4   )r   r   r   r¡   r   r   r“   rD   r"   r#   r²   rE   )r.   Úboxr¹   Ú	num_boxesr(   r)   r)   r*   Ú#get_fourier_embeds_from_boundingboxø  s   	$r?  c                       s2   e Zd Zd‡ fdd„	Z					d	dd„Z‡  ZS )
ÚGLIGENTextBoundingboxProjectionú	text-onlyrü   c              
      sf  t ƒ  ¡  || _|| _|| _|d d | _t|tƒr|d }|dkrMt 	t 
| j| j d¡t ¡ t 
dd¡t ¡ t 
d|¡¡| _tj t | jg¡¡| _nX|dkr¥t 	t 
| j| j d¡t ¡ t 
dd¡t ¡ t 
d|¡¡| _t 	t 
| j| j d¡t ¡ t 
dd¡t ¡ t 
d|¡¡| _tj t | jg¡¡| _tj t | jg¡¡| _tj t | jg¡¡| _d S )Nr   r4   r   rA  é   z
text-image)r†   r‡   Úpositive_lenrK  Úfourier_embedder_dimÚposition_dimrA   Útupler   rø  r­   r  Úlinearsr   r_  rg   Únull_positive_featureÚlinears_textÚlinears_imageÚnull_text_featureÚnull_image_featureÚnull_position_feature)r”   rC  rK  Úfeature_typeÚfourier_freqsr˜   r)   r*   r‡     sD   



û

û

ûz(GLIGENTextBoundingboxProjection.__init__Nc                 C   s$  |  d¡}t| j|ƒ}| j ddd¡}	|| d| |	  }|d urA| j ddd¡}
|| d| |
  }|  tj||gdd¡}|S |  d¡}|  d¡}| j	 ddd¡}| j
 ddd¡}|| d| |  }|| d| |  }|  tj||gdd¡}|  tj||gdd¡}tj||gdd}|S )Nr   r   r   )r“   r?  rD  rM  r±   rH  rG  r   r!   rK  rL  rI  rJ  )r”   ÚboxesÚmasksÚpositive_embeddingsÚphrases_masksÚimage_masksÚphrases_embeddingsÚimage_embeddingsÚxyxy_embeddingÚ	xyxy_nullÚpositive_nullÚobjsÚ	text_nullÚ
image_nullÚ	objs_textÚ
objs_imager)   r)   r*   r£   7  s&   


ñ
z'GLIGENTextBoundingboxProjection.forward)rA  rü   )NNNNNrª  r)   r)   r˜   r*   r@    s    -ør@  c                       s0   e Zd ZdZddef‡ fdd„Zdd„ Z‡  ZS )	Ú)PixArtAlphaCombinedTimestepSizeEmbeddingszº
    For PixArt-Alpha.

    Reference:
    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
    FÚuse_additional_conditionsc                    sl   t ƒ  ¡  || _tdddd| _td|d| _|| _|r4tdddd| _td|d| _	td|d| _
d S d S )Nr]  Tr   r  rž  )r†   r‡   ÚoutdimrY  rŸ  rG  r   r`  Úadditional_condition_projÚresolution_embedderÚaspect_ratio_embedder)r”   r   Úsize_emb_dimr`  r˜   r)   r*   r‡   r  s   
ýz2PixArtAlphaCombinedTimestepSizeEmbeddings.__init__c                 C   s   |   |¡}|  |j|d¡}| jrD|  | ¡ ¡ |¡}|  |¡ |d¡}|  | ¡ ¡ |¡}	|  |	¡ |d¡}	|t	j
||	gdd }
|
S |}
|
S rû  )rŸ  r   r¡   r`  rb  rˆ   rc  rE   rd  r   r!   )r”   r¤  Ú
resolutionÚaspect_ratior¹   r¦  r§  r¨  Úresolution_embÚaspect_ratio_embr©  r)   r)   r*   r£     s   
þz1PixArtAlphaCombinedTimestepSizeEmbeddings.forward©F)r¤   r¥   r¦   r§   ré   r‡   r£   r¨   r)   r)   r˜   r*   r_  j  s    r_  c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )	r¯  zÊ
    Projects caption embeddings. Also handles dropout for classifier-free guidance.

    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
    NÚ	gelu_tanhc                    sŠ   t ƒ  ¡  |d u r|}tj||dd| _|dkr tjdd| _n|dkr*t ¡ | _n|dkr3tƒ | _nt	d|› ƒ‚tj||dd| _
d S )	NTrª   rk  Útanh)ÚapproximaterH  rê  zUnknown activation function: )r†   r‡   r   r­   rN  ÚGELUÚact_1r  r   r@   rQ  )r”   r«   rî   r¬   rJ  r˜   r)   r*   r‡   –  s   

z"PixArtAlphaTextProjection.__init__c                 C   s"   |   |¡}|  |¡}|  |¡}|S rè   )rN  ro  rQ  )r”   Úcaptionrð   r)   r)   r*   r£   ¥  s   


z!PixArtAlphaTextProjection.forward)Nrk  r¾   r)   r)   r˜   r*   r¯    s    r¯  c                       sD   e Zd Z				ddedededed	d
f
‡ fdd„Zdd„ Z‡  ZS )Ú!IPAdapterPlusImageProjectionBlockr{   r  r_   r4   Ú
embed_dimsÚdim_headÚheadsÚ	ffn_ratior   Nc              
      sf   t ƒ  ¡  ddlm} t |¡| _t |¡| _t|||dd| _	t 
t |¡|||d|dd¡| _d S )Nr   r‘  F)Ú	query_dimrs  rt  Úout_biasr“  ©r•  r”  r€   )r†   r‡   r–  r’  r   r   Úln0Úln1r   Úattnrø  r—  )r”   rr  rs  rt  ru  r’  r˜   r)   r*   r‡   ­  s   
ü
þz*IPAdapterPlusImageProjectionBlock.__init__c                 C   sH   |   |¡}|  |¡}tj||gdd}|  ||¡| }|  |¡| }|S )NrŸ   r   )ry  rz  r   r!   r{  r—  )r”   rµ   ÚlatentsÚresidualrñ   r)   r)   r*   r£   Ä  s   

z)IPAdapterPlusImageProjectionBlock.forward)r{   r  r_   r4   )r¤   r¥   r¦   rB   r    r‡   r£   r¨   r)   r)   r˜   r*   rq  ¬  s"    ûþýüûúrq  c                       sn   e Zd ZdZ								dd	ed
edededededededdf‡ fdd„Zdejdejfdd„Z	‡  Z
S )ÚIPAdapterPlusImageProjectionaä  Resampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_queries (int):
            The number of queries. Defaults to 8. ffn_ratio (float): The expansion ratio
        of feedforward network hidden
            layer channels. Defaults to 4.
    r{   rˆ  é   r4   r  r_   rü   rr  Úoutput_dimsÚhidden_dimsÚdepthrs  rt  Únum_queriesru  r   Nc	           	         sx   t ƒ  ¡  t t d|ˆ¡ˆd  ¡| _t |ˆ¡| _t ˆ|¡| _	t 
|¡| _t ‡ ‡‡‡fdd„t|ƒD ƒ¡| _d S )Nr   rƒ   c                    ó   g | ]	}t ˆˆ ˆˆƒ‘qS r)   ©rq  ©Ú.0rl  ©rs  ru  rt  r  r)   r*   Ú
<listcomp>ñ  ó    z9IPAdapterPlusImageProjection.__init__.<locals>.<listcomp>)r†   r‡   r   r_  r   r`  r|  r­   Úproj_inÚproj_outr   Únorm_outÚ
ModuleListÚrangeÚlayers)	r”   rr  r€  r  r‚  rs  rt  rƒ  ru  r˜   rˆ  r*   r‡   Ý  s   

ÿz%IPAdapterPlusImageProjection.__init__rµ   c                 C   sP   | j  | d¡dd¡}|  |¡}| jD ]
}|}||||ƒ}q|  |¡}|  |¡S )z‹Forward pass.

        Args:
            x (torch.Tensor): Input Tensor.
        Returns:
            torch.Tensor: Output Tensor.
        r   r   )r|  r\   r°   r‹  r  rŒ  r  )r”   rµ   r|  Úblockr}  r)   r)   r*   r£   ô  s   



z$IPAdapterPlusImageProjection.forward)r{   rˆ  r  r4   r  r_   rü   r4   ©r¤   r¥   r¦   r§   rB   r    r‡   r   rê   r£   r¨   r)   r)   r˜   r*   r~  Í  s<    ÷þýüûúùø	÷
ör~  c                       s€   e Zd ZdZ												dd
edededededededededededdf‡ fdd„Zdejdejfdd„Z	‡  Z
S )Ú"IPAdapterFaceIDPlusImageProjectiona—  FacePerceiverResampler of IP-Adapter Plus.

    Args:
        embed_dims (int): The feature dimension. Defaults to 768. output_dims (int): The number of output channels,
        that is the same
            number of the channels in the `unet.config.cross_attention_dim`. Defaults to 1024.
        hidden_dims (int):
            The number of hidden channels. Defaults to 1280. depth (int): The number of blocks. Defaults
        to 8. dim_head (int): The number of head channels. Defaults to 64. heads (int): Parallel attention heads.
        Defaults to 16. num_tokens (int): Number of tokens num_queries (int): The number of queries. Defaults to 8.
        ffn_ratio (float): The expansion ratio of feedforward network hidden
            layer channels. Defaults to 4.
        ffproj_ratio (float): The expansion ratio of feedforward network hidden
            layer channels (for ID embeddings). Defaults to 4.
    r{   r  rB  r4   r  r_   rü   r   rr  r€  r  Úid_embeddings_dimr‚  rs  rt  rš  rƒ  ru  Úffproj_ratior   Nc                    s¦   t ƒ  ¡  ddlm} || _ˆ| _d | _d| _d| _||ˆ| d|d| _	t
 ˆ¡| _t
 |ˆ¡| _t
 ˆ|¡| _t
 |¡| _t
 ‡ ‡‡‡fdd„t|ƒD ƒ¡| _d S )	Nr   r‘  Fr,   r“  )r•  r”  c                    s   g | ]	}t ˆˆ ˆˆƒ‘qS r)   r…  r†  ©rs  rr  ru  rt  r)   r*   r‰  9	  rŠ  z?IPAdapterFaceIDPlusImageProjection.__init__.<locals>.<listcomp>)r†   r‡   r–  r’  rš  r.   Úclip_embedsÚshortcutÚshortcut_scalerŒ   r   r   rŽ   r­   r‹  rŒ  r  rŽ  r  r  )r”   rr  r€  r  r”  r‚  rs  rt  rš  rƒ  ru  r•  r’  r˜   r–  r*   r‡   	  s   

ÿz+IPAdapterFaceIDPlusImageProjection.__init__Ú	id_embedsc                 C   s¦   |  | jj¡}|  |¡}| d| j| j¡}|  |¡}|}|  | j¡}| d|j	d |j	d ¡}| j
D ]
}|}||||ƒ}q2|  |¡}|  |¡}| jrQ|| j|  }|S )zŸForward pass.

        Args:
            id_embeds (torch.Tensor): Input Tensor (ID embeds).
        Returns:
            torch.Tensor: Output Tensor.
        r   r   r6   )r¡   r—  r   rŒ   rE   rš  r.   rŽ   r‹  r   r  rŒ  r  r˜  r™  )r”   rš  r|  r—  rµ   r‘  r}  ru   r)   r)   r*   r£   <	  s   




z*IPAdapterFaceIDPlusImageProjection.forward)r{   r{   r  rB  r4   r  r_   r4   rü   r4   r   r’  r)   r)   r˜   r*   r“  	  sN    ôþýüûúùø	÷
öõôó#r“  c                       sb   e Zd ZdZ				ddededed	ed
df
‡ fdd„Zdejdejdejd
ejfdd„Z‡  Z	S )Ú!IPAdapterTimeImageProjectionBlocka¨  Block for IPAdapterTimeImageProjection.

    Args:
        hidden_dim (`int`, defaults to 1280):
            The number of hidden channels.
        dim_head (`int`, defaults to 64):
            The number of head channels.
        heads (`int`, defaults to 20):
            Parallel attention heads.
        ffn_ratio (`int`, defaults to 4):
            The expansion ratio of feedforward network hidden layer channels.
    r  r  é   r4   Ú
hidden_dimrs  rt  ru  r   Nc                    s¶   t ƒ  ¡  ddlm} t |¡| _t |¡| _t||||ddd| _	|||d|dd| _
t ¡ | _t |d| ¡| _t |¡| _dt t |¡¡ | j	_| j	 ¡  d | j	_d | j	_d S )Nr   r‘  F)rv  r‹  rs  rt  r€   rw  r“  rx  r4   )r†   r‡   r–  r’  r   r   ry  rz  r   r{  r—  r  Ú
adaln_silur­   Ú
adaln_projÚ
adaln_normr   r#  r   Úfuse_projectionsÚto_kÚto_v)r”   r  rs  rt  ru  r’  r˜   r)   r*   r‡   f	  s(   
ú

z*IPAdapterTimeImageProjectionBlock.__init__rµ   r|  Útimestep_embc                 C   sÖ  |   |  |¡¡}|jddd\}}}}|}	|  |¡}|  |¡d|dd…df   |dd…df  }|jd }
| j |¡}tj	||fdd}| j 
|¡jddd\}}|jd }|| jj }| |
d| jj|¡ dd¡}| |
d| jj|¡ dd¡}| |
d| jj|¡ dd¡}|| jj || jj  dd¡ }tj| ¡ dd |j¡}|| }| dd¡ |
d| jj| ¡}| jjd |ƒ}| jjd |ƒ}||	 }|}	|  |¡d|dd…df   |dd…df  }|  |¡|	 S )	aA  Forward pass.

        Args:
            x (`torch.Tensor`):
                Image features.
            latents (`torch.Tensor`):
                Latent features.
            timestep_emb (`torch.Tensor`):
                Timestep embedding.

        Returns:
            `torch.Tensor`: Output latent features.
        r4   r   r   Nr   rŸ   r   r   )rŸ  rž  rA  ry  rz  r   r{  r,  r   r!   r+  rt  r±   r    r   r$  r    rq   r   rE   r-  r   r—  )r”   rµ   r|  r¤  r(   Ú	shift_msaÚ	scale_msaÚ	shift_mlpÚ	scale_mlpr}  r¹   rÒ  Úkv_inputrÓ  rÔ  Ú	inner_dimr9  ra  r)   r)   r*   r£   ‡	  s0   
.

 .z)IPAdapterTimeImageProjectionBlock.forward)r  r  rœ  r4   )
r¤   r¥   r¦   r§   rB   r‡   r   rê   r£   r¨   r)   r)   r˜   r*   r›  X	  s$    ûþýüûú*!r›  c                       s   e Zd ZdZ												
ddedededededededededededdf‡ fdd„Zdejdejde	ejejf fdd„Z
‡  ZS )ÚIPAdapterTimeImageProjectionaŠ  Resampler of SD3 IP-Adapter with timestep embedding.

    Args:
        embed_dim (`int`, defaults to 1152):
            The feature dimension.
        output_dim (`int`, defaults to 2432):
            The number of output channels.
        hidden_dim (`int`, defaults to 1280):
            The number of hidden channels.
        depth (`int`, defaults to 4):
            The number of blocks.
        dim_head (`int`, defaults to 64):
            The number of head channels.
        heads (`int`, defaults to 20):
            Parallel attention heads.
        num_queries (`int`, defaults to 64):
            The number of queries.
        ffn_ratio (`int`, defaults to 4):
            The expansion ratio of feedforward network hidden layer channels.
        timestep_in_dim (`int`, defaults to 320):
            The number of input channels for timestep embedding.
        timestep_flip_sin_to_cos (`bool`, defaults to True):
            Flip the timestep embedding order to `cos, sin` (if True) or `sin, cos` (if False).
        timestep_freq_shift (`int`, defaults to 0):
            Controls the timestep delta between frequencies between dimensions.
    é€  é€	  r  r4   r  rœ  é@  Tr   r.   rÊ  r  r‚  rs  rt  rƒ  ru  Útimestep_in_dimÚtimestep_flip_sin_to_cosÚtimestep_freq_shiftr   Nc                    s–   t ƒ  ¡  t t d|ˆ¡ˆd  ¡| _t |ˆ¡| _t ˆ|¡| _	t 
|¡| _t ‡ ‡‡‡fdd„t|ƒD ƒ¡| _t|	|
|ƒ| _t|	ˆdd| _d S )Nr   rƒ   c                    r„  r)   )r›  r†  ©rs  ru  rt  r  r)   r*   r‰  ì	  rŠ  z9IPAdapterTimeImageProjection.__init__.<locals>.<listcomp>rH  r®  )r†   r‡   r   r_  r   r`  r|  r­   r‹  rŒ  r   r  rŽ  r  r  rY  rŸ  rG  Útime_embedding)r”   r.   rÊ  r  r‚  rs  rt  rƒ  ru  r¯  r°  r±  r˜   r²  r*   r‡   Ø	  s   
ÿz%IPAdapterTimeImageProjection.__init__rµ   r¤  c                 C   s†   |   |¡j|jd}|  |¡}| j | d¡dd¡}|  |¡}||dd…df  }| jD ]}||||ƒ}q,|  	|¡}|  
|¡}||fS )a#  Forward pass.

        Args:
            x (`torch.Tensor`):
                Image features.
            timestep (`torch.Tensor`):
                Timestep in denoising process.
        Returns:
            `tuple`[`torch.Tensor`, `torch.Tensor`]: The pair (latents, timestep_emb).
        rX   r   r   N)rŸ  r¡   r   r³  r|  r\   r°   r‹  r  rŒ  r  )r”   rµ   r¤  r¤  r|  r‘  r)   r)   r*   r£   ñ	  s   




z$IPAdapterTimeImageProjection.forward)r¬  r­  r  r4   r  rœ  r  r4   r®  Tr   )r¤   r¥   r¦   r§   rB   ré   r‡   r   rê   rF  r£   r¨   r)   r)   r˜   r*   r«  ¼	  sN    ôþýüûúùø	÷
öõôó.r«  c                       sX   e Zd Zdeej eej B f‡ fdd„Zede	fdd„ƒZ
deej fdd	„Z‡  ZS )
ÚMultiIPAdapterImageProjectionÚIPAdapterImageProjectionLayersc                    s   t ƒ  ¡  t |¡| _d S rè   )r†   r‡   r   rŽ  Úimage_projection_layers)r”   rµ  r˜   r)   r*   r‡   
  s   
z&MultiIPAdapterImageProjection.__init__r   c                 C   s
   t | jƒS )zNumber of IP-Adapters loaded.)r   r¶  )r”   r)   r)   r*   Únum_ip_adapters
  s   
z-MultiIPAdapterImageProjection.num_ip_adaptersrÝ   c                 C   sÚ   g }t |tƒsd}tdd|dd | d¡g}t|ƒt| jƒkr/tdt|ƒ› dt| jƒ› ƒ‚t|| jƒD ]5\}}|jd	 |jd }}| 	|| f|jd
d …  ¡}||ƒ}| 	||f|jdd …  ¡}| 
|¡ q5|S )NzÓYou have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning.zimage_embeds not a listú1.0.0FrV   r   zGimage_embeds must have the same length as image_projection_layers, got z and r   r   )rA   Úlistr   r“   r   r¶  r@   Úzipr   rE   Úappend)r”   rÝ   Úprojected_image_embedsr^   Úimage_embedÚimage_projection_layerr¹   Ú
num_imagesr)   r)   r*   r£   
  s"   
ÿÿz%MultiIPAdapterImageProjection.forward)r¤   r¥   r¦   r¹  r   ÚModulerF  r‡   ÚpropertyrB   r·  r   rê   r£   r¨   r)   r)   r˜   r*   r´  
  s
    "r´  c                   @   s   e Zd Zdd„ ZdS )ÚFluxPosEmbedc                 O   s*   d}t dd|ƒ ddlm} ||i |¤ŽS )NzœImporting and using `FluxPosEmbed` from `diffusers.models.embeddings` is deprecated. Please import it from `diffusers.models.transformers.transformer_flux`.rÂ  r¸  r   )rÂ  )r   Útransformers.transformer_fluxrÂ  )ÚclsÚargsÚkwargsr^   rÂ  r)   r)   r*   Ú__new__6
  s   zFluxPosEmbed.__new__N)r¤   r¥   r¦   rÇ  r)   r)   r)   r*   rÂ  5
  s    rÂ  )Fr   r   r	   )r,   r,   Nr-   )r,   r,   )Fr   r,   r_   Nr-   )r-   )r-   FN)Fr   r,   r_   )r	   Trö   NN)r  r	   N)TNr-   )Trj  )Tr   r   )Rr   Únumpyr-   r   Útorch.nn.functionalr   r$   r:  Úutilsr   Úactivationsr   r   Úattention_processorr   rê   rB   ré   r    r+   rF  r   rX  rS   r'  r?   rh   rF   rG   rf   rj   rp   rÀ  ry   r©   r¿   rë   r  r  r  r  r  r$  r   r  r9  rF  rG  rY  r\  rf  ro  rv  r‡  rŽ  r  r™  r›  r¬  rµ  r»  rÇ  rç  rö  r  r
  r  r  r  r  r  r?  r@  r_  r¯  rq  r~  r“  r›  r«  r´  rÂ  r)   r)   r)   r*   Ú<module>   s|  úÿþýüûú
ù:ùÿþýüûúù
øNûÿþýüû
úEøù
ø
D
!2
ÿ& 6 =÷ûúùø	÷

öjùûúù
ø,ÿÿ
ÿ
2

"$øÿþ
ýFûÿþýüû
ú90A$!(="#2R]%!;PdQ(