o
    Gi:                     @   s~  d dl Zd dlZd dlmZ ddlmZmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ e rUd dlmZ G dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dZ"G dd dZ#G dd deZ$G dd  d ejZ%G d!d" d"ejZ&G d#d$ d$ejZ'G d%d& d&eee	Z(dS )'    N   )ConfigMixinregister_to_config)FromOriginalModelMixin)is_torchvision_available   )FeedForward)dispatch_attention_fn)	Attention)	Timesteps)Transformer2DModelOutput)
ModelMixin)RMSNorm)
transformsc                       sV   e Zd Z	ddededeeeef deddf
 fdd	Zd
ejdejfddZ	  Z
S )CosmosPatchEmbedTin_channelsout_channels
patch_sizebiasreturnNc                    s>   t    || _tj||d  |d  |d  ||d| _d S )Nr      r   r   )super__init__r   nnLinearproj)selfr   r   r   r   	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_cosmos.pyr   $   s   
.zCosmosPatchEmbed.__init__hidden_statesc           
   
   C   sn   |j \}}}}}| j\}}}	||||| ||| |||	 |	}|dddddddddd}| |}|S )	Nr   r         r   r         )shaper   reshapepermuteflattenr   )
r   r"   
batch_sizenum_channels
num_framesheightwidthp_tp_hp_wr    r    r!   forward,   s    
zCosmosPatchEmbed.forward)T)__name__
__module____qualname__inttupleboolr   torchTensorr3   __classcell__r    r    r   r!   r   #   s    r   c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  ZS )
CosmosTimestepEmbeddingin_featuresout_featuresr   Nc                    s@   t    tj||dd| _t | _tj|d| dd| _d S )NFr   r   )r   r   r   r   linear_1SiLU
activationlinear_2)r   r>   r?   r   r    r!   r   8   s   

z CosmosTimestepEmbedding.__init__	timestepsc                 C   s"   |  |}| |}| |}|S N)r@   rB   rC   )r   rD   embr    r    r!   r3   >   s   


zCosmosTimestepEmbedding.forward	r4   r5   r6   r7   r   r:   r;   r3   r<   r    r    r   r!   r=   7   s    r=   c                       sF   e Zd Zdededdf fddZdejdejdejfd	d
Z  Z	S )CosmosEmbeddingembedding_dimcondition_dimr   Nc                    s:   t    t|ddd| _t||| _t|ddd| _d S )NT        )flip_sin_to_cosdownscale_freq_shiftư>epselementwise_affine)r   r   r   	time_projr=   
t_embedderr   norm)r   rI   rJ   r   r    r!   r   F   s   
zCosmosEmbedding.__init__r"   timestepc                 C   s,   |  ||}| |}| |}||fS rE   )rR   type_asrS   rT   )r   r"   rU   timesteps_projtembembedded_timestepr    r    r!   r3   M   s   

zCosmosEmbedding.forward)
r4   r5   r6   r7   r   r:   r;   
LongTensorr3   r<   r    r    r   r!   rH   E   s    $rH   c                	       sT   e Zd Zdededdf fddZ	ddejdejd	ejdB dejfd
dZ  ZS )CosmosAdaLayerNormr>   hidden_featuresr   Nc                    sX   t    || _t | _tj|ddd| _tj||dd| _	tj|d| dd| _
d S )NFrN   rQ   rP   r   r   )r   r   rI   r   rA   rB   	LayerNormrT   r   r@   rC   r   r>   r\   r   r    r!   r   U   s   

zCosmosAdaLayerNorm.__init__r"   rY   rX   c                 C   s   |  |}| |}| |}|d ur ||dd d| j f  }|jddd\}}| |}|jdkr>dd ||fD \}}|d|  | }|S )N.r   dimc                 s       | ]}| d V  qdS r   N	unsqueeze.0xr    r    r!   	<genexpr>l       z-CosmosAdaLayerNorm.forward.<locals>.<genexpr>r   )rB   r@   rC   rI   chunkrT   ndim)r   r"   rY   rX   shiftscaler    r    r!   r3   ^   s   




zCosmosAdaLayerNorm.forwardrE   rG   r    r    r   r!   r[   T   s    
r[   c                	       sZ   e Zd ZddededB ddf fddZ	ddejdejd	ejdB dejfd
dZ  ZS )CosmosAdaLayerNormZeroNr>   r\   r   c                    sf   t    tj|ddd| _t | _|d u rt | _n	tj	||dd| _tj	|d| dd| _
d S )NFrN   r]   r   r   )r   r   r   r^   rT   rA   rB   Identityr@   r   rC   r_   r   r    r!   r   s   s   

zCosmosAdaLayerNormZero.__init__r"   rY   rX   c                 C   s   |  |}| |}| |}|d ur|| }|jddd\}}}| |}|jdkr8dd |||fD \}}}|d|  | }||fS )Nr   r`   ra   r   c                 s   rc   rd   re   rg   r    r    r!   rj      rk   z1CosmosAdaLayerNormZero.forward.<locals>.<genexpr>r   )rB   r@   rC   rl   rT   rm   )r   r"   rY   rX   rn   ro   gater    r    r!   r3      s   




zCosmosAdaLayerNormZero.forwardrE   rG   r    r    r   r!   rp   r   s     rp   c                   @   sT   e Zd Zdd Z			ddedejdejdB dejdB dejdB d	ejfd
dZdS )CosmosAttnProcessor2_0c                 C      t tjjdstdd S )Nscaled_dot_product_attentionzVCosmosAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.hasattrr:   r   
functionalImportErrorr   r    r    r!   r         zCosmosAttnProcessor2_0.__init__Nattnr"   encoder_hidden_statesattention_maskimage_rotary_embr   c                 C   s  |d u r|}| |}||}||}|d|jdfdd}|d|jdfdd}|d|jdfdd}||}||}|d ur`ddlm	}	 |	||ddd}|	||ddd}t
j rt
j|d|jd	}
t
j|d|jd	}t
j|d|jd	}n|d}
|d}|d}|j|
| dd
}|j|
| dd
}t|dd|dd|dd|ddd}|dd|}|jd |}|jd |}|S )Nr   r`   r   apply_rotary_embTuse_realuse_real_unbind_dimr   devicera   rK   F	attn_mask	dropout_p	is_causalr   )to_qto_kto_v	unflattenheads	transposenorm_qnorm_k
embeddingsr   r:   onnxis_in_onnx_exporttensorsizer   repeat_interleaver	   r*   rV   to_out)r   r|   r"   r}   r~   r   querykeyvaluer   	query_idxkey_idx	value_idxr    r    r!   __call__   sF   	











zCosmosAttnProcessor2_0.__call__)NNN)r4   r5   r6   r   r
   r:   r;   r   r    r    r    r!   rs      s$    rs   c                   @   sR   e Zd Zdd Z	ddedejdeejejf deejejf dejf
d	d
ZdS )CosmosAttnProcessor2_5c                 C   rt   )Nru   zTCosmosAttnProcessor2_5 requires PyTorch 2.0. Please upgrade PyTorch to 2.0 or newer.rv   rz   r    r    r!   r      r{   zCosmosAttnProcessor2_5.__init__Nr|   r"   r}   r~   r   c                 C   s  t |ts	td|r|nd\}}|r|nd\}}	|d u r|}||}
||}||}|
d|jdfdd}
|d|jdfdd}|d|jdfdd}|	|
}
|
|}|d uryddlm} ||
|ddd	}
|||ddd	}tj rtj|
d
|
jd}tj|d
|jd}tj|d
|jd}n|
d
}|d
}|d
}|j|| d
d}|j|| d
d}t|
dd|dd|dd|ddd}|dd
|
}|d uru||}||}||}|jd }|j|j }||d|j|dd}||d|j|dd}||d|j|dd}||}||}|d
}|d
}|d
}|j|| d
d}|j|| d
d}t|dd|dd|dd|	ddd}|dd
|}|| }n|}|jd |}|jd |}|S )NzDExpected encoder_hidden_states as (text_context, img_context) tuple.)NNr   r`   r   r   Tr   r   r   r   ra   rK   Fr   r   ) 
isinstancer8   
ValueErrorr   r   r   r   r   r   r   r   r   r   r:   r   r   r   r   r   r   r	   r*   rV   q_imgk_imgv_imgr'   out_dimview
q_img_norm
k_img_normr   )r   r|   r"   r}   r~   r   text_contextimg_context	text_maskimg_maskr   r   r   r   r   r   r   attn_outr   r   r   r+   dim_head	q_img_idx	k_img_idx	v_img_idximg_outr    r    r!   r      s   


























zCosmosAttnProcessor2_5.__call__rE   )	r4   r5   r6   r   r
   r:   r;   r8   r   r    r    r    r!   r      s    
r   c                	       sT   e Zd Z fddZ	d
dejdeejejf dejdB dejf fdd	Z  ZS )CosmosAttentionc                    s   t  j|i | | j| jj | j }tj| j|dd| _tj| j|dd| _	tj| j|dd| _
t| jj| j ddd| _t| jj| j ddd| _d S )NFr   rN   TrO   )r   r   r   r   r?   r   r   	query_dimr   r   r   r   r   r   r   )r   argskwargs	inner_dimr   r    r!   r   ;  s   zCosmosAttention.__init__Nr"   r}   r~   r   c                    s   t  jd|||d|S )N)r"   r}   r~   r    )r   r3   )r   r"   r}   r~   cross_attention_kwargsr   r    r!   r3   F  s   zCosmosAttention.forwardrE   )	r4   r5   r6   r   r:   r;   r8   r3   r<   r    r    r   r!   r   :  s    r   c                       s   e Zd Z							ddedededed	ed
edededededdf fddZ							d dej	dej	dB e
ej	dB ej	dB f B dej	dej	dB dej	dB dej	dB dej	dB dej	dB dej	dB dedB dej	e
ej	ej	f B fddZ  ZS )!CosmosTransformerBlock      @   rms_normFnum_attention_headsattention_head_dimcross_attention_dim	mlp_ratioadaln_lora_dimqk_normout_biasr   before_proj
after_projr   Nc              
      s   t    || }t||d| _|| _t|d |||d|t d| _t||d| _|r9t	|||||d|t
 d| _nt|||||d|t d| _t||d| _t||d|d| _d | _d | _|	rft||| _|
rqt||| _d S d S )N)r>   r\   T)r   r   r   r   r   rQ   r   	processorgelu)multactivation_fnr   )r   r   rp   norm1r   r
   rs   attn1norm2r   r   attn2norm3r   ffr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   hidden_sizer   r    r!   r   W  sZ   

zCosmosTransformerBlock.__init__r"   r}   rY   rX   r   extra_pos_embr~   controlnet_residuallatents	block_idxc                 C   s   | j d ur|  ||	 }|d ur|| }| |||\}}| j||d}|||  }| |||\}}| j|||d}|||  }| |||\}}| |}|||  }|d urd| jd u s`J ||7 }| jd urx|d u soJ | |}||fS |S )N)r   )r}   r~   )r   r   r   r   r   r   r   r   )r   r"   r}   rY   rX   r   r   r~   r   r   r   norm_hidden_statesrr   attn_output	ff_outpuths_projr    r    r!   r3     s.   



zCosmosTransformerBlock.forward)r   r   r   FFFF)NNNNNNN)r4   r5   r6   r7   floatstrr9   r   r:   r;   r8   r3   r<   r    r    r   r!   r   V  sz    	
F 	
r   c                       s   e Zd Z				ddedeeeef deeeef ded	eeeef d
df fddZddejdedB d
eejejf fddZ	  Z
S )CosmosRotaryPosEmbed      r   r   r   r      g       @      ?r   r   max_sizer   base_fps
rope_scaler   Nc                    s   t    dd t||D | _|| _|| _|d d | _|d d | _|| j | j | _|d | j| jd   | _	|d | j| jd   | _
|d | j| jd   | _d S )Nc                 S      g | ]\}}|| qS r    r    rh   r   patchr    r    r!   
<listcomp>      z1CosmosRotaryPosEmbed.__init__.<locals>.<listcomp>r$   r   r   r   )r   r   zipr   r   r   dim_hdim_wdim_th_ntk_factorw_ntk_factort_ntk_factor)r   r   r   r   r   r   r   r    r!   r     s   
zCosmosRotaryPosEmbed.__init__r"   fpsc                 C   sL  |j \}}}}}|| jd  || jd  || jd  g}|j}	d| j }
d| j }d| j }tjt| j	|	tj
d}tjd| jd|	tj
dd | jd  | j }tjd| jd|	tj
dd | jd  | j }tjd| jd|	tj
dd | jd  | j }d|
|  }d||  }d||  }t|d |d  |d d d d d d f |d d|d d}t|d |d  |d d d d d d f |d |d dd}|d u rt|d |d  |}nt|d |d  | | j |}|d d d d d d f d|d |d d}tj|||gd dddd }t|}t|}||fS )	Nr   r   r   g     @)r   dtyper   r`   ra   )r'   r   r   r   r   r   r:   arangemaxr   float32r   r   r   outerrepeatr   catr*   r   cossin)r   r"   r   r+   r,   r-   r.   r/   pe_sizer   h_thetaw_thetat_thetaseqdim_h_rangedim_w_rangedim_t_rangeh_spatial_freqsw_spatial_freqstemporal_freqsemb_hemb_wemb_tfreqsr   r   r    r    r!   r3     s4   (


***@@",$

zCosmosRotaryPosEmbed.forward)r   r   r   r   rE   r4   r5   r6   r7   r8   r   r   r:   r;   r3   r<   r    r    r   r!   r     s&    2r   c                       s`   e Zd Z	ddedeeeef deeeef deddf
 fdd	Zd
ejdejfddZ	  Z
S )CosmosLearnablePositionalEmbedrN   r   r   r   rP   r   Nc                    s~   t    dd t||D | _|| _|| _tt	| jd || _
tt	| jd || _tt	| jd || _d S )Nc                 S   r   r    r    r   r    r    r!   r     r   z;CosmosLearnablePositionalEmbed.__init__.<locals>.<listcomp>r   r   r   )r   r   r   r   r   rP   r   	Parameterr:   zeros	pos_emb_t	pos_emb_h	pos_emb_w)r   r   r   r   rP   r   r    r!   r   
  s   
z'CosmosLearnablePositionalEmbed.__init__r"   c                 C   sR  |j \}}}}}|| jd  || jd  || jd  g}| jd |d  d d d d d d d f |d|d |d d}| jd |d  d d d d d d d f ||d d|d d}	| jd |d  d d d d d d d f ||d |d dd}
||	 |
 }|dd}tjj	|ddtj
d}tj| j|t| |  d}|| |S )	Nr   r   r   r   r`   T)rb   keepdimr   )alpha)r'   r   r  r   r  r  r*   r:   linalgvector_normr   addrP   npsqrtnumelrV   )r   r"   r+   r,   r-   r.   r/   r   r  r
  r  rF   rT   r    r    r!   r3     s   (>>>$z&CosmosLearnablePositionalEmbed.forward)rN   r  r    r    r   r!   r  	  s    r  c                ,       sV  e Zd ZdZdZg dZdgZdgZe							
														d6de	de	de	de	de	de
de	de	dee	e	e	f dee	e	e	f dee
e
e
f ded edB d!ed"e	d#e	d$e	dB d%e	dB d&e	d'e	d(df* fd)d*Z						d7d+ejd,ejd-ejd.eej dB d/ejdB d0e	dB d1ejdB d2ejdB d3ed(eej eB fd4d5Z  ZS )8CosmosTransformer3DModela
  
    A Transformer model for video-like data used in [Cosmos](https://github.com/NVIDIA/Cosmos).

    Args:
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output.
        num_attention_heads (`int`, defaults to `32`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each attention head.
        num_layers (`int`, defaults to `28`):
            The number of layers of transformer blocks to use.
        mlp_ratio (`float`, defaults to `4.0`):
            The ratio of the hidden layer size to the input size in the feedforward network.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        adaln_lora_dim (`int`, defaults to `256`):
            The hidden dimension of the Adaptive LayerNorm LoRA layer.
        max_size (`tuple[int, int, int]`, defaults to `(128, 240, 240)`):
            The maximum size of the input latent tensors in the temporal, height, and width dimensions.
        patch_size (`tuple[int, int, int]`, defaults to `(1, 2, 2)`):
            The patch size to use for patchifying the input latent tensors in the temporal, height, and width
            dimensions.
        rope_scale (`tuple[float, float, float]`, defaults to `(2.0, 1.0, 1.0)`):
            The scaling factor to use for RoPE in the temporal, height, and width dimensions.
        concat_padding_mask (`bool`, defaults to `True`):
            Whether to concatenate the padding mask to the input latent tensors.
        extra_pos_embed_type (`str`, *optional*, defaults to `learnable`):
            The type of extra positional embeddings to use. Can be one of `None` or `learnable`.
        controlnet_block_every_n (`int`, *optional*):
            Interval between transformer blocks that should receive control residuals (for example, `7` to inject after
            every seventh block). Required for Cosmos Transfer2.5.
        img_context_dim_in (`int`, *optional*):
            The dimension of the input image context feature vector, i.e. it is the D in [B, N, D].
        img_context_num_tokens (`int`):
            The number of tokens in the image context feature vector, i.e. it is the N in [B, N, D]. If
            `img_context_dim_in` is not provided, then this parameter is ignored.
        img_context_dim_out (`int`):
            The output dimension of the image context projection layer. If `img_context_dim_in` is not provided, then
            this parameter is ignored.
    T)patch_embedfinal_layerrT   r   learnable_pos_embed       r      r      r   r   r   r   	learnableFN   r   r   r   r   
num_layersr   text_embed_dimr   r   r   r   concat_padding_maskextra_pos_embed_typeuse_crossattn_projectioncrossattn_proj_in_channelsencoder_hidden_states_channelscontrolnet_block_every_nimg_context_dim_inimg_context_num_tokensimg_context_dim_outr   c                    s0  t     }|r|d n|}t|||
dd_t|	|
|d_d _|dkr2t||	|
d_t||_	t
 fddt|D _t| _t
j||
d	 |
d  |
d
  | dd_jjrzt
t
j||ddt
 _d_jjrt
t
jjjjjddt
 _d S d S )Nr   Fr   )r   r   r   r   r%  )r   r   r   c                    s8   g | ]}t  d djjduojjdkdqS )r   FNr   )r   r   r   r   r   r   r   r   )r   configr/  )rh   _r   r   r   r   r   r(  r    r!   r     s    z5CosmosTransformer3DModel.__init__.<locals>.<listcomp>r   r   T)r   r   r   r  r   roper   r  rH   
time_embedr   
ModuleListrangetransformer_blocksr[   norm_outr   proj_outr2  r+  
SequentialGELUcrossattn_projgradient_checkpointingr/  r1  img_context_proj)r   r   r   r   r   r'  r   r(  r   r   r   r   r)  r*  r+  r,  r-  r.  r/  r0  r1  r   patch_embed_in_channelsr   r4  r!   r   \  sH   

z!CosmosTransformer3DModel.__init__r"   rU   r}   block_controlnet_hidden_statesr~   r   condition_maskpadding_maskreturn_dictc
                    s  |j \ }
}}}|d urtj||gdd}| jjr>tjj|t|j dd  tj	j
d}tj||d d|ddgdd}|d urJ|dd}| j||d}| jjrZ| |nd }| jj\}}}|| || || | |}|dd}|jdkr| ||\}}n<|jdkr|j  d|ddfksJ d	|j  | }| ||\}} fd
d||fD \}}ntd|j  t|tr|n|d f\}}| jjr| |}|d ur| jjr| |}t|tr||fn|}i }d urt| j}fddtttd|| jj D }t| jD ]-\}}|!|}t" r7| j#r7| $|||||||||	}q|||||||||}q| %|||}| &|}|'d|||df}|'df}|(dddddddd}|dddddd}|	s|fS t)|dS )Nr   ra   r   )interpolationr   )r   r   r%   z9Expected timestep to have shape [B, 1, T, 1, 1], but got c                 3   s8    | ]}|  d d ddddd dV  qdS )r   r`   r   N)r   expandr*   rg   )r+   post_patch_heightpost_patch_num_framespost_patch_widthr    r!   rj     s    
z3CosmosTransformer3DModel.forward.<locals>.<genexpr>z@Expected timestep to have shape [B, 1, T, 1, 1] or [T], but got c                    s   i | ]	\}}| | qS r    r    )rh   idxr   )rB  r    r!   
<dictcomp>  s    z4CosmosTransformer3DModel.forward.<locals>.<dictcomp>r   r`   r&   r$   r#   )sample)*r'   r:   r   r2  r)  r   rx   resizelistInterpolationModeNEARESTrf   r   r5  r*  r   r   r  r*   rm   r6  r   r   r8   r+  r>  r/  r@  lenr9  	enumerater8  r.  getis_grad_enabledr?  _gradient_checkpointing_funcr:  r;  r   r)   r   )r   r"   rU   r}   rB  r~   r   rC  rD  rE  r,   r-   r.   r/   padding_mask_resizedr   r   r0   r1   r2   rX   rY   r   r   processed_encoder_hidden_statescontrolnet_block_index_mapn_blocksr   blockr   r    )r+   rB  rH  rI  rJ  r!   r3     s   











z CosmosTransformer3DModel.forward)r!  r!  r"  r   r#  r   r$  r   r   r   r   Tr%  Fr$  r$  NNr   r&  )NNNNNT)r4   r5   r6   __doc__ _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_keep_in_fp32_modulesr   r7   r   r8   r9   r   r   r:   r;   rO  r   r3   r<   r    r    r   r!   r  *  s    ,	
X	
r  ))numpyr  r:   torch.nnr   configuration_utilsr   r   loadersr   utilsr   	attentionr   attention_dispatchr	   attention_processorr
   r   r   modeling_outputsr   modeling_utilsr   normalizationr   torchvisionr   Moduler   r=   rH   r[   rp   rs   r   r   r   r   r  r  r    r    r    r!   <module>   s6   %@cs@!