o
    Giw                  	   @   s  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ ddl	m
Z
mZ ddlmZmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* e+e,Z-dddej.dej.fddZ/dddej.fddZ0G dd dZ1G dd dZ2G dd dejj3eZ4G dd  d ejj3Z5G d!d" d"ej3Z6G d#d$ d$ej3Z7eG d%d& d&ej3Z8G d'd( d(e(e
eeeeZ9dS ))    N)Any   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)apply_lora_scale	deprecatelogging)maybe_allow_in_graph   )ContextParallelInputContextParallelOutput)AttentionMixinAttentionModuleMixinFeedForward)dispatch_attention_fn)
CacheMixin)PixArtAlphaTextProjectionTimestepEmbedding	Timestepsget_1d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)FP32LayerNormattnWanAttentionhidden_statesencoder_hidden_statesc                 C   s   |d u r|}| j r,| js| |jddd\}}}n!| |}| |jddd\}}n| |}| |}| |}|||fS )Nr   dimr   )fused_projectionsis_cross_attentionto_qkvchunkto_qto_kvto_kto_v)r   r   r   querykeyvalue r-   h/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_chronoedit.py_get_qkv_projections(   s   




r/   encoder_hidden_states_imgc                 C   sB   | j r| |jddd\}}||fS | |}| |}||fS )Nr   r   r    )r"   to_added_kvr%   
add_k_proj
add_v_proj)r   r0   key_img	value_imgr-   r-   r.   _get_added_kv_projections=   s   

r6   c                   @   sf   e Zd ZdZdZdd Z			ddddejdejdB dejdB d	eejejf dB d
ejfddZ	dS )WanAttnProcessorNc                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzbWanAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher.)hasattrFImportErrorselfr-   r-   r.   __init__K   s
   
zWanAttnProcessor.__init__r   r   r   r   attention_mask
rotary_embreturnc              
   C   s  d }|j d ur"|jd d }|d d d |f }|d d |d f }t|||\}}	}
||}||	}	|d|jdf}|	d|jdf}	|
d|jdf}
|d urrdtjdtjdtjfdd	}||g|R  }||	g|R  }	d }|d urt	||\}}|
|}|d|jdf}|d|jdf}t|||d d
d| jd d}|dd}||}t||	|
|d
d| j|d u r| jnd d}|dd}||}|d ur|| }|jd |}|jd |}|S )N   i   r   r   r   	freqs_cos	freqs_sinc                 S   s   |  ddd\}}|ddd df }|ddd df }t| }|| ||  |ddd df< || ||  |ddd df< || S )Nr   )r   r   .r   r   rB   )	unflattenunbindtorch
empty_liketype_as)r   rC   rD   x1x2cossinoutr-   r-   r.   apply_rotary_embk   s   

z3WanAttnProcessor.__call__.<locals>.apply_rotary_emb        F)	attn_mask	dropout_p	is_causalbackendparallel_configr   r   )r2   shaper/   norm_qnorm_krE   headsrG   Tensorr6   norm_added_kr   _attention_backendflattenrI   _parallel_configto_out)r=   r   r   r   r?   r@   r0   image_context_lengthr*   r+   r,   rO   hidden_states_imgr4   r5   r-   r-   r.   __call__Q   sp   






zWanAttnProcessor.__call__NNN)
__name__
__module____qualname__r\   r^   r>   rG   rZ   tuplerb   r-   r-   r-   r.   r7   G   s(    
r7   c                   @   s   e Zd Zdd ZdS )WanAttnProcessor2_0c                 O   s"   d}t dd|dd t|i |S )NzzThe WanAttnProcessor2_0 class is deprecated and will be removed in a future version. Please use WanAttnProcessor instead. rh   z1.0.0F)standard_warn)r	   r7   )clsargskwargsdeprecation_messager-   r-   r.   __new__   s   zWanAttnProcessor2_0.__new__N)rd   re   rf   rn   r-   r-   r-   r.   rh      s    rh   c                       s   e Zd ZeZegZ								ddededed	ed
ededB dedB f fddZdd Z	e
 dd Z			dde
jde
jdB de
jdB dee
je
jf dB de
jf
ddZ  ZS )r      @   h㈵>rP   Nr!   rY   dim_headepsdropoutadded_kv_proj_dimcross_attention_dim_headc
           
         sX  t    || | _|| _|| _|| _|d u r| jn|| | _tjj	|| jdd| _
tjj	|| jdd| _tjj	|| jdd| _tjtjj	| j|ddtj|g| _tjj|| |dd| _tjj|| |dd| _d  | _| _|d urtjj	|| jdd| _tjj	|| jdd| _tjj|| |d| _|	d ur|	| _n|d u| _| | d S )NTbias)rs   elementwise_affine)rs   )superr>   	inner_dimrY   ru   rv   kv_inner_dimrG   nnLinearr&   r(   r)   
ModuleListDropoutr_   RMSNormrW   rX   r2   r3   r[   r#   set_processor)
r=   r!   rY   rr   rs   rt   ru   rv   	processorr#   	__class__r-   r.   r>      s2   



zWanAttention.__init__c                 C   s  t | ddrd S | js]t| jjj| jjj| jjjg}t| jj	j| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sKw   Y  | jj||dddd nIt| jjj| jjjg}t| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sw   Y  | jj||dddd | jd urt| jjj| jjjg}t| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sw   Y  | jj||dddd d| _d S )Nr"   FmetaTrw   )weightrx   )strictassign)getattrr#   rG   catr&   r   datar(   r)   rx   rV   devicer}   r~   r$   load_state_dictr'   ru   r2   r3   r1   r"   )r=   concatenated_weightsconcatenated_biasout_featuresin_featuresr-   r-   r.   fuse_projections   s@   ""




zWanAttention.fuse_projectionsc                 C   sV   t | ddsd S t| drt| d t| drt| d t| dr&t| d d| _d S )Nr"   Fr$   r'   r1   )r   r9   delattrr"   r<   r-   r-   r.   unfuse_projections  s   






zWanAttention.unfuse_projectionsr   r   r?   r@   rA   c                 K   s   | j | ||||fi |S N)r   )r=   r   r   r?   r@   rl   r-   r-   r.   forward  s   zWanAttention.forward)ro   rp   rq   rP   NNNNrc   )rd   re   rf   r7   _default_processor_cls_available_processorsintfloatr>   r   rG   no_gradr   rZ   rg   r   __classcell__r-   r-   r   r.   r      sV    -#
c                       s>   e Zd Zd
dedef fddZdejdejfdd	Z  ZS )WanImageEmbeddingNr   r   c                    s\   t    t|| _t||ddd| _t|| _|d ur)tt	
d||| _d S d | _d S )NrB   gelu)multactivation_fn)rz   r>   r   norm1r   ffnorm2r}   	ParameterrG   zeros	pos_embed)r=   r   r   pos_embed_seq_lenr   r-   r.   r>   #  s   



zWanImageEmbedding.__init__encoder_hidden_states_imagerA   c                 C   sT   | j d ur|j\}}}|dd| |}|| j  }| |}| |}| |}|S )Nr   r   )r   rV   viewr   r   r   )r=   r   
batch_sizeseq_len	embed_dimr   r-   r-   r.   r   .  s   




zWanImageEmbedding.forwardr   	rd   re   rf   r   r>   rG   rZ   r   r   r-   r-   r   r.   r   "  s    r   c                       sr   e Zd Z		ddedededededB dedB f fdd	Z		dd
ejdejdejdB dedB fddZ  ZS )WanTimeTextImageEmbeddingNr!   time_freq_dimtime_proj_dimtext_embed_dimimage_embed_dimr   c                    sv   t    t|ddd| _t||d| _t | _t	||| _
t||dd| _d | _|d ur9t|||d| _d S d S )NTr   )num_channelsflip_sin_to_cosdownscale_freq_shift)in_channelstime_embed_dim	gelu_tanh)act_fn)r   )rz   r>   r   timesteps_projr   time_embedderr}   SiLUr   r~   	time_projr   text_embedderimage_embedderr   )r=   r!   r   r   r   r   r   r   r-   r.   r>   <  s   
	
z"WanTimeTextImageEmbedding.__init__timestepr   r   timestep_seq_lenc                 C   s   |  |}|d ur|dd|f}tt| j j}|j|kr*|tjkr*|	|}| |
|}| | |}| |}|d urH| |}||||fS )Nr   r   )r   rE   nextiterr   
parametersdtyperG   int8torI   r   r   r   r   )r=   r   r   r   r   time_embedder_dtypetembtimestep_projr-   r-   r.   r   Q  s   



z!WanTimeTextImageEmbedding.forward)NNr   r-   r-   r   r.   r   ;  s6    r   c                       sX   e Zd Z		ddedeeeef dededef
 fdd	Zd
ejdejfddZ	  Z
S )ChronoEditRotaryPosEmbed     @ro   attention_head_dim
patch_sizemax_seq_lenthetatemporal_skip_lenc              	      s   t    || _|| _|| _|| _d|d   }}|| | }tjj	 r(tj
ntj}	g }
g }|||fD ]}t|||dd|	d\}}|
| || q4| jdtj|
dddd	 | jd
tj|dddd	 d S )Nr      T)use_realrepeat_interleave_realfreqs_dtyperC   rB   r    F)
persistentrD   )rz   r>   r   r   r   r   rG   backendsmpsis_availablefloat32float64r   appendregister_bufferr   )r=   r   r   r   r   r   h_dimw_dimt_dimr   rC   rD   r!   freq_cosfreq_sinr   r-   r.   r>   j  s.   


z!ChronoEditRotaryPosEmbed.__init__r   rA   c                 C   s>  |j \}}}}}| j\}}}	|| || ||	 }
}}| jd| jd   | jd | jd g}| jj|dd}| jj|dd}|dkr]|d d | j ddg |
ddd|
||d}n|d d |
 |
ddd|
||d}|d d | d|dd|
||d}|d d | dd|d|
||d}|dkr|d d | j ddg |
ddd|
||d}n|d d |
 |
ddd|
||d}|d d | d|dd|
||d}|d d | dd|d|
||d}t	j
|||gddd|
| | dd}t	j
|||gddd|
| | dd}||fS )Nr   r   rB   r    r   r   )rV   r   r   rC   splitrD   r   r   expandrG   r   reshape)r=   r   r   r   
num_framesheightwidthp_tp_hp_wppfpphppwsplit_sizesrC   rD   freqs_cos_ffreqs_cos_hfreqs_cos_wfreqs_sin_ffreqs_sin_hfreqs_sin_wr-   r-   r.   r     s,   4(((4(((((z ChronoEditRotaryPosEmbed.forward)r   ro   )rd   re   rf   r   rg   r   r>   rG   rZ   r   r   r-   r-   r   r.   r   i  s    %r   c                       sp   e Zd Z				ddedededed	ed
ededB f fddZdej	dej	dej	dej	dej	f
ddZ
  ZS )WanTransformerBlockrms_norm_across_headsFư>Nr!   ffn_dim	num_headsqk_normcross_attn_normrs   ru   c              	      s   t    t||dd| _t|||| |d t d| _t|||| |||| t d| _|r5t||ddnt	 | _
t||dd| _t||dd| _ttdd	||d
  | _d S )NFry   )r!   rY   rr   rs   rv   r   )r!   rY   rr   rs   ru   rv   r   Tzgelu-approximate)r{   r   rB   r         ?)rz   r>   r   r   r   r7   attn1attn2r}   Identityr   r   ffnnorm3r   rG   randnscale_shift_table)r=   r!   r   r   r   r   rs   ru   r   r-   r.   r>     s.   


	"zWanTransformerBlock.__init__r   r   r   r@   rA   c                 C   sP  |j dkr9| jd|  jddd\}}}}}	}
|d}|d}|d}|d}|	d}	|
d}
n| j|  jddd\}}}}}	}
| | d|  | |}| |d d |}| ||  |}| 	| |}| 
||d d }|| }| | d|	  | |}| |}| | |
  |}|S )N   r   r   r   r    rB   )ndimr  	unsqueezer   r%   squeezer   rI   r   r   r  r  r  )r=   r   r   r   r@   	shift_msa	scale_msagate_msac_shift_msac_scale_msa
c_gate_msanorm_hidden_statesattn_output	ff_outputr-   r-   r.   r     s2   





 
zWanTransformerBlock.forward)r   Fr   N)rd   re   rf   r   strboolr   r>   rG   rZ   r   r   r-   r-   r   r.   r     s>    )r   c                &       sf  e Zd ZdZdZg dZdgZg dZdgZdgZ	e
dddd	e
dddd	d
de
dddd	ieddddZe																	d7dee dededed ed!ed"ed#ed$ed%ed&edB d'ed(edB d)edB d*ed+edB d,ed-df$ fd.d/Zed0			d8dejd1ejd2ejd3ejdB d4ed0eeef dB d-ejeeejf B fd5d6Z  ZS )9ChronoEditTransformer3DModela  
    A Transformer model for video-like data used in the ChronoEdit model.

    Args:
        patch_size (`tuple[int]`, defaults to `(1, 2, 2)`):
            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
        num_attention_heads (`int`, defaults to `40`):
            Fixed length for text embeddings.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each head.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output.
        text_dim (`int`, defaults to `512`):
            Input dimension for text embeddings.
        freq_dim (`int`, defaults to `256`):
            Dimension for sinusoidal time embeddings.
        ffn_dim (`int`, defaults to `13824`):
            Intermediate dimension in feed-forward network.
        num_layers (`int`, defaults to `40`):
            The number of layers of transformer blocks to use.
        window_size (`tuple[int]`, defaults to `(-1, -1)`):
            Window size for local attention (-1 indicates global attention).
        cross_attn_norm (`bool`, defaults to `True`):
            Enable cross-attention normalization.
        qk_norm (`bool`, defaults to `True`):
            Enable query/key normalization.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        add_img_emb (`bool`, defaults to `False`):
            Whether to use img_emb.
        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
            The number of channels to use for the added key and value projections. If `None`, no projection is used.
    T)patch_embeddingcondition_embeddernormr   )r   r  r   r   r  norm_added_qrB   r  )	split_dimexpected_dimssplit_output)r   rB   r   r   F)
gather_dimr  )ropezblocks.0proj_outrB   r   r   (                6  r   r   N   ro   r   num_attention_headsr   r   out_channelstext_dimfreq_dimr   
num_layersr   r   rs   	image_dimru   rope_max_seq_lenr   rope_temporal_skip_lenrA   c              	      s   t    | |p|}t||||d| _tj|||d| _t|d |||d| _t	 fddt
|	D | _tdd| _t|t| | _ttd	d
d  | _d| _d S )N)r   )kernel_sizestrider   )r!   r   r   r   r   r   c              
      s    g | ]}t  qS r-   )r   ).0_ru   r   rs   r   r{   r)  r   r-   r.   
<listcomp>u  s    z9ChronoEditTransformer3DModel.__init__.<locals>.<listcomp>Fr   rB   r   r   )rz   r>   r   r  r}   Conv3dr  r   r  r   rangeblocksr   norm_outr~   mathprodr   r   rG   r  r  gradient_checkpointing)r=   r   r)  r   r   r*  r+  r,  r   r-  r   r   rs   r.  ru   r/  r   r0  r   r5  r.   r>   H  s0   



z%ChronoEditTransformer3DModel.__init__attention_kwargsr   r   r   return_dictc              
   C   s2  |j \}}}	}
}| jj\}}}|	| }|
| }|| }| |}| |}|ddd}|jdkr=|j d }| }nd }| j||||d\}}}}|d urW|	dd}n|	dd}|d urjt
j||gdd}t
 r| jr| jD ]}| |||||}qtn| jD ]	}|||||}q|jdkr| jd|j|d jddd\}}|d}|d}n| j|j|d jddd\}}||j}||j}| | d|  | |}| |}||||||||d}|dd	dd
dddd}|dd	d
ddd}|s|fS t|dS )Nr   rB   )r   )r   r   r    r   r   r      r     r   )sample)rV   configr   r  r  r]   	transposer  r  rE   rG   concatis_grad_enabledr=  r9  _gradient_checkpointing_funcr  r	  r   r   r%   r
  r:  r   rI   r   r   permuter   )r=   r   r   r   r   r?  r>  r   r   r   r   r   r   r   r   post_patch_num_framespost_patch_heightpost_patch_widthr@   
ts_seq_lenr   r   blockshiftscaleoutputr-   r-   r.   r     sZ   









,
& 

z$ChronoEditTransformer3DModel.forward)r!  r"  r#  r$  r$  r%  r&  r'  r"  Tr   r   NNr(  Nro   )NTN)rd   re   rf   __doc__ _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_keep_in_fp32_modules"_keys_to_ignore_on_load_unexpected_repeated_blocksr   r   _cp_planr   rg   r   r  r  r   r>   r   rG   rZ   
LongTensordictr   r   r   r-   r-   r   r.   r    s    $
	
;r  ):r;  typingr   rG   torch.nnr}   torch.nn.functional
functionalr:   configuration_utilsr   r   loadersr   r   utilsr   r	   r
   utils.torch_utilsr   _modeling_parallelr   r   	attentionr   r   r   attention_dispatchr   cache_utilsr   
embeddingsr   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   
get_loggerrd   loggerrZ   r/   r6   r7   rh   Moduler   r   r   r   r   r  r-   r-   r-   r.   <module>   s>   

bn.H
X