o
    Gis                  	   @   s  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ ddl	m
Z
mZ ddlmZmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* e+e,Z-dddej.dej.fddZ/dddej.fddZ0G dd dZ1G dd dZ2G dd dejj3eZ4G dd  d ejj3Z5G d!d" d"ej3Z6G d#d$ d$ej3Z7eG d%d& d&ej3Z8G d'd( d(e(e
eeeeZ9dS ))    N)Any   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)apply_lora_scale	deprecatelogging)maybe_allow_in_graph   )ContextParallelInputContextParallelOutput)AttentionMixinAttentionModuleMixinFeedForward)dispatch_attention_fn)
CacheMixin)PixArtAlphaTextProjectionTimestepEmbedding	Timestepsget_1d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)FP32LayerNormattnWanAttentionhidden_statesencoder_hidden_statesc                 C   s   |d u r|}| j r,| js| |jddd\}}}n!| |}| |jddd\}}n| |}| |}| |}|||fS )Nr   dimr   )fused_projectionsis_cross_attentionto_qkvchunkto_qto_kvto_kto_v)r   r   r   querykeyvalue r-   a/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_wan.py_get_qkv_projections'   s   




r/   encoder_hidden_states_imgc                 C   sB   | j r| |jddd\}}||fS | |}| |}||fS )Nr   r   r    )r"   to_added_kvr%   
add_k_proj
add_v_proj)r   r0   key_img	value_imgr-   r-   r.   _get_added_kv_projections;   s   

r6   c                   @   sf   e Zd ZdZdZdd Z			ddddejdejdB dejdB d	eejejf dB d
ejfddZ	dS )WanAttnProcessorNc                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzbWanAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher.)hasattrFImportErrorselfr-   r-   r.   __init__H   s
   
zWanAttnProcessor.__init__r   r   r   r   attention_mask
rotary_embreturnc              
   C   s  d }|j d ur"|jd d }|d d d |f }|d d |d f }t|||\}}	}
||}||	}	|d|jdf}|	d|jdf}	|
d|jdf}
|d urrdtjdtjdtjfdd	}||g|R  }||	g|R  }	d }|d urt	||\}}|
|}|d|jdf}|d|jdf}t|||d d
d| jd d}|dd}||}t||	|
|d
d| j|d u r| jnd d}|dd}||}|d ur|| }|jd |}|jd |}|S )N   i   r   r   r   	freqs_cos	freqs_sinc                 S   s   |  ddd\}}|ddd df }|ddd df }t| }|| ||  |ddd df< || ||  |ddd df< || S )Nr   )r   r   .r   r   rB   )	unflattenunbindtorch
empty_liketype_as)r   rC   rD   x1x2cossinoutr-   r-   r.   apply_rotary_embh   s   

z3WanAttnProcessor.__call__.<locals>.apply_rotary_emb        F)	attn_mask	dropout_p	is_causalbackendparallel_configr   r   )r2   shaper/   norm_qnorm_krE   headsrG   Tensorr6   norm_added_kr   _attention_backendflattenrI   _parallel_configto_out)r=   r   r   r   r?   r@   r0   image_context_lengthr*   r+   r,   rO   hidden_states_imgr4   r5   r-   r-   r.   __call__N   sp   






zWanAttnProcessor.__call__NNN)
__name__
__module____qualname__r\   r^   r>   rG   rZ   tuplerb   r-   r-   r-   r.   r7   D   s(    
r7   c                   @   s   e Zd Zdd ZdS )WanAttnProcessor2_0c                 O   s"   d}t dd|dd t|i |S )NzzThe WanAttnProcessor2_0 class is deprecated and will be removed in a future version. Please use WanAttnProcessor instead. rh   z1.0.0F)standard_warn)r	   r7   )clsargskwargsdeprecation_messager-   r-   r.   __new__   s   zWanAttnProcessor2_0.__new__N)rd   re   rf   rn   r-   r-   r-   r.   rh      s    rh   c                       s   e Zd ZeZegZ								ddededed	ed
ededB dedB f fddZdd Z	e
 dd Z			dde
jde
jdB de
jdB dee
je
jf dB de
jf
ddZ  ZS )r      @   h㈵>rP   Nr!   rY   dim_headepsdropoutadded_kv_proj_dimcross_attention_dim_headc
           
         sX  t    || | _|| _|| _|| _|d u r| jn|| | _tjj	|| jdd| _
tjj	|| jdd| _tjj	|| jdd| _tjtjj	| j|ddtj|g| _tjj|| |dd| _tjj|| |dd| _d  | _| _|d urtjj	|| jdd| _tjj	|| jdd| _tjj|| |d| _|	d ur|	| _n|d u| _| | d S )NTbias)rs   elementwise_affine)rs   )superr>   	inner_dimrY   ru   rv   kv_inner_dimrG   nnLinearr&   r(   r)   
ModuleListDropoutr_   RMSNormrW   rX   r2   r3   r[   r#   set_processor)
r=   r!   rY   rr   rs   rt   ru   rv   	processorr#   	__class__r-   r.   r>      s2   



zWanAttention.__init__c                 C   s  t | ddrd S | js]t| jjj| jjj| jjjg}t| jj	j| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sKw   Y  | jj||dddd nIt| jjj| jjjg}t| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sw   Y  | jj||dddd | jd urt| jjj| jjjg}t| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sw   Y  | jj||dddd d| _d S )Nr"   FmetaTrw   )weightrx   )strictassign)getattrr#   rG   catr&   r   datar(   r)   rx   rV   devicer}   r~   r$   load_state_dictr'   ru   r2   r3   r1   r"   )r=   concatenated_weightsconcatenated_biasout_featuresin_featuresr-   r-   r.   fuse_projections   s@   ""




zWanAttention.fuse_projectionsc                 C   sV   t | ddsd S t| drt| d t| drt| d t| dr&t| d d| _d S )Nr"   Fr$   r'   r1   )r   r9   delattrr"   r<   r-   r-   r.   unfuse_projections  s   






zWanAttention.unfuse_projectionsr   r   r?   r@   rA   c                 K   s   | j | ||||fi |S N)r   )r=   r   r   r?   r@   rl   r-   r-   r.   forward  s   zWanAttention.forward)ro   rp   rq   rP   NNNNrc   )rd   re   rf   r7   _default_processor_cls_available_processorsintfloatr>   r   rG   no_gradr   rZ   rg   r   __classcell__r-   r-   r   r.   r      sV    -#
c                       s>   e Zd Zd
dedef fddZdejdejfdd	Z  ZS )WanImageEmbeddingNr   r   c                    s\   t    t|| _t||ddd| _t|| _|d ur)tt	
d||| _d S d | _d S )NrB   gelu)multactivation_fn)rz   r>   r   norm1r   ffnorm2r}   	ParameterrG   zeros	pos_embed)r=   r   r   pos_embed_seq_lenr   r-   r.   r>     s   



zWanImageEmbedding.__init__encoder_hidden_states_imagerA   c                 C   sT   | j d ur|j\}}}|dd| |}|| j  }| |}| |}| |}|S )Nr   r   )r   rV   viewr   r   r   )r=   r   
batch_sizeseq_len	embed_dimr   r-   r-   r.   r   (  s   




zWanImageEmbedding.forwardr   	rd   re   rf   r   r>   rG   rZ   r   r   r-   r-   r   r.   r     s    r   c                       sr   e Zd Z		ddedededededB dedB f fdd	Z		dd
ejdejdejdB dedB fddZ  ZS )WanTimeTextImageEmbeddingNr!   time_freq_dimtime_proj_dimtext_embed_dimimage_embed_dimr   c                    sv   t    t|ddd| _t||d| _t | _t	||| _
t||dd| _d | _|d ur9t|||d| _d S d S )NTr   )num_channelsflip_sin_to_cosdownscale_freq_shift)in_channelstime_embed_dim	gelu_tanh)act_fn)r   )rz   r>   r   timesteps_projr   time_embedderr}   SiLUr   r~   	time_projr   text_embedderimage_embedderr   )r=   r!   r   r   r   r   r   r   r-   r.   r>   5  s   
	
z"WanTimeTextImageEmbedding.__init__timestepr   r   timestep_seq_lenc                 C   s   |  |}|d ur|dd|f}tt| j j}|j|kr*|tjkr*|	|}| |
|}| | |}| |}|d urH| |}||||fS )Nr   r   )r   rE   nextiterr   
parametersdtyperG   int8torI   r   r   r   r   )r=   r   r   r   r   time_embedder_dtypetembtimestep_projr-   r-   r.   r   J  s   



z!WanTimeTextImageEmbedding.forward)NNr   r-   r-   r   r.   r   4  s6    r   c                	       sR   e Zd Z	ddedeeeef dedef fddZdejd	ejfd
dZ	  Z
S )WanRotaryPosEmbed     @attention_head_dim
patch_sizemax_seq_lenthetac              	      s   t    || _|| _|| _d|d   }}|| | }|| _|| _|| _tj	j
 r.tjntj}g }	g }
|||fD ]}t|||dd|d\}}|	| |
| q:| jdtj|	dddd	 | jd
tj|
dddd	 d S )Nr      T)use_realrepeat_interleave_realfreqs_dtyperC   rB   r    F)
persistentrD   )rz   r>   r   r   r   t_dimh_dimw_dimrG   backendsmpsis_availablefloat32float64r   appendregister_bufferr   )r=   r   r   r   r   r   r   r   r   rC   rD   r!   freq_cosfreq_sinr   r-   r.   r>   c  s2   


zWanRotaryPosEmbed.__init__r   rA   c                 C   s  |j \}}}}}| j\}}}	|| || ||	 }
}}| j| j| jg}| jj|dd}| jj|dd}|d d |
 |
ddd	|
||d}|d d | d|dd	|
||d}|d d | dd|d	|
||d}|d d |
 |
ddd	|
||d}|d d | d|dd	|
||d}|d d | dd|d	|
||d}t
j|||gddd|
| | dd}t
j|||gddd|
| | dd}||fS )NrB   r    r   r   r   )rV   r   r   r   r   rC   splitrD   r   expandrG   r   reshape)r=   r   r   r   
num_framesheightwidthp_tp_hp_wppfpphppwsplit_sizesrC   rD   freqs_cos_ffreqs_cos_hfreqs_cos_wfreqs_sin_ffreqs_sin_hfreqs_sin_wr-   r-   r.   r     s   ((((((((zWanRotaryPosEmbed.forward)r   )rd   re   rf   r   rg   r   r>   rG   rZ   r   r   r-   r-   r   r.   r   b  s    (r   c                       sp   e Zd Z				ddedededed	ed
ededB f fddZdej	dej	dej	dej	dej	f
ddZ
  ZS )WanTransformerBlockrms_norm_across_headsFư>Nr!   ffn_dim	num_headsqk_normcross_attn_normrs   ru   c              	      s   t    t||dd| _t|||| |d t d| _t|||| |||| t d| _|r5t||ddnt	 | _
t||dd| _t||dd| _ttdd	||d
  | _d S )NFry   )r!   rY   rr   rs   rv   r   )r!   rY   rr   rs   ru   rv   r   Tzgelu-approximate)r{   r   rB   r         ?)rz   r>   r   r   r   r7   attn1attn2r}   Identityr   r   ffnnorm3r   rG   randnscale_shift_table)r=   r!   r   r   r   r   rs   ru   r   r-   r.   r>     s.   


	"zWanTransformerBlock.__init__r   r   r   r@   rA   c                 C   sP  |j dkr9| jd|  jddd\}}}}}	}
|d}|d}|d}|d}|	d}	|
d}
n| j|  jddd\}}}}}	}
| | d|  | |}| |d d |}| ||  |}| 	| |}| 
||d d }|| }| | d|	  | |}| |}| | |
  |}|S )N   r   r   r   r    rB   )ndimr  	unsqueezer   r%   squeezer   rI   r   r   r   r  r  )r=   r   r   r   r@   	shift_msa	scale_msagate_msac_shift_msac_scale_msa
c_gate_msanorm_hidden_statesattn_output	ff_outputr-   r-   r.   r     s2   





 
zWanTransformerBlock.forward)r   Fr   N)rd   re   rf   r   strboolr   r>   rG   rZ   r   r   r-   r-   r   r.   r     s>    )r   c                $       st  e Zd ZdZdZg dZdgZg dZdgZdgZ	e
dddd	e
dddd	d
de
dddd	iedddde
dddd	idZe																d7deedf ded ed!ed"ed#ed$ed%ed&ed'ed(edB d)ed*edB d+edB d,ed-edB d.df" fd/d0Zed1			d8dejdejd2ejd3ejdB d4ed1eeef dB d.ejeeejf B fd5d6Z  ZS )9WanTransformer3DModela  
    A Transformer model for video-like data used in the Wan model.

    Args:
        patch_size (`tuple[int]`, defaults to `(1, 2, 2)`):
            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
        num_attention_heads (`int`, defaults to `40`):
            Fixed length for text embeddings.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each head.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output.
        text_dim (`int`, defaults to `512`):
            Input dimension for text embeddings.
        freq_dim (`int`, defaults to `256`):
            Dimension for sinusoidal time embeddings.
        ffn_dim (`int`, defaults to `13824`):
            Intermediate dimension in feed-forward network.
        num_layers (`int`, defaults to `40`):
            The number of layers of transformer blocks to use.
        window_size (`tuple[int]`, defaults to `(-1, -1)`):
            Window size for local attention (-1 indicates global attention).
        cross_attn_norm (`bool`, defaults to `True`):
            Enable cross-attention normalization.
        qk_norm (`bool`, defaults to `True`):
            Enable query/key normalization.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        add_img_emb (`bool`, defaults to `False`):
            Whether to use img_emb.
        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
            The number of channels to use for the added key and value projections. If `None`, no projection is used.
    T)patch_embeddingcondition_embeddernormr   )r   r  r   r   r  norm_added_qrB   r  )	split_dimexpected_dimssplit_output)r   rB   r   r   F)
gather_dimr  r   r   )ropezblocks.0proj_out rB   r   r   (                6  r   r   N   r   .num_attention_headsr   r   out_channelstext_dimfreq_dimr   
num_layersr   r   rs   	image_dimru   rope_max_seq_lenr   rA   c              	      s   t    | |p|}t|||| _tj|||d| _t|d |||d| _t	 fddt
|	D | _tdd| _t|t| | _ttdd	d
  | _d| _d S )N)kernel_sizestrider   )r!   r   r   r   r   r   c              
      s    g | ]}t  qS r-   )r   ).0_ru   r   rs   r   r{   r)  r   r-   r.   
<listcomp>e  s    z2WanTransformer3DModel.__init__.<locals>.<listcomp>Fr   rB   r   r   )rz   r>   r   r  r}   Conv3dr  r   r  r   rangeblocksr   norm_outr~   mathprodr  r   rG   r  r  gradient_checkpointing)r=   r   r)  r   r   r*  r+  r,  r   r-  r   r   rs   r.  ru   r/  r   r   r4  r.   r>   ;  s,   



zWanTransformer3DModel.__init__attention_kwargsr   r   return_dictc              
   C   s2  |j \}}}	}
}| jj\}}}|	| }|
| }|| }| |}| |}|ddd}|jdkr=|j d }| }nd }| j||||d\}}}}|d urW|	dd}n|	dd}|d urjt
j||gdd}t
 r| jr| jD ]}| |||||}qtn| jD ]	}|||||}q|jdkr| jd|j|d jddd\}}|d}|d}n| j|j|d jddd\}}||j}||j}| | d|  | |}| |}||||||||d}|dd	dd
dddd}|dd	d
ddd}|s|fS t|dS )Nr   rB   )r   )r   r   r    r   r   r      r     r   )sample)rV   configr   r  r  r]   	transposer  r  rE   rG   concatis_grad_enabledr<  r8  _gradient_checkpointing_funcr  r  r   r   r%   r	  r9  r   rI   r  r   permuter   )r=   r   r   r   r   r>  r=  r   r   r   r   r   r   r   r   post_patch_num_framespost_patch_heightpost_patch_widthr@   
ts_seq_lenr   r   blockshiftscaleoutputr-   r-   r.   r   t  sZ   









,
& 

zWanTransformer3DModel.forward)r!  r"  r#  r$  r$  r%  r&  r'  r"  Tr   r   NNr(  N)NTN)rd   re   rf   __doc__ _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_keep_in_fp32_modules"_keys_to_ignore_on_load_unexpected_repeated_blocksr   r   _cp_planr   rg   r   r  r  r   r>   r   rG   rZ   
LongTensordictr   r   r   r-   r-   r   r.   r    s    $

	
8r  ):r:  typingr   rG   torch.nnr}   torch.nn.functional
functionalr:   configuration_utilsr   r   loadersr   r   utilsr   r	   r
   utils.torch_utilsr   _modeling_parallelr   r   	attentionr   r   r   attention_dispatchr   cache_utilsr   
embeddingsr   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   
get_loggerrd   loggerrZ   r/   r6   r7   rh   Moduler   r   r   r   r   r  r-   r-   r-   r.   <module>   s>   
	a
m.A
W