o
    پi                     @   s  d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZ d dlmZmZmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 e7e8Z9e1: Z;G dd dejj<Z=G dd dej<Z>G dd dej<Z?G dd de?Z@G dd de?ZAG dd dej<ZBG d d! d!ej<ZCG d"d# d#e.e5ZDeDZEdS )$    N)	lru_cache)Any)WanVideoConfig)WanTeaCacheParams)divideget_sp_groupget_sp_world_sizeget_tp_world_size"sequence_model_parallel_all_gather)MinimalA2AAttnOpUlyssesAttention_VSAUSPAttention)MulAdd)FP32LayerNormLayerNormScaleShiftRMSNorm ScaleResidualLayerNormScaleShifttensor_parallel_rms_norm)ColumnParallelLinearRowParallelLinear)MLP)NDRotaryEmbedding_apply_rotary_emb apply_flashinfer_rope_qk_inplace)ModulateProjection
PatchEmbedTimestepEmbedder)get_forward_context)CachableDiT)AttentionBackendEnumcurrent_platform)get_global_server_args)OffloadableDiTMixin)init_loggerc                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	WanImageEmbeddingin_featuresout_featuresc                    s4   t    t|| _t|||dd| _t|| _d S )Ngeluact_type)super__init__r   norm1r   ffnorm2)selfr%   r&   	__class__ f/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/dits/wanvideo.pyr+   A   s   

zWanImageEmbedding.__init__encoder_hidden_states_imagereturnc                 C   s.   |j }| |}| |}| ||}|S N)dtyper,   r-   r.   to)r/   r4   r7   hidden_statesr2   r2   r3   forwardH   s
   

zWanImageEmbedding.forward	__name__
__module____qualname__intr+   torchTensorr:   __classcell__r2   r2   r0   r3   r$   ?   s    r$   c                
       sd   e Zd Z	ddededededB f fddZ		ddejd	ejd
ejdB dedB fddZ  ZS )WanTimeTextImageEmbeddingNdimtime_freq_dimtext_embed_dimimage_embed_dimc                    s`   t    t||dd| _t|ddd| _t|||ddd| _d | _|d ur.t	||| _d S d S )Nsilu)frequency_embedding_size	act_layer   )factorrJ   Tgelu_pytorch_tanh)biasr)   )
r*   r+   r   time_embedderr   time_modulationr   text_embedderimage_embedderr$   )r/   rD   rE   rF   rG   r0   r2   r3   r+   R   s   

z"WanTimeTextImageEmbedding.__init__timestepencoder_hidden_statesr4   timestep_seq_lenc                 C   sL   |  ||}| |}| |}|d ur | jd usJ | |}||||fS r6   )rO   rP   rQ   rR   )r/   rS   rT   r4   rU   tembtimestep_projr2   r2   r3   r:   g   s   

z!WanTimeTextImageEmbedding.forwardr6   NNr;   r2   r2   r0   r3   rC   P   s,    rC   c                	       s\   e Zd Z					ddededee dB d	df fd
dZdejdejdefddZ	  Z
S )WanSelfAttentionr[   Tư>FNrD   	num_headssupported_attention_backendsr5   c           	         s   || dksJ t    || _|| _|| | _|| _|| _|| _|| _t	 }t
||dd| _t
||dd| _t
||dd| _t||dd| _|rOt||dnt | _|r\t||dnt | _|dkof|| _t||| _t| j| jdd d|d| _d S )	Nr   Fgather_outputT)input_is_paralleleps   )r]   	head_sizedropout_ratesoftmax_scalecausalr^   )r*   r+   rD   r]   head_dimwindow_sizeqk_normrc   parallel_attentionr	   r   to_qto_kto_vr   to_outr   nnIdentitynorm_qnorm_k
tp_rmsnormr   local_num_headsr   attn)	r/   rD   r]   rj   rk   rc   rl   r^   tp_sizer0   r2   r3   r+   }   s4   


zWanSelfAttention.__init__xcontextcontext_lensc                 C   s   dS )a  
        Args:
            x(Tensor): Shape [B, L, num_heads, C / num_heads]
            seq_lens(Tensor): Shape [B]
            grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        Nr2   )r/   ry   rz   r{   r2   r2   r3   r:      s   zWanSelfAttention.forward)rZ   Tr\   FN)r<   r=   r>   r?   setr   r+   r@   rA   r:   rB   r2   r2   r0   r3   rY   {   s     
	")rY   c                   @   s   e Zd Zdd ZdS )WanT2VCrossAttentionc                 C   s   |  |\}}| jrt|| j}n| |}|d| j| jf}| |\}}| jr1t|| j}n| |}|d| j| jf}| 	|\}}|d| j| jf}| 
|||}|d}| |\}}|S )
        Args:
            x(Tensor): Shape [B, L1, C]
            context(Tensor): Shape [B, L2, C]
            context_lens(Tensor): Shape [B]
           )rm   ru   r   rs   	unflattenrv   ri   rn   rt   ro   rw   flattenrp   )r/   ry   rz   r{   q_kvr2   r2   r3   r:      s    


zWanT2VCrossAttention.forwardN)r<   r=   r>   r:   r2   r2   r2   r3   r}      s    r}   c                	       sH   e Zd Z				ddededee dB ddf fd	d
Zdd Z  ZS )WanI2VCrossAttentionrZ   Tr\   NrD   r]   r^   r5   c                    s\   t  j||||||d t||dd| _t||dd| _|r't||d| _d S t | _d S )N)r^   Fr_   rb   )	r*   r+   r   
add_k_proj
add_v_projr   rq   rr   norm_added_k)r/   rD   r]   rj   rk   rc   r^   r0   r2   r3   r+      s   		$zWanI2VCrossAttention.__init__c                 C   sv  |ddddf }|ddddf }|  |\}}| jr%t|| j}n| |}|d| j| jf}| |\}}| jrEt|| j}n| |}|d| j| jf}| 	|\}}|d| j| jf}| 
|\}	}| jrvt|	| j}	n| |	}	|	d| j| jf}	| |\}
}|
d| j| jf}
| ||	|
}| |||}|d}|d}|| }| |\}}|S )r~   Ni  r   )rm   ru   r   rs   r   rv   ri   rn   rt   ro   r   r   r   rw   r   rp   )r/   ry   rz   r{   context_imgr   r   r   r   k_imgv_imgimg_xr2   r2   r3   r:      s8   




zWanI2VCrossAttention.forward)rZ   Tr\   N)	r<   r=   r>   r?   r|   r   r+   r:   rB   r2   r2   r0   r3   r      s    
r   c                       s   e Zd Z								dded	ed
edededededB dee dB dededef fddZ	de
jde
jde
jdee
je
jf de
jf
ddZ  ZS )WanTransformerBlockrms_norm_across_headsFr\   N original皙?rD   ffn_dimr]   rk   cross_attn_normrc   added_kv_proj_dimr^   prefixattention_typesla_topkc                    s  t    t||dtjd| _t||ddd| _t||ddd| _t||ddd| _	t
||ddd| _t }t||| _|}|
dv rTt| j|| |
|tjtjhd| _nt| j|| d||	 dd	| _|| _|| _|| | _|d
krt| j|d| _t| j|d| _n|dkrt||d| _t||d| _ntd t|du sJ || _|dko|dk| _t ||dtjd| _!dd |D }|d urt"|||||d| _#n
t$|||||d| _#t ||dtjd| _%t&||dd| _'t( | _)t*+t,dd||d  | _-d S )NFrc   elementwise_affiner7   TrN   r`   )rN   reduce_results)slasagesla)r]   re   r   topkr^   .attn1r]   re   rh   r^   r   rms_normrb   r   QK Norm type not supportedrd   c                 S      h | ]}|j s|qS r2   	is_sparse.0br2   r2   r3   	<setcomp>j  
    z/WanTransformerBlock.__init__.<locals>.<setcomp>rk   rc   r^   rM   r(   rK         ?).r*   r+   r   r@   float32r,   r   rm   rn   ro   r   rp   r	   r   rv   r   r   SLA_ATTNSAGE_SLA_ATTNattn1r   
hidden_dimnum_attention_headsdim_headr   rs   rt   loggererror	Exceptionrk   ru   r   self_attn_residual_normr   attn2r}   cross_attn_residual_normr   ffnr   mlp_residualrq   	Parameterrandnscale_shift_table)r/   rD   r   r]   rk   r   rc   r   r^   r   r   r   rx   self_attn_backendscross_attn_backendsr0   r2   r3   r+     s   




	"zWanTransformerBlock.__init__r9   rT   rV   	freqs_cisr5   c                 C   s  |  dkr|d}|j\}}}|j}|  dkrN| jd|  jddd\}	}
}}}}|	d}	|
d}
|d}|d}|d}|d}n| j|  }|jddd\}	}
}}}}|	jtj	ksjJ | 
||	|
}| |\}}| |\}}| |\}}| jd ur| jrt|| j}n| |}| jd ur| jrt|| j}n| |}|dd| j| jf}|dd| j| jf}|dd| j| jf}|\}}tr|j|jkrtj|jtj	d |jtj	d gdd}t|||d	d
\}}nt|||d	dt|||d	d}}| |||}|d}| |\}}|d}tjd|j|jd }}|  |||||\}}||||}}| j!||d d}| "||d||\}}||||}}| #|}| $|||}||}|S )N   rd   r   rK   r   rD   r7   r[   Fis_neoxis_neox_stylerd   devicer7   rz   r{   )%rD   squeezeshaper7   r   	unsqueezefloatchunkr@   r   r,   rm   rn   ro   rs   ru   r   rt   r   rv   r   _is_cudacatr8   
contiguousr   r   r   r   rp   zerosr   r   r   r   r   r   )r/   r9   rT   rV   r   bs
seq_lengthr   
orig_dtype	shift_msa	scale_msagate_msac_shift_msac_scale_msa
c_gate_msaenorm_hidden_statesquerykeyvaluecossincos_sin_cacheattn_output
null_shift
null_scale	ff_outputr2   r2   r3   r:     s   

















zWanTransformerBlock.forward)r   Fr\   NNr   r   r   r<   r=   r>   r?   strboolr   r|   r   r+   r@   rA   tupler:   rB   r2   r2   r0   r3   r     sV    
	
nr   c                       s   e Zd Z						ddededed	ed
edededB dee dB def fddZ	de
jde
jde
jdee
je
jf de
jf
ddZ  ZS )WanTransformerBlock_VSAr   Fr\   Nr   rD   r   r]   rk   r   rc   r   r^   r   c
                    s  t    t||dtjd| _t||ddd| _t||ddd| _t||ddd| _	t||ddd| _
t||ddd| _t||| d||	 dd| _|| _|| _|| }
|dkrht|
|d| _t|
|d| _n|d	kr{t||d| _t||d| _ntd
 t|du sJ t||dtjd| _dd |D }|d urt|||||d| _n
t|||||d| _t||dtjd| _t||dd| _t | _t !t"dd||d  | _#d S )NFr   Tr   r   r   r   rb   r   r   c                 S   r   r2   r   r   r2   r2   r3   r   /  r   z3WanTransformerBlock_VSA.__init__.<locals>.<setcomp>r   rM   r(   rd   rK   r   )$r*   r+   r   r@   r   r,   r   rm   rn   ro   to_gate_compressrp   r   r   r   r   r   rs   rt   r   r   r   r   r   r   r   r}   r   r   r   r   r   rq   r   r   r   )r/   rD   r   r]   rk   r   rc   r   r^   r   r   r   r0   r2   r3   r+     s   


	"z WanTransformerBlock_VSA.__init__r9   rT   rV   r   r5   c                 C   s  |  dkr|d}|j\}}}|j}| j|  }	|	jddd\}
}}}}}|
jtjks0J | 	||
|}| 
|\}}| |\}}| |\}}| |\}}| jd ur]| |}| jd urg| |}|dd| jdf}|dd| jdf}|dd| jdf}|dd| jdf}|\}}tr|j|jkrtj|jtjd |jtjd gdd}t|||dd	\}}nt|||dd
t|||dd
}}| j||||d}|d}| |\}}|d}tjd|jd }}| |||||\}}||||}}| j||d d}| ||d||\}}||||}}|  |}| !|||}||}|S )Nr   rd   rK   r   r   r[   r   Fr   r   )gate_compressr   )r   r   )"rD   r   r   r7   r   r   r   r@   r   r,   rm   rn   ro   r   rs   rt   r   r   r   r   r8   r   r   r   r   r   rp   r   r   r   r   r   r   r   )r/   r9   rT   rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r2   r3   r:   Q  s   












zWanTransformerBlock_VSA.forward)r   Fr\   NNr   r   r2   r2   r0   r3   r     sJ    
	
\r   c                       s   e Zd Ze jZe jZe jZe jZe jZe j	Z	dede
eef ddf fddZeddd	ed
edededejdeejejf fddZ		ddejdejeej B dejdejeej B dB dejf
ddZdejdejddfddZdefddZdejdejfddZ  ZS )WanTransformer3DModelconfig	hf_configr5   Nc                    s  t  j |d  j j  j_ j_ j_ j_ j_ j_ j	_	t
 j jdd_t j j jd_t j}|rO| dkrOtntt fddt jD _t jdtjd_t  jt!" j d	d	d
_#t$t%ddd  _&d_'(  t) _*jj }|d|d   d|d  d|d  g_+t,j+dt-. st-/ rtjntj0d_1dg_2d S )N)r   r   F)in_chans	embed_dim
patch_sizer   )rD   rE   rF   rG   video_sparse_attnc                    sP   g | ]$} j  j j j j jjtjhB  j	 d |  j
 jdqS )z.blocks.)r   r   r   )r   r   rk   r   rc   r   _supported_attention_backendsr   VIDEO_SPARSE_ATTNr   r   r   )r   ir   	inner_dimr/   transformer_blockr2   r3   
<listcomp>  s$    z2WanTransformer3DModel.__init__.<locals>.<listcomp>r   Tr   rd   r   r   r   r   rK   i'  )rope_dim_list
rope_thetar7   blocks)3r*   r+   r   attention_head_dimhidden_sizein_channelsout_channelsnum_channels_latentsr   text_lenr   patch_embeddingrC   freq_dimtext_dim	image_dimcondition_embedderr!   attention_backendlowerr   r   rq   
ModuleListrange
num_layersr  r   rc   r@   r   norm_outr   mathprodproj_outr   r   r   cnt__post_init__r   sp_sizer  r   r    is_mpsis_musafloat64
rotary_emblayer_names)r/   r   r   attn_backenddr0   r   r3   r+     s~   (
zWanTransformer3DModel.__init__rd   )maxsize	local_lenrankframe_stride_localwidth_localr   c                 C   s`   || }t j||| |t jd}|| }|| }	|	| }
|	| }t j||
|fdd}| j|S )Nr   rd   r   )r@   arangelongstackr  forward_uncached)r/   r#  r$  r%  r&  r   token_starttoken_indicest_idxremh_idxw_idx	positionsr2   r2   r3    _compute_rope_for_sequence_shard  s   	z6WanTransformer3DModel._compute_rope_for_sequence_shardr9   rT   rS   r4   c           (   
   K   s0  t  j}|d ur|jo| jdk}nd}|d uo|j| _|j}	t|tjs(|d }t|t	r8t
|dkr8|d }nd }|j\}
}}}}| j\}}}|| }|| }|| }|s| jj|| j ||fdd|jd\}}|jtjksqJ |j|jksyJ |d ur| | fnd }| |}|ddd}|jd }d}|r|| j dkr| j|| j  }tj|
||jd f|j|jd}tj||gdd}t j}|jd | j }||
| j||jd }|d d |d d d d f }|| }| |||||j\}}| | f}| dkr|jd }| }nd }| j||||d\}} }}|d ur1| dd	} n| dd	} |d urEtj||gdd}t  sO|!|	n|}|j|	ksYJ | j"| |d
}!|!ri| #|}n | jrq|$ }"| j%D ]
}#|#||| |}qt| jr| &||" |  j'd7  _'|r|( }t)|dd}|dkr|d d d |d d f }| dkr| j*+d|+d j,ddd\}$}%|$-d}$|%-d}%n| j*|+d j,ddd\}$}%| .||$|%}| /|\}}&|0|
||||||d}|1dddddddd}|dddddd}'|'S )Nrd   Fr   )	shard_dimstart_framer   r   )r7   r   r   )rU   )rK   r[   )rW   rV      r[      r      rK   )2r   forward_batchenable_sequence_shardr  enable_teacacher7   
isinstancer@   rA   listlenr   r   r  forward_from_gridr   r   r   r
  r   	transposer   r   r   rank_in_groupviewr2  rD   r  r   concatr    is_amp_supportedr8   %should_skip_forward_for_cached_statesretrieve_cached_statescloner  maybe_cache_statesr  r   r
   r   r   r   r   r  r  reshapepermute)(r/   r9   rT   rS   r4   guidancekwargsr8  sequence_shard_enabledr   
batch_sizenum_channels
num_framesheightwidthp_tp_hp_wpost_patch_num_framespost_patch_heightpost_patch_width	freqs_cos	freqs_sinr   seq_len_origseq_shard_padpadsp_ranklocal_seq_lenframe_stride
ts_seq_lenrV   rW   should_skip_forwardoriginal_hidden_statesblockshiftscaler   outputr2   r2   r3   r:   '  s   	














zWanTransformer3DModel.forwardrb  c                 C   s(   | d| }| js|| _dS || _dS )z5Cache residual with CFG positive/negative separation.r   N)r   is_cfg_negativeprevious_residualprevious_residual_negative)r/   r9   rb  residualr2   r2   r3   rG    s   

z(WanTransformer3DModel.maybe_cache_statesc                 K   s   | j sdS |  }|d u rdS |j}t|tsJ d|j}||j}|j}|j	s2|d }|d }|d }|d }|r>|n|}	|j
| _
| j|k pM| j|k}
| j|	|
|j|jd}| S )NFz*teacache_params is not a WanTeaCacheParamsr   rW   rV   )modulated_inpis_boundary_stepcoefficientsteacache_thresh)r:  _get_teacache_contextteacache_paramsr;  r   use_ret_stepsget_cutoff_stepsnum_inference_steps	ret_stepsdo_cfgrg  r  _compute_teacache_decisionrm  rn  )r/   rK  ctxrp  rq  cutoff_stepsrt  rW   rV   rk  rl  should_calcr2   r2   r3   rD    s:   z;WanTransformer3DModel.should_skip_forward_for_cached_statesc                 C   s   | j s|| j S || j S )z?Retrieve cached residual with CFG positive/negative separation.)rg  rh  ri  )r/   r9   r2   r2   r3   rE    s   

z,WanTransformer3DModel.retrieve_cached_statesrX   )r<   r=   r>   r   _fsdp_shard_conditions_compile_conditionsr   param_names_mappingreverse_param_names_mappinglora_param_names_mappingdictr   r   r+   r   r?   r@   r   r   rA   r2  r<  
LongTensorr:   rG  r   rD  rE  rB   r2   r2   r0   r3   r     sZ    "`
 9

+r   )Fr  	functoolsr   typingr   r@   torch.nnrq   )sglang.multimodal_gen.configs.models.ditsr   (sglang.multimodal_gen.configs.sample.wanr   )sglang.multimodal_gen.runtime.distributedr   r   r   r	   r
   .sglang.multimodal_gen.runtime.layers.attentionr   r   r   0sglang.multimodal_gen.runtime.layers.elementwiser   .sglang.multimodal_gen.runtime.layers.layernormr   r   r   r   r   +sglang.multimodal_gen.runtime.layers.linearr   r   (sglang.multimodal_gen.runtime.layers.mlpr   5sglang.multimodal_gen.runtime.layers.rotary_embeddingr   r   r   5sglang.multimodal_gen.runtime.layers.visual_embeddingr   r   r   6sglang.multimodal_gen.runtime.managers.forward_contextr   .sglang.multimodal_gen.runtime.models.dits.baser   'sglang.multimodal_gen.runtime.platformsr   r    )sglang.multimodal_gen.runtime.server_argsr!   5sglang.multimodal_gen.runtime.utils.layerwise_offloadr"   1sglang.multimodal_gen.runtime.utils.logging_utilsr#   r<   r   is_cudar   Moduler$   rC   rY   r}   r   r   r   r   
EntryClassr2   r2   r2   r3   <module>   sH   +6#H X 6  v