o
    پi~                     @   sp  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, G dd dej-Z.G dd dej-Z/G dd de%e,Z0G dd dej-Z1G dd dej-Z2G dd dej-Z3e0Z4dS )    )AnyN)HunyuanVideoConfig)TeaCacheParams)get_sp_world_size)LocalAttentionUlyssesAttention)MulAdd)LayerNormScaleShiftRMSNorm ScaleResidualLayerNormScaleShift)ReplicatedLinear)MLP)_apply_rotary_embget_rotary_pos_embed)ModulateProjection
PatchEmbedTimestepEmbedder
unpatchify)get_forward_context)CachableDiT)modulate)AttentionBackendEnumcurrent_platform)OffloadableDiTMixinc                       s   e Zd ZdZ			ddedededejdB dee	 dB d	e
f fd
dZdejdejdejdedeejejf f
ddZ  ZS )MMDoubleStreamBlockz
    A multimodal DiT block with separate modulation for text and image/video,
    using distributed attention and linear layers.
    N hidden_sizenum_attention_heads	mlp_ratiodtypesupported_attention_backendsprefixc           	         s  t    d| _|| _|| }t|| }t|dd|| dd| _t|d|d| _t	|d|d| _
t | _t||d d|| d	d
| _t|d|d| _t|d|d| _t||d|| dd
| _t||d|| dd| _t|dd|| dd| _t|d|d| _t	|d|d| _t | _t||d d|d| _t|d|d| _t|d|d| _t||d|d| _t||d|d| _t||d|| dd| _d S )NF   siluz.img_modfactor	act_layerr   r!   )elementwise_affiner      Tz.img_attn_qkvbiasparams_dtyper!   ư>epsr   z.img_attn_projz.img_mlp)r*   r   r!   z.txt_mod)r*   r+   )r*   r   .attn	num_heads	head_sizecausalr    r!   ) super__init__deterministicr   intr   img_modr	   img_attn_normr   img_attn_residual_mlp_normr   img_mlp_residualr   img_attn_qkvr
   img_attn_q_normimg_attn_k_normimg_attn_projr   img_mlptxt_modtxt_attn_normtxt_attn_residual_mlp_normtxt_mlp_residualtxt_attn_qkvtxt_attn_q_normtxt_attn_k_normtxt_attn_projtxt_mlpr   attn	selfr   r   r   r   r    r!   head_dimmlp_hidden_dim	__class__ j/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/dits/hunyuanvideo.pyr5   4   s   
				zMMDoubleStreamBlock.__init__imgtxtvec	freqs_cisreturnc           -      C   s  |  |}tj|ddd\}}}}	}
}| |}tj|ddd\}}}}}}| |||}| |\}}|jd |jd }}|||d| jd}|d d d d df |d d d d df |d d d d df }}}| 	|
 |}| |
 |}|\}}t|||dd	t|||dd	}}| |||}| |\}}|jd |jd }}|||d| jd}|d d d d df |d d d d df |d d d d df } }!}"| | 
 | j} | |!
 |!j}!| |||| |!|"\}#}$| |#||d\}%}| ||%||	|
\}&}'| |&}(| |(||'}| |$||d\})}| ||)|||\}*}+| |*},| |,||+}||fS )
Nr"   dimr      r(      Fis_neox_style)r8   torchchunkrA   r9   r<   shapeviewr   r=   
contiguoustor>   r   rB   rE   rF   r   rG   rJ   r?   r:   r@   r;   rH   reshaperC   rI   rD   )-rL   rS   rT   rU   rV   img_mod_outputsimg_attn_shiftimg_attn_scaleimg_attn_gateimg_mlp_shiftimg_mlp_scaleimg_mlp_gatetxt_mod_outputstxt_attn_shifttxt_attn_scaletxt_attn_gatetxt_mlp_shifttxt_mlp_scaletxt_mlp_gateimg_attn_inputimg_qkv_
batch_sizeimage_seq_lenimg_qimg_kimg_vcossintxt_attn_inputtxt_qkvtext_seq_lentxt_qtxt_ktxt_vimg_attntxt_attnimg_attn_outimg_mlp_inputimg_residualimg_mlp_outtxt_attn_outtxt_mlp_inputtxt_residualtxt_mlp_outrQ   rQ   rR   forward   sx   

FF



zMMDoubleStreamBlock.forward)NNr   __name__
__module____qualname____doc__r7   floatr_   r   setr   strr5   Tensortupler   __classcell__rQ   rQ   rO   rR   r   .   s:    

hr   c                       s   e Zd ZdZ				ddedededejdB d	ee	 dB d
e
f fddZdejdejdedeejejf dejf
ddZ  ZS )MMSingleStreamBlockzi
    A DiT block with parallel linear layers using distributed attention
    and tensor parallelism.
          @Nr   r   r   r   r   r    r!   c           	         s   t    d| _|| _|| _|| }t|| }|| _t||d | d|| dd| _t|| |d|| dd| _	t
|d|d| _t
|d|d| _t|dd|d	| _t | _tjd
d| _t|dd|| dd| _t||d|| dd| _d S )NFr(   Tz.linear1r)   z.linear2r,   r-   r.   r'   r   tanh)approximater#   z.modulationr$   r/   r0   )r4   r5   r6   r   r   r7   rN   r   linear1linear2r
   q_normk_normr	   input_norm_scale_shiftr   output_residualnnGELUmlp_actr   
modulationr   rJ   rK   rO   rQ   rR   r5     sZ   
	
			zMMSingleStreamBlock.__init__xrU   txt_lenrV   rW   c                  C   s  |  |jddd\}}}| |||}| |\}	}
tj|	d| j | jgdd\}}|jd |jd }}|	||d| j
d}|d d d d df |d d d d df |d d d d df }}}| | |j}| | |j}|d d d | f |d d | d f }}|d d d | f |d d | d f }}|d d d | f |d d | d f }}|\}}t|||ddt|||dd}}| ||||||\}}tj||fdd	||d}| |}tj||fdd}| |\}}
| |||S )	Nr(   rX   rY   r   r[   r\   Fr]   )r   r`   r   r   r_   splitr   rN   ra   rb   r   r   rc   rd   r   r   r   rJ   catr   r   r   ) rL   r   rU   r   rV   	mod_shift	mod_scalemod_gatex_modlinear1_outrv   qkvmlprw   seq_lenqkvry   r   rz   r   r{   r   r|   r}   img_attn_outputtxt_attn_outputattn_output
mlp_outputcombinedoutputrQ   rQ   rR   r   I  s<   
F...
zMMSingleStreamBlock.forward)r   NNr   r   rQ   rQ   rO   rR   r      s<    	
Fr   c                       s   e Zd ZdZe jZe jZe jZe jZe j	Z	e j
Z
dedeeef f fddZ		ddejdejeej B d	ejd
ejeej B dB fddZdejdejddfddZdefddZdejdejfddZ  ZS )HunyuanVideoTransformer3DModelaY  
    HunyuanVideo Transformer backbone adapted for distributed training.

    This implementation uses distributed attention and linear layers for efficient
    parallel processing across multiple GPUs.

    Based on the architecture from:
    - Flux.1: https://github.com/black-forest-labs/flux
    - MMDiT: http://arxiv.org/abs/2403.03206
    config	hf_configc                    s  t  j |d  j j jg_ j_ j_ jd u r! jn j_j_ j_t	 j
_ j_ j_ j_ j_ j j }t j
|kr[td j
 d|  j_ j_ j_tjjj j j dd_tj j j j j j dd_tjd j j d	d
_tjjjd j j dd_jrtjd j j dd
nd _ t!" fddt# j$D _%t!" fddt# j&D _'t( jjj j j dd_)*  ddg_+d S )N)r   r   zGot z but expected positional dim z.img_in)r   r!   z.txt_in)depthr   r!   r#   z.time_inr&   r   r!   z
.vector_inact_typer   r!   z.guidance_inc                    s6   g | ]}t  j j j jj j d | dqS )z.double_blocks.r   r   r    r!   )r   r   r   r   r   _supported_attention_backendsr!   .0ir   rL   rQ   rR   
<listcomp>  s    	z;HunyuanVideoTransformer3DModel.__init__.<locals>.<listcomp>c                    s<   g | ]}t  j j j jj j d | j  dqS )z.single_blocks.r   )r   r   r   r   r   r   r!   
num_layersr   r   rQ   rR   r     s    	z.final_layerdouble_blockssingle_blocks),r4   r5   patch_size_t
patch_sizein_channelsnum_channels_latentsout_channelsunpatchify_channelsguidance_embedslistrope_axes_dimrope_dim_list
rope_thetatext_embed_dimtext_states_dimpooled_projection_dimtext_states_dim_2r   r   r   sum
ValueErrorr   r!   img_inSingleTokenRefinernum_refiner_layerstxt_inr   time_inr   	vector_inguidance_inr   
ModuleListranger   r   num_single_layersr   
FinalLayerfinal_layer__post_init__layer_names)rL   r   r   pe_dimrO   r   rR   r5     s   





		
z'HunyuanVideoTransformer3DModel.__init__Nhidden_statesencoder_hidden_statestimestepencoder_hidden_states_imagec           "      K   sj  t  }|j}|duo|j}	|du rtjdg|j|jd}| }
}|}t|tjr?|ddddf }|dddd| j	f }n|d }|d }|j
\}}}}}|| jd  || jd  || jd  }}}t|t  ||f| j| j| j| j\}}||j}||j}| |}|| | }| jr|dur|| | }| |
}
| ||}|j
d }|
j
d }|dur||fnd}| j|
|d}|r| |
}
nT|	r|
 }t| jD ]\}}|
|||g} ||  \}
}qt|
|fd}t| jdkrt| jD ]\}}||||g}!||! }q|ddd|df }
|	r"|  |
| | !|
|}
t"|
|||| j| j#}
|
S )	aN  
        Forward pass of the HunyuanDiT model.

        Args:
            hidden_states: Input image/video latents [B, C, T, H, W]
            encoder_hidden_states: Text embeddings [B, L, D]
            timestep: Diffusion timestep
            guidance: Guidance scale for CFG

        Returns:
            Tuple of (output)
        Ng     @)devicer   r[   r   r\   )rS   rU   .)$r   forward_batchenable_teacacher_   tensorr   r   
isinstancer   r   ra   r   r   r   r   r   r   r   rd   r   r   r   r   r   %should_skip_forward_for_cached_statesretrieve_cached_statesclone	enumerater   r   lenr   maybe_cache_statesr   r   r   )"rL   r   r   r   r   guidancekwargsforward_contextr   r   rS   r   trT   text_states_2rv   otohowttthtw	freqs_cos	freqs_sinrU   txt_seq_lenimg_seq_lenrV   should_skip_forwardoriginal_imgindexblockdouble_block_argssingle_block_argsrQ   rQ   rR   r     s|   





z&HunyuanVideoTransformer3DModel.forwardoriginal_hidden_statesrW   c                 C   s   || | _ d S Nprevious_residual)rL   r   r  rQ   rQ   rR   r     s   z1HunyuanVideoTransformer3DModel.maybe_cache_statesc                 K   s4   t  }|j}|d u rdS |j}|j}|sdS td)NFz.teacache is not supported yet for HunyuanVideo)'r   r   current_timestepr   NotImplementedErrorteacache_paramsr   r   num_inference_stepsteacache_threshcoefficientscntr   r_   distributedr   DTensor
from_local
DeviceMeshr   device_typer   r   r   	Replicater   r8   r`   r9   normr   accumulated_rel_l1_distancenppoly1dprevious_modulated_inputabsmeancpuitem)rL   r   r   r   r  r   r  r  teache_threshr  inpvec_img_mod1_shiftimg_mod1_scaleimg_mod1_gateimg_mod2_shiftimg_mod2_scaleimg_mod2_gate
normed_inpmodulated_inpshould_calcrescale_funcrQ   rQ   rR   r     s   zDHunyuanVideoTransformer3DModel.should_skip_forward_for_cached_statesc                 C   s
   || j  S r  r  )rL   r   rQ   rQ   rR   r     s   
z5HunyuanVideoTransformer3DModel.retrieve_cached_states)NN)r   r   r   r   r   _fsdp_shard_conditions_compile_conditionsr   param_names_mappingreverse_param_names_mappinglora_param_names_mappingdictr   r   r5   r_   r   r   
LongTensorr   r   boolr   r   r   rQ   rQ   rO   rR   r     s:    }
u
cr   c                       <   e Zd ZdZ				ddeddf fdd	Zd
d Z  ZS )r   z
    A token refiner that processes text embeddings with attention to improve
    their representation for cross-attention with image features.
    r\   TNr   r!   rW   c                    s   t    t|d  dd| _td  dd| _t|d  dd| _t	 fd	d
t
|D | _d S )NTz.input_embedderr)   r#   z.t_embedderr   z.c_embedderr   c              
      s(   g | ]}t   d | dqS )z.refiner_blocks.)qkv_biasr   r!   )IndividualTokenRefinerBlockr   r   r   r   r!   rC  rQ   rR   r      s    z/SingleTokenRefiner.__init__.<locals>.<listcomp>)r4   r5   r   input_embedderr   
t_embedderr   
c_embedderr   r   r   refiner_blocks)rL   r   r   r   r   rC  r   r!   rO   rE  rR   r5     s0   

	

zSingleTokenRefiner.__init__c                 C   sR   |  |}tj|dd}| |}|| }| |\}}| jD ]}|||}q|S )Nr[   rY   )rG  r_   r*  rH  rF  rI  )rL   r   r  timestep_aware_representationscontext_aware_representationscrv   r  rQ   rQ   rR   r   ,  s   


zSingleTokenRefiner.forward)r\   TNr   r   r   r   r   r   r5   r   r   rQ   rQ   rO   rR   r     s    
	2r   c                       rB  )rD  zQ
    A transformer block for refining individual tokens with self-attention.
    r   TNr   r!   rW   c                    s   t    || _t|| }tj|dd|d| _t||d ||| dd| _t||||| dd| _	tj|dd|d| _
t||dd|| d	d
| _t|dd|| dd| _t||| tjtjtjfd| _d S )Nr,   Tr   r(   z.self_attn_qkvr)   z.self_attn_projr#   z.mlp)r*   r   r   r!   r\   .adaLN_modulationr$   )r1   r2   r    )r4   r5   r   r7   r   	LayerNormnorm1r   self_attn_qkvself_attn_projnorm2r   r   r   adaLN_modulationr   r   FAAITER
TORCH_SDPArJ   )rL   r   r   r   rC  r   r!   rN   rO   rQ   rR   r5   C  s\   
		
	z$IndividualTokenRefinerBlock.__init__c                 C   s  |  |jddd\}}| |}| |\}}|jd |jd }}	|||	d| jd}|d d d d df |d d d d df |d d d d df }
}}| |
||}|||	d}| 	|\}}|||
d  }| | |}|||
d  }|S )Nr\   rX   rY   r   r[   r(   )rT  r`   rP  rQ  ra   rb   r   rJ   re   rR  	unsqueezer   rS  )rL   r   rL  gate_msagate_mlpnorm_xr   rv   rw   r   r   r   r   r   attn_outmlp_outrQ   rQ   rR   r     s   
Fz#IndividualTokenRefinerBlock.forward)r   TNr   rM  rQ   rQ   rO   rR   rD  >  s    DrD  c                       s6   e Zd ZdZ	d
deddf fddZdd	 Z  ZS )r   zG
    The final layer of DiT that projects features to pixel space.
    Nr   r!   rW   c                    sr   t    tj|dd|d| _|d |d  |d  | }t||d|| dd	| _t|dd
|| dd| _d S )Nr,   Fr   r   r[   r\   Tz.linearr)   r#   rN  r$   )	r4   r5   r   rO  
norm_finalr   linearr   rT  )rL   r   r   r   r   r!   
output_dimrO   rQ   rR   r5     s&   
	zFinalLayer.__init__c                 C   sL   |  |jddd\}}| |d|d  |d }| |\}}|S )Nr\   rX   rY   g      ?r[   )rT  r`   r^  rX  r_  )rL   r   rL  scaleshiftrv   rQ   rQ   rR   r     s   "zFinalLayer.forward)Nr   rM  rQ   rQ   rO   rR   r     s    r   )5typingr   numpyr&  r_   torch.nnr   )sglang.multimodal_gen.configs.models.ditsr   -sglang.multimodal_gen.configs.sample.teacacher   8sglang.multimodal_gen.runtime.distributed.parallel_stater   .sglang.multimodal_gen.runtime.layers.attentionr   r   0sglang.multimodal_gen.runtime.layers.elementwiser   .sglang.multimodal_gen.runtime.layers.layernormr	   r
   r   +sglang.multimodal_gen.runtime.layers.linearr   (sglang.multimodal_gen.runtime.layers.mlpr   5sglang.multimodal_gen.runtime.layers.rotary_embeddingr   r   5sglang.multimodal_gen.runtime.layers.visual_embeddingr   r   r   r   6sglang.multimodal_gen.runtime.managers.forward_contextr   .sglang.multimodal_gen.runtime.models.dits.baser   *sglang.multimodal_gen.runtime.models.utilsr   'sglang.multimodal_gen.runtime.platformsr   r   5sglang.multimodal_gen.runtime.utils.layerwise_offloadr   Moduler   r   r   r   rD  r   
EntryClassrQ   rQ   rQ   rR   <module>   s<    P 	  qJc*