o
    پi"                     @   s   d dl mZmZ d dlZd dlmZmZ dedefddZ	dedefdd	Z
dedefd
dZdedefddZeG dd deZeG dd deZdS )    )	dataclassfieldN)DiTArchConfig	DiTConfignreturnc                 C      d| v ot | dd S )Ndouble.strisdigitsplitr   m r   j/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/configs/models/dits/hunyuanvideo.pyis_double_block      r   c                 C   r   )Nsingler
   r   r   r   r   r   r   is_single_block   r   r   c                 C   r   )Nrefinerr
   r   r   r   r   r   r   is_refiner_block   r   r   c                 C   s   |  dd dkS )Nr
   r   txt_in)r   r   r   r   r   	is_txt_in   s   r   c                       sl  e Zd ZU edd dZeed< edd dZeed< edd dZe	ed< ed	d dZ
e	ed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeeeef ed< dZeed < d!Zejd!B ed"< d#Zeed$< d%Zeed&< d'Zeed(< d)Z e!ed*< ed+d dZ"ee! ed,<  fd-d.Z#  Z$S )/HunyuanVideoArchConfigc                   C   
   t ttgS N)r   r   r   r   r   r   r   <lambda>      
 zHunyuanVideoArchConfig.<lambda>default_factory_fsdp_shard_conditionsc                   C   r   r   )r   r   r   r   r   r   r   r   "   r    _compile_conditionsc                   C   s   i ddddddddd	d
dddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTS )UNzF^context_embedder\.time_text_embed\.timestep_embedder\.linear_1\.(.*)$ztxt_in.t_embedder.mlp.fc_in.\1zF^context_embedder\.time_text_embed\.timestep_embedder\.linear_2\.(.*)$ztxt_in.t_embedder.mlp.fc_out.\1z!^context_embedder\.proj_in\.(.*)$ztxt_in.input_embedder.\1zB^context_embedder\.time_text_embed\.text_embedder\.linear_1\.(.*)$ztxt_in.c_embedder.fc_in.\1zB^context_embedder\.time_text_embed\.text_embedder\.linear_2\.(.*)$ztxt_in.c_embedder.fc_out.\1zE^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.norm1\.(.*)$z!txt_in.refiner_blocks.\1.norm1.\2zE^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.norm2\.(.*)$z!txt_in.refiner_blocks.\1.norm2.\2zJ^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.attn\.to_q\.(.*)$))txt_in.refiner_blocks.\1.self_attn_qkv.\2r      zJ^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.attn\.to_k\.(.*)$)r%      r&   zJ^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.attn\.to_v\.(.*)$)r%      r&   zO^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.attn\.to_out\.0\.(.*)$z*txt_in.refiner_blocks.\1.self_attn_proj.\2zU^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.ff\.net\.0(?:\.proj)?\.(.*)$z%txt_in.refiner_blocks.\1.mlp.fc_in.\2zU^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.ff\.net\.2(?:\.proj)?\.(.*)$z&txt_in.refiner_blocks.\1.mlp.fc_out.\2zP^context_embedder\.token_refiner\.refiner_blocks\.(\d+)\.norm_out\.linear\.(.*)$z3txt_in.refiner_blocks.\1.adaLN_modulation.linear.\2z^x_embedder\.proj\.(.*)$zimg_in.proj.\1z4^time_text_embed\.timestep_embedder\.linear_1\.(.*)$ztime_in.mlp.fc_in.\1z4^time_text_embed\.timestep_embedder\.linear_2\.(.*)$ztime_in.mlp.fc_out.\1z4^time_text_embed\.guidance_embedder\.linear_1\.(.*)$zguidance_in.mlp.fc_in.\1z4^time_text_embed\.guidance_embedder\.linear_2\.(.*)$zguidance_in.mlp.fc_out.\1z0^time_text_embed\.text_embedder\.linear_1\.(.*)$zvector_in.fc_in.\1z0^time_text_embed\.text_embedder\.linear_2\.(.*)$zvector_in.fc_out.\1z0^transformer_blocks\.(\d+)\.norm1\.linear\.(.*)$z"double_blocks.\1.img_mod.linear.\2z8^transformer_blocks\.(\d+)\.norm1_context\.linear\.(.*)$z"double_blocks.\1.txt_mod.linear.\2z/^transformer_blocks\.(\d+)\.attn\.norm_q\.(.*)$z#double_blocks.\1.img_attn_q_norm.\2z/^transformer_blocks\.(\d+)\.attn\.norm_k\.(.*)$z#double_blocks.\1.img_attn_k_norm.\2z-^transformer_blocks\.(\d+)\.attn\.to_q\.(.*)$) double_blocks.\1.img_attn_qkv.\2r   r&   z-^transformer_blocks\.(\d+)\.attn\.to_k\.(.*)$)r)   r'   r&   z-^transformer_blocks\.(\d+)\.attn\.to_v\.(.*)$)r)   r(   r&   z3^transformer_blocks\.(\d+)\.attn\.add_q_proj\.(.*)$) double_blocks.\1.txt_attn_qkv.\2r   r&   z3^transformer_blocks\.(\d+)\.attn\.add_k_proj\.(.*)$)r*   r'   r&   z3^transformer_blocks\.(\d+)\.attn\.add_v_proj\.(.*)$)r*   r(   r&   z2^transformer_blocks\.(\d+)\.attn\.to_out\.0\.(.*)$z!double_blocks.\1.img_attn_proj.\2z3^transformer_blocks\.(\d+)\.attn\.to_add_out\.(.*)$z!double_blocks.\1.txt_attn_proj.\2z5^transformer_blocks\.(\d+)\.attn\.norm_added_q\.(.*)$z#double_blocks.\1.txt_attn_q_norm.\2z#double_blocks.\1.txt_attn_k_norm.\2z!double_blocks.\1.img_mlp.fc_in.\2z"double_blocks.\1.img_mlp.fc_out.\2z!double_blocks.\1.txt_mlp.fc_in.\2z"double_blocks.\1.txt_mlp.fc_out.\2zsingle_blocks.\1.q_norm.\2zsingle_blocks.\1.k_norm.\2)single_blocks.\1.linear1.\2r      )r+   r'   r,   )r+   r(   r,   )r+   r&   r,   zsingle_blocks.\1.linear2.\2z%single_blocks.\1.modulation.linear.\2z&final_layer.adaLN_modulation.linear.\1zfinal_layer.linear.\1)z5^transformer_blocks\.(\d+)\.attn\.norm_added_k\.(.*)$z8^transformer_blocks\.(\d+)\.ff\.net\.0(?:\.proj)?\.(.*)$z8^transformer_blocks\.(\d+)\.ff\.net\.2(?:\.proj)?\.(.*)$z@^transformer_blocks\.(\d+)\.ff_context\.net\.0(?:\.proj)?\.(.*)$z@^transformer_blocks\.(\d+)\.ff_context\.net\.2(?:\.proj)?\.(.*)$z6^single_transformer_blocks\.(\d+)\.attn\.norm_q\.(.*)$z6^single_transformer_blocks\.(\d+)\.attn\.norm_k\.(.*)$z4^single_transformer_blocks\.(\d+)\.attn\.to_q\.(.*)$z4^single_transformer_blocks\.(\d+)\.attn\.to_k\.(.*)$z4^single_transformer_blocks\.(\d+)\.attn\.to_v\.(.*)$z2^single_transformer_blocks\.(\d+)\.proj_mlp\.(.*)$z2^single_transformer_blocks\.(\d+)\.proj_out\.(.*)$z6^single_transformer_blocks\.(\d+)\.norm\.linear\.(.*)$z^norm_out\.linear\.(.*)$z^proj_out\.(.*)$r   r   r   r   r   r   &   s    	 !"#$&'()*/49>CHJKLparam_names_mappingc                   C   s   i S r   r   r   r   r   r   r      s    reverse_param_names_mappingr(   
patch_sizer'   patch_size_t   in_channelsout_channels   num_attention_heads   attention_head_dimg      @	mlp_ratio   
num_layers(   num_single_layersnum_refiner_layers)r1   8   r>   rope_axes_dimFguidance_embedsNdtypei   text_embed_dimi   pooled_projection_dim   
rope_thetarms_normqk_normc                   C   s   g dS )N)img_inr   time_in	vector_inr   r   r   r   r   r      s    exclude_lora_layersc                    s$   t    | j| j | _| j| _d S r   )super__post_init__r7   r5   hidden_sizer2   num_channels_latents)self	__class__r   r   rM      s   
z$HunyuanVideoArchConfig.__post_init__)%__name__
__module____qualname__r   r#   list__annotations__r$   r-   dictr.   r/   intr0   r2   r3   r5   r7   r8   floatr:   r<   r=   r?   tupler@   boolrA   torchrB   rC   rE   rG   r   rK   rM   __classcell__r   r   rQ   r   r      s@   
 rr   c                   @   s,   e Zd ZU eedZeed< dZe	ed< dS )HunyuanVideoConfigr!   arch_configHunyuanprefixN)
rS   rT   rU   r   r   r`   r   rW   rb   r   r   r   r   r   r_      s   
 r_   )dataclassesr   r   r]   .sglang.multimodal_gen.configs.models.dits.baser   r   r   r\   r   r   r   r   r   r_   r   r   r   r   <module>   s    