o
    پib                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( zd dl)m*Z* W n e+y   dZ*Y nw e(e,Z-e$. Z/dZ0dZ1G dd dej2Z3G dd dej2Z4G dd dej2Z5G dd dej2Z6G dd dej2Z7G dd dej2Z8G dd  d Z9G d!d" d"e"e&Z:e:Z;dS )#    N)AnyListOptionalTuple)ZImageDitConfig)get_tp_world_size)
SiluAndMul)USPAttention)RMSNormapply_qk_norm)ColumnParallelLinearMergedColumnParallelLinearReplicatedLinearRowParallelLinear)QuantizationConfig)NunchakuConfigis_nunchaku_available)_apply_rotary_emb apply_flashinfer_rope_qk_inplace)CachableDiT)current_platform)OffloadableDiTMixin)init_logger)NunchakuFeedForward       c                       $   e Zd Z fddZdd Z  ZS )SelectFirstElementc                    s   t    d S N)super__init__)self	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/dits/zimage.pyr    /   s   zSelectFirstElement.__init__c                 C   s   |d S )Nr   r$   )r!   xr$   r$   r%   forward2   s   zSelectFirstElement.forward__name__
__module____qualname__r    r'   __classcell__r$   r$   r"   r%   r   .   s    r   c                       s4   e Zd Zd
 fdd	ZedddZdd	 Z  ZS )TimestepEmbedderNr   c              
      sN   t    |d u r|}tt||dddt t||dddg| _|| _d S )NTFbiasgather_outputr/   input_is_parallel)	r   r    nn
ModuleListr   SiLUr   mlpfrequency_embedding_size)r!   out_sizemid_sizer7   r"   r$   r%   r    7   s   

zTimestepEmbedder.__init__'  c              
   C   s   t jjtjddY |d }t t| t jd|t j	| j
d | }| d d d f  |d   }t jt |t |gdd}|d rYt j|t |d d d df gdd}|W  d    S 1 sew   Y  d S )	NF)enabled   r   )startenddtypedevicedim   )torchampautocastr   device_typeexpmathlogarangefloat32r@   floatcatcossin
zeros_like)trC   
max_periodhalffreqsargs	embeddingr$   r$   r%   timestep_embeddingJ   s"   
$z#TimestepEmbedder.timestep_embeddingc                 C   sT   |  || j| jd jj}| jd |\}}| jd |}| jd |\}}|S )Nr   rD   r<   )rY   r7   tor6   weightr?   )r!   rS   t_freqt_emb_r$   r$   r%   r'   [   s   zTimestepEmbedder.forward)Nr   )r:   )r)   r*   r+   r    staticmethodrY   r'   r,   r$   r$   r"   r%   r-   6   s
    r-   c                       s.   e Zd Zdedef fddZdd Z  ZS )FeedForwardrC   
hidden_dimc                    s>   t    t|||gddd| _t||ddd| _t | _d S )NFr.   Tr1   )r   r    r   w13r   w2r   act)r!   rC   ra   r"   r$   r%   r    f   s   
zFeedForward.__init__c                 C   s*   |  |\}}| |}| |\}}|S r   )rb   rd   rc   )r!   r&   x13r^   outr$   r$   r%   r'   o   s   
zFeedForward.forward)r)   r*   r+   intr    r'   r,   r$   r$   r"   r%   r`   e   s    	r`   c                       st   e Zd Z				ddedededed	ed
ee deddf fddZ		dde
jdeee
je
jf  fddZ  ZS )ZImageAttentionTư>N rC   	num_headsnum_kv_headsqk_normepsquant_configprefixreturnc           
   
      st  t    || _|| | _|| _|| _|| _t }|| dks)J d| d| || dks9J d| d| || | _|| | _	| j| }	t
|t| _| jrct|||	|	gdd|| dd| _nt||ddd| _t||	ddd| _t||	ddd| _| jrt| j|d	| _t| j|d	| _nd | _d | _tt||dd
|| ddg| _t| j| j| j	dd dd| _d S )Nr   z
num_heads z$ must be divisible by tp world size znum_kv_heads Fz.to_qkv)r/   r0   ro   rp   r.   rn   Tz	.to_out.0)r/   r2   ro   rp   )rk   	head_sizerl   dropout_ratesoftmax_scalecausal)r   r    rC   head_dimrk   rl   rm   r   local_num_headslocal_num_kv_heads
isinstancer   use_fused_qkvr   to_qkvr   to_qto_kto_vr
   norm_qnorm_kr3   r4   r   to_outr	   attn)
r!   rC   rk   rl   rm   rn   ro   rp   tp_sizekv_dimr"   r$   r%   r    w   sr   






	zZImageAttention.__init__hidden_states	freqs_cisc                 C   s  | j r0| |\}}|j| j| j | j| j | j| j gdd\}}}| }| }| }n| |\}}| |\}}| 	|\}}|j
g |jd d | j| jR  }|j
g |jd d | j| jR  }|j
g |jd d | j| jR  }| jrt||| j| j| jdd\}}|d ur|\}}	tr|j|jkrtj|jtjd |	jtjd gdd}
t|||
dd\}}nt|||	dd}t|||	dd}| |||}|d	}| jd
 |\}}|S )NrA   rB   T)qkq_normk_normrw   allow_inplace)r?   F)is_neox)is_neox_styler<   r   )r{   r|   splitrx   rw   ry   
contiguousr}   r~   r   viewshaperm   r   r   r   _is_cudarE   rO   rZ   rM   r   r   r   flattenr   )r!   r   r   qkvr^   r   r   vrP   rQ   cos_sin_cacher$   r$   r%   r'      sZ   



&&&
	
zZImageAttention.forward)Tri   Nrj   r   )r)   r*   r+   rg   boolrN   r   r   strr    rE   Tensorr   r'   r,   r$   r$   r"   r%   rh   v   s8    	Prh   c                       sx   e Zd Z			ddededededed	ed
ee def fddZ		dde
jdee
je
jf dee
j fddZ  ZS )ZImageTransformerBlockTNrj   layer_idrC   n_heads
n_kv_headsnorm_epsrm   ro   rp   c
              	      sL  t    || _|| | _|| _|| _t||||d||	 dd| _t|d d }
t	|t
o1t }|rmdd l}|jjj||d|
dd	}|j|j|jd
}t|fi || _t| jdrlt| jjdkrl|j| jjd _nt||
d| _t||d| _t||d| _t||d| _t||d| _|rttt|t d| dd| _!d S d S )Ngh㈵>z
.attention)rC   rk   rl   rm   rn   ro   rp         r   swigluF)rC   dim_outactivation_fn	inner_dimr/   )	precisionrankact_unsignednetr<   )rC   ra   rr      Tr/   )"r   r    rC   rw   r   
modulationrh   	attentionrg   rz   r   r   	diffusersmodelsr`   r   r   r   r   feed_forwardhasattrlenr   r
   attention_norm1	ffn_norm1attention_norm2	ffn_norm2r3   
Sequentialr   minADALN_EMBED_DIMadaLN_modulation)r!   r   rC   r   r   r   rm   r   ro   rp   ra   nunchaku_enabledr   ffnunchaku_kwargsr"   r$   r%   r      sX   



zZImageTransformerBlock.__init__r&   r   adaln_inputc              	   C   s   | j rX|d us	J | |\}}|djddd\}}}}	| |	 }}	d| d| }}| j| || |d}
||| |
  }||	| | 	| 
||   }|S | j| ||d}
|| |
 }|| | 	| 
| }|S )NrD   r   r<   rB         ?)r   )r   r   	unsqueezechunktanhr   r   r   r   r   r   )r!   r&   r   r   scale_msa_gater^   	scale_msagate_msa	scale_mlpgate_mlpattn_outr$   r$   r%   r'   E  s@   zZImageTransformerBlock.forward)TNrj   r   )r)   r*   r+   rg   rN   r   r   r   r   r    rE   r   r   r'   r,   r$   r$   r"   r%   r     s:    		
Gr   c                       r   )
FinalLayerc                    s^   t    tj|ddd| _t||ddd| _t | _t	t t
t|t|dd| _d S )NFri   )elementwise_affinern   Tr.   r   )r   r    r3   	LayerNorm
norm_finalr   linearr5   rd   r   r   r   r   r   )r!   hidden_sizeout_channelsr"   r$   r%   r    t  s   


zFinalLayer.__init__c                 C   s<   |  |\}}d| }| ||d }| |\}}|S )Nr   rD   )r   r   r   r   )r!   r&   cscaler^   r$   r$   r%   r'     s
   zFinalLayer.forwardr(   r$   r$   r"   r%   r   s  s    r   c                   @   sv   e Zd Z			ddedee dee fddZedd	ee d
ee defddZde	j
dee	j
e	j
f fddZdS )RopeEmbedder      p@   8   r   @      r   theta	axes_dims	axes_lensc                 C   s:   || _ || _|| _t|t|ksJ dd | _d | _d S )Nz1axes_dims and axes_lens must have the same length)r   r   r   r   
cos_cached
sin_cached)r!   r   r   r   r$   r$   r%   r      s   
zRopeEmbedder.__init__rC   r>   c           
      C   s   t dQ g }g }tt| |D ]9\}\}}d|t jd|dt jdd|   }t j||jt jd}	t |	| }|t 	| |t 
| q||fW  d    S 1 sYw   Y  d S )Ncpur   r   r<   r?   r@   )r@   r?   )rE   r@   	enumerateziprL   float64outerrN   appendrP   rQ   )
rC   r>   r   cos_listsin_listiderV   timestepr$   r$   r%   precompute_freqs  s   $zRopeEmbedder.precompute_freqsidsrq   c                    s.  |j dksJ |jd t| jksJ |j | jdu rA| j| j| j| jd\| _| _	 fdd| jD | _ fdd| j	D | _	n| jd j kr_ fd	d| jD | _ fd
d| j	D | _	g }g }t
t| jD ]}|dd|f }|| j| |  || j	| |  qjtj|ddtj|ddfS )z
        Args:
            ids: [batch, len(axes_dims)] or [seq_len, len(axes_dims)]
        Returns:
            cos: [batch/seq, head_dim // 2]
            sin: [batch/seq, head_dim // 2]
        r<   rA   N)r   c                       g | ]}|  qS r$   rZ   .0r   r@   r$   r%   
<listcomp>      z)RopeEmbedder.__call__.<locals>.<listcomp>c                    r   r$   r   r   sr   r$   r%   r     r   r   c                    r   r$   r   r   r   r$   r%   r     r   c                    r   r$   r   r   r   r$   r%   r     r   rB   )ndimr   r   r   r@   r   r   r   r   r   ranger   rE   rO   )r!   r   cos_outsin_outr   indexr$   r   r%   __call__  s&   
zRopeEmbedder.__call__N)r   r   r   )r   )r)   r*   r+   rN   r   rg   r    r_   r   rE   r   r   r   r$   r$   r$   r%   r     s    
"$r   c                	       s  e Zd ZdZdgZe jjZe jjZe jjZe jj	Z	e
deeee f fddZ	ddedeeef d	ee ddf fd
dZdeej dee deej fddZed ddZdeej deej dedefddZ				d!deej deej fddZ  ZS )"ZImageTransformer2DModelTr   rq   c                 C   s   g dg dddgdS )N)normembedrotary	pos_embed)zattention.to_qkvzattention.to_outimg_mlptxt_mlpimg_modtxt_mod)skip	svdq_w4a4	awq_w4a16r$   )clsr$   r$   r%   get_nunchaku_quant_rules  s   z1ZImageTransformer2DModel.get_nunchaku_quant_rulesNconfig	hf_configro   c                    sD  t  j||d |_|j  j_ j_ j_ j_ j_ j	_
 j_ j_d_tjtjks=J i }i }ttjjD ]5\}\}}t|| | j jddd}	|	|| d| < tj|| | j }
|
|| d| < qJt|_t|_t fddt jD _t fddt jD _ttjtd	d
_t t! j" j#dt$ j"jdd_%t&t'(djf_)t&t'(djf_*t fddt j+D _,jj
 }|t- j.ks	J  j._. j/_/t0jj.j/d_1dg_2d S )N)r  r  FTr.   -c                    s:   g | ]}t d | jj j j jdd| d	qS )i  Tznoise_refiner.r   ro   rp   r   rC   r   r   r   rm   r   r   arch_configro   r!   r$   r%   r     s    z5ZImageTransformer2DModel.__init__.<locals>.<listcomp>c                    s6   g | ]}t |jj j j jd d| d	qS )Fzcontext_refiner.r  r  r  r  r$   r%   r   +  s    i   )r9   rr   r   rD   c                    s4   g | ]}t |jj j j jd | dqS )zlayers.)ro   rp   r  r  r  r$   r%   r   G  s    )r   r   r   layers)3r   r    config_datar  in_channelsr   all_patch_sizeall_f_patch_sizerC   num_attention_headsr   
rope_thetat_scalegradient_checkpointingr   r   r   r   r   r3   
ModuleDictall_x_embedderall_final_layerr4   r   n_refiner_layersnoise_refinercontext_refinerr-   r   r   
t_embedderr   r
   cap_feat_dimr   r   cap_embedder	ParameterrE   emptyx_pad_tokencap_pad_token
num_layersr  sumr   r   r   
rotary_emblayer_names)r!   r  r  ro   r  r  	patch_idx
patch_sizef_patch_size
x_embedderfinal_layerrw   r"   r  r%   r      s~   z!ZImageTransformer2DModel.__init__r&   sizec              
   C   s   | }}|}t |}t ||ksJ t|D ]?}	||	 \}
}}|
| ||  ||  }||	 d | |
| || || |||| jddddddd| j|
||||	< q|S )N   r   r   rD   r   r<      )r   r   r   r   permutereshape)r!   r&   r2  r.  r/  pHpWpFbszr   FHWori_lenr$   r$   r%   
unpatchify_  s    z#ZImageTransformer2DModel.unpatchifyc                    sJ   |d u rdd | D } fddt || D }tj|dd}tj|ddS )	Nc                 s   s    | ]}d V  qdS )r   Nr$   )r   r^   r$   r$   r%   	<genexpr>u  s    zBZImageTransformer2DModel.create_coordinate_grid.<locals>.<genexpr>c                    s(   g | ]\}}t j||| t j d qS )r   )rE   rL   int32)r   x0spanr   r$   r%   r   w  s    zCZImageTransformer2DModel.create_coordinate_grid.<locals>.<listcomp>ij)indexingrA   rB   )r   rE   meshgridstack)r2  r=   r@   axesgridsr$   r   r%   create_coordinate_gridr  s   
z/ZImageTransformer2DModel.create_coordinate_grid	all_imageall_cap_featsr.  r/  c              	   C   s`  t |t |  krdksJ  J |d }|d }| }}|}	|j}
g }g }g }|d}| t }tj||dd  |dgdd}|| | \}}}}||||f ||	 || || }}}||||	||||}|	dddddd	d
|| | |	| | | }|d}| t }tj||dd  |dgdd}|| |||fS )
NrD   r   rA   rB   r   r4  r<   r   r3  )r   r@   r2  SEQ_MULTI_OFrE   rO   repeatr   r   r5  r6  )r!   rK  rL  r.  r/  imagecap_featr7  r8  r9  r@   all_image_outall_image_sizeall_cap_feats_outcap_ori_lencap_padding_lencap_padded_featCr;  r<  r=  F_tokensH_tokensW_tokensimage_ori_lenimage_padding_lenimage_padded_featr$   r$   r%   patchify_and_embed~  sD   $





z+ZImageTransformer2DModel.patchify_and_embedr   r<   rD   r   encoder_hidden_statesc                 K   s  || j v sJ || jv sJ |}	|}
d| }|}d}|	d j}| |}||	}| |	|
||\}	}
}tj|	dd}	| j| d|  |	\}	}|d }|		d}	|}| j
D ]}||	||}	qWtj|
dd}
| |
\}
}|d }|
	d}
| jD ]}||
|}
qztj|	|
gdd}tj|d |d gddtj|d |d gddf}| jD ]}||||}q| j| d|  ||}t|jdd}| ||||}	|	d  S )Ng     @@rD   r   rB   r  )r  r  r@   r"  type_asr^  rE   rO   r  r   r   r$  r!  r  r  listunbindr?  )r!   r   r_  r   guidancer.  r/  r   kwargsr&   	cap_featsrS   r:  r@   r   x_sizer^   x_freqs_cislayercap_freqs_cisunifiedunified_freqs_cisr$   r$   r%   r'     sP   








z ZImageTransformer2DModel.forwardr   )NN)r   r<   rD   N)r)   r*   r+    _supports_gradient_checkpointing_no_split_modulesr   r  _fsdp_shard_conditionsparam_names_mappingreverse_param_names_mappingclassmethoddictr   ra  r
  r   r   r   r    r   rE   r   r   r?  r_   rJ  rg   r^  r'   r,   r$   r$   r"   r%   r     s^    



r

=r   )<rJ   typingr   r   r   r   rE   torch.nnr3   0sglang.multimodal_gen.configs.models.dits.zimager   )sglang.multimodal_gen.runtime.distributedr   /sglang.multimodal_gen.runtime.layers.activationr   .sglang.multimodal_gen.runtime.layers.attentionr	   .sglang.multimodal_gen.runtime.layers.layernormr
   r   +sglang.multimodal_gen.runtime.layers.linearr   r   r   r   Esglang.multimodal_gen.runtime.layers.quantization.configs.base_configr   Isglang.multimodal_gen.runtime.layers.quantization.configs.nunchaku_configr   r   5sglang.multimodal_gen.runtime.layers.rotary_embeddingr   r   .sglang.multimodal_gen.runtime.models.dits.baser   'sglang.multimodal_gen.runtime.platformsr   5sglang.multimodal_gen.runtime.utils.layerwise_offloadr   1sglang.multimodal_gen.runtime.utils.logging_utilsr   nunchaku.models.attentionr   	Exceptionr)   loggeris_cudar   r   rM  Moduler   r-   r`   rh   r   r   r   r   
EntryClassr$   r$   r$   r%   <module>   sL    / rD  ,