o
    پi                     @   s  d dl mZmZmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 e0e1Z2zd dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z: d dl;m<Z= dZ>W n e?y   dZ4dZ6dZ7dZ:dZ=dZ>Y nw 	d,dej@deAdej@fddZB	d-	 d.d!d"ZCG d#d  d ej	jDeZEG d$d% d%e	jDZFG d&d' d'e	jDZGG d(d) d)e	jDZHG d*d+ d+e*e.ZIeIZJdS )/    )AnyDictListOptionalTupleUnionN)AttentionModuleMixinFeedForward)Transformer2DModelOutput)AdaLayerNormContinuousAdaLayerNormZeroAdaLayerNormZeroSingle)	LayerNorm)
FluxConfig)USPAttention)RMSNormapply_qk_norm)ColumnParallelLinearMergedColumnParallelLinear)QuantizationConfig)NunchakuConfigis_nunchaku_available)NDRotaryEmbedding apply_flashinfer_rope_qk_inplace)*CombinedTimestepGuidanceTextProjEmbeddings"CombinedTimestepTextProjEmbeddings)CachableDiT)current_platform)OffloadableDiTMixin)init_logger)NunchakuFeedForward)NunchakuAdaLayerNormZeroNunchakuAdaLayerNormZeroSingle)svdq_gemm_w4a4_cuda)%svdq_quantize_w4a4_act_fuse_lora_cudaTF   xpad_sizereturnc                 C   s|  | j \}}}| || |}t||j|j|jdk|d\}}	}
|| | d | | }|jdk}tj||jd tj	|j
d}|rOtj|jd |tj|j
d}ntj|jd ||j|j
d}tj||jj d tj|j
d}t||j||	|j||
|j|j||j|j|t|dd	t|d
d	d tj|| |j|j|j
d}t||j|||j||j|j|t|dd	t|d
d	dd |||dS )u  
    Fused GELU MLP matching nunchaku's fused_gelu_mlp kernel path.

    nunchaku's single-block MLP checkpoint is calibrated for the fused path where:
      1. fc1 GEMM + GELU + 0.171875 shift + unsigned re-quantization + fc2.lora_down
         are all done in a single fused kernel call
      2. fc2 GEMM then receives unsigned INT4 activations (act_unsigned=True)

    Using the sequential path (fc1 → GELU → fc2 with symmetric quantization) is
    fundamentally incompatible with these wscales, causing visually wrong outputs.
    nvfp4)	lora_downsmoothfp4r'         )dtypedevice   @   _nunchaku_alphaNwcscales)actwgtqoutascaleswscalesoscaleslora_act_inlora_upr*   lora_act_outbiassmooth_factorr,   alphar4   T)r5   r6   outr8   r9   r;   r<   r>   r,   r@   r4   act_unsigned)shapeview_svdq_quantize_w4a4	proj_downr?   	precisiontorchemptyoutput_size_per_partitionuint8r0   float8_e4m3fnr/   float32_svdq_gemm_w4a4qweightr9   proj_upr>   getattr)r&   fc1fc2r'   
batch_sizeseq_lenchannelsx_2dquantized_xr8   lora_actbatch_size_padis_fp4qout_actqout_ascalesqout_lora_actoutput ra   b/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/dits/flux.py_fused_gelu_mlpT   s   




rc   attnFluxAttentionc                 C   s   t | ddr| |\}}dd |jdddD \}}}n| |\}}| |\}}| |\}}d  } }	}
|d urs| jd urst | ddr^| |\}}d	d |jdddD \}}	}
n| |\}}| 	|\}	}| 
|\}
}|||||	|
fS )
Nuse_fused_qkvFc                 S      g | ]}|  qS ra   
contiguous.0r&   ra   ra   rb   
<listcomp>   s    z(_get_qkv_projections.<locals>.<listcomp>   rC   dimuse_fused_added_qkvc                 S   rg   ra   rh   rj   ra   ra   rb   rl      s    )rR   to_qkvchunkto_qto_kto_vadded_kv_proj_dimto_added_qkv
add_q_proj
add_k_proj
add_v_proj)rd   hidden_statesencoder_hidden_statesqkv_querykeyvalueencoder_queryencoder_keyencoder_value	added_qkvra   ra   rb   _get_qkv_projections   s"    r   c                       s   e Zd Z													dd	ed
ededededee dee dedededee dedee def fddZ			dde
jdee
j de
jee
je
jf B fddZ  ZS ) re      r2           FNTh㈵> 	query_dim	num_headsdim_headdropoutr>   rv   added_proj_biasout_biasepsout_dimcontext_pre_onlypre_onlyquant_configprefixc              
      sR  t    || _|
d ur|
n|| | _|| _|| _|| _|
d ur"|
n|| _|| _|| _	|
d ur3|
| n|| _
|| _|| _t|t| _t|t| _t||	d| _t||	d| _| jrpt|| jgd |d||rj| dndd| _nt|| j|dd| _t|| j|dd| _t|| j|dd| _| j	stjg | _| jt| j| j|d||r| dnd	d |d
kr| jtj| |d urt||	d| _ t||	d| _!| jrt|| jgd |d||r| dndd| _"nt|| j|dd| _#t|| j|dd| _$t|| j|dd| _%t| j||d||r| dnd	d| _&t'|| jdd dd| _(d S )N)r   rm   Tz.to_qkvrq   r>   gather_outputr   r   r>   r   z	.to_out.0r   r   z.to_added_qkvrw   z.to_add_outr   F)r   	head_sizedropout_ratesoftmax_scalecausal))super__init__head_dim	inner_dimr   use_biasr   r   r   r   headsrv   r   
isinstancer   rf   rp   r   norm_qnorm_kr   rq   r   rs   rt   ru   rI   nn
ModuleListto_outappendDropoutnorm_added_qnorm_added_krw   rx   ry   rz   
to_add_outr   rd   )selfr   r   r   r   r>   rv   r   r   r   r   r   r   r   r   	__class__ra   rb   r      s   


	






		zFluxAttention.__init__r&   r|   r(   c                 C   sB  t | ||\}}}}}}	|d| jdf}|d| jdf}|d| jdf}t||| j| j| jdd\}}| jd ur|d| jdf}|d| jdf}|	d| jdf}	t||| j| j	| jdd\}}|j
\}
}}}tj||gdd}tj||gdd}tj|	|gdd}|d ur|\}}tj|jtjd |jtjd gdd}t|||dd\}}| |||}|d	d
}||j}|d ur|j|j
d |j
d |j
d  gdd\}}| js| jd |\}}t| jd	kr| jd |}| |\}}||fS | js| jd |\}}t| jd	kr| jd |}|S )NrC   T)qkq_normk_normr   allow_inplacer-   rn   )r/   F)is_neoxr.   rm   r   )r   	unflattenr   r   r   r   r   rv   r   r   rD   rI   cattorN   ri   r   rd   flattenr/   split_with_sizesr   r   lenr   )r   r&   r|   	freqs_cisr   r   r   r   r   r   bszrV   r~   cossincos_sin_cachera   ra   rb   forwardI  sz   


	
	


zFluxAttention.forward)r   r2   r   FNTTr   NNFNr   NN)__name__
__module____qualname__intfloatboolr   r   strr   rI   Tensortupler   __classcell__ra   ra   r   rb   re      sh    	
zc                       s   e Zd Z			ddededededee d	ef fd
dZ		dde	j
de	j
de	j
deee	j
e	j
f  deeeef  dee	j
e	j
f fddZ  ZS )FluxSingleTransformerBlock      @Nr   ro   num_attention_headsattention_head_dim	mlp_ratior   r   c                    s.  t    t|| | _t|t| _t|| _| jrlt	|| jdd||r(| dndd| _
tjdd| _t	| j|dd||rB| dndd| _t||||dd	d
||rW| dndd	| _td urjt| jdd| _d S d S t	|| jddd| _tjdd| _t	|| j |ddd| _t||||dd	dd| _d S )NTz.mlp_fc1mlp_fc1r   tanh)approximatez.mlp_fc2mlp_fc2ư>F.attnrd   )	r   r   r   r   r>   r   r   r   r   r   scale_shiftr   )r   r   r   r   r>   r   r   )r   r   r   mlp_hidden_dimr   r   use_nunchaku_structurer   normr   r   r   GELUact_mlpr   re   rd   r"   proj_mlpproj_out)r   ro   r   r   r   r   r   r   ra   rb   r     st   
	
	z#FluxSingleTransformerBlock.__init__r{   r|   tembr   joint_attention_kwargsr(   c                 C   sz  |j d }tj||gdd}|}| j||d\}}	|pi }| jrdtr+t|| j| j}
n| |\}}| 	|}
| |
\}
}| j
d	||d|}t|trR|d }||
 }|	d}	|	| }|| }n4| |\}}| 	|}
| j
d	||d|}tj||
gdd}|	d}	| |\}}|	| }|| }|jtjkr|dd}|d d d |f |d d |d f }}||fS )
Nr-   rn   emb)r&   r   r   r.       ra   )rD   rI   r   r   r   _nunchaku_fused_ops_availablerc   r   r   r   rd   r   r   	unsqueezer   r   r/   float16clip)r   r{   r|   r   r   r   text_seq_lenresidualnorm_hidden_statesgatemlp_hidden_statesmlp_outr~   attn_outputproj_hidden_statesr   ra   ra   rb   r     sX   







z"FluxSingleTransformerBlock.forward)r   Nr   r   )r   r   r   r   r   r   r   r   r   rI   r   r   r   r   r   r   ra   ra   r   rb   r     s@    Lr   c                       s   e Zd Z				ddedededed	ed
ee def fddZ		dde	j
de	j
de	j
deee	j
e	j
f  deeeef  dee	j
e	j
f fddZ  ZS )FluxTransformerBlockrms_normr   Nr   ro   r   r   qk_normr   r   r   c           
         s.  t    t|| _t|| _t|||||dd|||r | dndd
| _t|ddd| _t|ddd| _	|d uoJt
|doJ| d	koJt oJtd u}|| _t||d
d| _t||d
d| _|r|j|j|jd}	t| jfi |	| _t| jfi |	| _td urt| jdd| _t| jdd| _d S d S d S )NFTr   rd   )
r   rv   r   r   r   r   r>   r   r   r   r   )r   elementwise_affineget_namesvdquantzgelu-approximate)ro   dim_outactivation_fn)rH   rankrB   r   r   )r   r   r   norm1norm1_contextre   rd   r   norm2norm2_contexthasattrr   r   r    r   r	   ff
ff_contextrH   r   rB   r!   )
r   ro   r   r   r   r   r   r   nunchaku_enablednunchaku_kwargsr   ra   rb   r   "  sZ   




zFluxTransformerBlock.__init__r{   r|   r   r   r   r(   c                 C   s  | j ||d\}}}}	}
| j||d\}}}}}|pi }| jd|||d|}t|dkr3|\}}nt|dkr>|\}}}|d| }|| }| |}| jrd||	d d d f  |d d d f  }n|d|	d d d f   |d d d f  }| |}|
d| }|| }t|dkr|| }|d| }|| }| |}| jr||d d d f  |d d d f  }n|d|d d d f   |d d d f  }| 	|}||d|  }|j
tjkr|dd}||fS )	Nr   )r&   r|   r   r.   rm   r-   r   r   ra   )r   r   rd   r   r   r   r   r  r  r  r/   rI   r   r   )r   r{   r|   r   r   r   r   gate_msa	shift_mlp	scale_mlpgate_mlpnorm_encoder_hidden_states
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattention_outputsr   context_attn_outputip_attn_output	ff_outputcontext_ff_outputra   ra   rb   r   [  s\   


"&

"
zFluxTransformerBlock.forward)r   r   Nr   r   )r   r   r   r   r   r   r   r   r   rI   r   r   r   r   r   r   ra   ra   r   rb   r   !  sF    >r   c                       sJ   e Zd Zdedee f fddZdejdeejejf fddZ	  Z
S )	FluxPosEmbedthetaaxes_dimc                    s:   t    t||ddt st rtjntjd| _	d S )NF)rope_dim_list
rope_thetause_realrepeat_interleave_realr/   )
r   r   r   r   is_mpsis_musarI   rN   float64rope)r   r  r  r   ra   rb   r     s   
zFluxPosEmbed.__init__idsr(   c                 C   s2   |  }| jj|d\}}|   |   fS )N)pos)r   r  forward_uncachedri   )r   r   r!  	freqs_cos	freqs_sinra   ra   rb   r     s   zFluxPosEmbed.forward)r   r   r   r   r   r   rI   r   r   r   r   ra   ra   r   rb   r    s    (r  c                       s   e Zd ZdZe jjZedee	e
e	 f fddZ	ddedee	ef dee ddf fd	d
Z						ddejdejdejdejdejdejdeee	ef  deejef fddZ  ZS )FluxTransformer2DModelz|
    The Transformer model introduced in Flux.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
    r(   c                 C   s   g dg dddgdS )N)r   embedrotary	pos_embed)zattn.to_qkvzattn.to_outzattn.add_qkv_projzattn.to_added_qkvzattn.to_add_outimg_mlptxt_mlpzattention.to_qkvzattention.to_outr   r   r   r   zff.netzff_context.netimg_modtxt_mod)skip	svdq_w4a4	awq_w4a16ra   )clsra   ra   rb   get_nunchaku_quant_rules  s   z/FluxTransformer2DModel.get_nunchaku_quant_rulesNconfig	hf_configr   c                    s:  t  j||d |j_tjdd pjj_jjjj _	t
djjd_jjr0tnt}|j	jjd_tjjj	ddd_tjjj	ddd_t fdd	tjjD _t fd
d	tjjD _tj	j	ddd_tj	jjjj j ddd_ ddg_!d S )N)r2  r3  out_channelsi'  )r  r  )embedding_dimpooled_projection_dimTr   c              	      .   g | ]}t jjjjj d | dqS )ztransformer_blocks.ro   r   r   r   r   )r   r   r2  r   r   rk   ir   r   ra   rb   rl         z3FluxTransformer2DModel.__init__.<locals>.<listcomp>c              	      r7  )zsingle_transformer_blocks.r8  )r   r   r2  r   r   r9  r;  ra   rb   rl     r<  Fr   )r   r   transformer_blockssingle_transformer_blocks)"r   r   arch_configr2  rR   in_channelsr4  r   r   r   r  axes_dims_rope
rotary_embguidance_embedsr   r   r6  time_text_embedr   joint_attention_dimcontext_embedder
x_embedderr   r   range
num_layersr=  num_single_layersr>  r   norm_out
patch_sizer   layer_names)r   r2  r3  r   text_time_guidance_clsr   r;  rb   r     s\   


zFluxTransformer2DModel.__init__r{   r|   pooled_projectionstimestepguidancer   r   c                 C   s   |dur| dddurtd | |\}}| jjr(|dur(| |||}	n| ||}	| |\}}|durNd|v rN|d}
| 	|
}|
d|i | jD ]}||||	||d\}}qQ| jD ]}||||	||d\}}qb| ||	}| |\}}|S )a}  
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            guidance (`torch.Tensor`):
                Guidance embeddings.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

        Nscalez\Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective.ip_adapter_image_embedsip_hidden_states)r{   r|   r   r   r   )getloggerwarningrG  r2  rC  rD  rF  popencoder_hid_projupdater=  r>  rK  r   )r   r{   r|   rO  rP  rQ  r   r   r~   r   rS  rT  blockr`   ra   ra   rb   r   5  sH   


zFluxTransformer2DModel.forwardN)NNNNNN)r   r   r   __doc__r   r?  param_names_mappingclassmethoddictr   listr1  r   r   r   r   rI   r   
LongTensorr   r   r
   r   r   ra   ra   r   rb   r%    sL    
#
Q	r%  )r%   r\  )rd   re   )Ktypingr   r   r   r   r   r   rI   torch.nnr   diffusers.models.attentionr   r	   !diffusers.models.modeling_outputsr
   diffusers.models.normalizationr   r   r   r   .sglang.multimodal_gen.configs.models.dits.fluxr   .sglang.multimodal_gen.runtime.layers.attentionr   .sglang.multimodal_gen.runtime.layers.layernormr   r   +sglang.multimodal_gen.runtime.layers.linearr   r   Esglang.multimodal_gen.runtime.layers.quantization.configs.base_configr   Isglang.multimodal_gen.runtime.layers.quantization.configs.nunchaku_configr   r   5sglang.multimodal_gen.runtime.layers.rotary_embeddingr   r   5sglang.multimodal_gen.runtime.layers.visual_embeddingr   r   .sglang.multimodal_gen.runtime.models.dits.baser   'sglang.multimodal_gen.runtime.platformsr   5sglang.multimodal_gen.runtime.utils.layerwise_offloadr   1sglang.multimodal_gen.runtime.utils.logging_utilsr   r   rV  nunchaku.models.attentionr    nunchaku.models.normalizationr!   r"   nunchaku.ops.gemmr#   rO   nunchaku.ops.quantizer$   rF   r   	Exceptionr   r   rc   r   Modulere   r   r   r  r%  
EntryClassra   ra   ra   rb   <module>   sp    
d
 J   I