"""PyTorch Mllama model."""

# Module structure and signatures reconstructed from the compiled dump; method bodies elided.

import math
from typing import Iterable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import transformers.models.mllama.configuration_mllama as config_mllama
from torch import nn
from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
from transformers.models.mllama.modeling_mllama import (
    _prepare_aspect_ratio_attention_mask,
)

import sglang.srt.distributed.parallel_state as ps
from sglang.srt.distributed import get_tensor_model_parallel_world_size
from sglang.srt.layers.activation import get_act_fn
from sglang.srt.layers.attention.vision import VisionAttention
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.quantization import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE,
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.managers.schedule_batch import MultimodalInputs
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.llama import LlamaDecoderLayer, LlamaMLP
from sglang.srt.utils import add_prefix


class ColumnParallelConv2dPatch(torch.nn.Module):
    """Conv2D Patching layer with model parallelism.

    Column parallel over unfolded input.
    Arguments:
        in_channels: Input channels.
        out_channels: Output channels.
        kernel_size: Size of convolution kernel.
        stride (default 1): Stride for convolution.
        bias (default False): Use bias in Conv2d.
    Input: (bsz, in_channels, width, height)
    Output: (bsz, num_tokens, out_channels)
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        stride: Union[int, Tuple[int, int]],
        bias: bool = False,
    ) -> None:
        ...

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        ...


class MllamaPrecomputedAspectRatioEmbedding(nn.Module):

    def __init__(
        self, config: config_mllama.MllamaVisionConfig, is_gated: bool = True
    ):
        ...

    def forward(
        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
    ) -> torch.Tensor:
        ...


class MllamaPrecomputedPositionEmbedding(nn.Module):

    def __init__(self, config: config_mllama.MllamaVisionConfig):
        ...

    def forward(
        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
    ) -> torch.Tensor:
        ...


class MllamaVisionMLP(nn.Module):

    def __init__(
        self,
        config,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        ...

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        ...


class MllamaVisionEncoderLayer(nn.Module):

    def __init__(
        self,
        config: config_mllama.MllamaVisionConfig,
        quant_config: Optional[QuantizationConfig] = None,
        is_gated: bool = False,
        prefix: str = "",
    ):
        ...

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        ...


class MllamaVisionEncoder(nn.Module):

    def __init__(
        self,
        config: config_mllama.MllamaVisionConfig,
        quant_config: Optional[QuantizationConfig] = None,
        num_layers=32,
        is_gated=False,
        output_hidden_states=None,
        prefix: str = "",
    ):
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        ...


class MllamaVisionModel(nn.Module):

    def __init__(
        self,
        config: config_mllama.MllamaVisionConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        ...

    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
        ...

    def forward(
        self,
        pixel_values: torch.Tensor,
        aspect_ratio_ids: torch.Tensor,
        aspect_ratio_mask: torch.Tensor,
    ) -> torch.Tensor:
        ...


class MllamaTextCrossAttention(nn.Module):

    def __init__(
        self,
        config: Optional[config_mllama.MllamaTextConfig] = None,
        layer_id: Optional[int] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        cross_attention_states: Optional[torch.Tensor],
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        ...


class MllamaCrossAttentionDecoderLayer(torch.nn.Module):
    """Cross-attention transformer block with tanh-gated attention
    and feedforward."""

    def __init__(
        self,
        config: config_mllama.MllamaTextConfig,
        layer_id: int,
        quant_config: Optional[QuantizationConfig],
        prefix: str = "",
    ) -> None:
        ...

    def forward(
        self,
        hidden_states: torch.Tensor,
        cross_attention_states: torch.Tensor,
        cross_attention_mask: torch.Tensor,
        full_text_row_masked_out_mask: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        ...


class MllamaTextModel(nn.Module):
    config_class = config_mllama.MllamaTextConfig
    base_model_prefix = "model"

    def __init__(
        self,
        config: config_mllama.MllamaTextConfig,
        quant_config: Optional[QuantizationConfig],
        prefix: str = "",
    ):
        ...

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: Optional[torch.LongTensor],
        cross_attention_states: Optional[torch.Tensor],
        cross_attention_mask: Optional[torch.Tensor],
        full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]],
        forward_batch: ForwardBatch,
        skip_cross_attention: bool,
    ) -> torch.Tensor:
        ...


class MllamaForCausalLM(nn.Module):
    config_class = config_mllama.MllamaTextConfig
    base_model_prefix = "language_model"
    _no_split_modules = [
        "MllamaCrossAttentionDecoderLayer",
        "MllamaSelfAttentionDecoderLayer",
    ]

    def __init__(
        self,
        config: config_mllama.MllamaTextConfig,
        quant_config: Optional[QuantizationConfig],
        prefix: str = "",
    ):
        ...

    def forward(
        self,
        input_ids: torch.LongTensor,
        positions: Optional[torch.LongTensor],
        cross_attention_states: Optional[torch.Tensor],
        cross_attention_mask: Optional[torch.Tensor],
        full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]],
        forward_batch: ForwardBatch,
        skip_cross_attention: bool,
    ) -> torch.Tensor:
        ...


class MllamaForConditionalGeneration(nn.Module):
    default_bitsandbytes_target_modules = [
        ".gate_proj.",
        ".down_proj.",
        ".up_proj.",
        ".q_proj.",
        ".k_proj.",
        ".v_proj.",
        ".o_proj.",
    ]
    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
    bitsandbytes_stacked_params_mapping = {
        # shard name -> (packed param name, shard id)
        "q_proj": ("qkv_proj", 0),
        "k_proj": ("qkv_proj", 1),
        "v_proj": ("qkv_proj", 2),
        "gate_proj": ("gate_up_proj", 0),
        "up_proj": ("gate_up_proj", 1),
    }

    def __init__(
        self,
        config: config_mllama.MllamaConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        ...

    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
        ...

    def _batch_image_inputs(self, forward_batch: ForwardBatch):
        ...

    def flat_encoder_result(
        self, cross_attention_states: torch.Tensor, encoder_lens_need: List[int]
    ):
        ...

    def get_full_text_row_masked_out_mask(self, forward_batch: ForwardBatch):
        ...

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        ...

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        ...


EntryClass = MllamaForConditionalGeneration
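

# Minimal standalone sketch (assumes plain single-device torch, no tensor
# parallelism): ColumnParallelConv2dPatch patchifies an image with nn.Unfold
# followed by a linear projection over each flattened patch. That is equivalent
# to a Conv2d whose stride equals its kernel size (the usual ViT-style patchify
# convolution). The sizes below are arbitrary demo values, not Mllama's.
if __name__ == "__main__":
    bsz, in_ch, out_ch, patch = 2, 3, 8, 4
    x = torch.randn(bsz, in_ch, 16, 16)

    conv = nn.Conv2d(in_ch, out_ch, kernel_size=patch, stride=patch, bias=False)
    unfold = nn.Unfold(kernel_size=(patch, patch), stride=(patch, patch))
    linear = nn.Linear(in_ch * patch * patch, out_ch, bias=False)
    with torch.no_grad():
        # Reuse the conv weights so both paths apply the same projection.
        linear.weight.copy_(conv.weight.reshape(out_ch, -1))

    ref = conv(x).flatten(2).transpose(1, 2)  # (bsz, num_patches, out_ch)
    alt = linear(unfold(x).permute(0, 2, 1))  # (bsz, num_patches, out_ch)
    assert torch.allclose(ref, alt, atol=1e-5)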