o
    -i=                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 G dd de
Z8de9de(fddZ:G dd  d ej;Z<G d!d" d"ej;Z=G d#d$ d$ej;Z>G d%d& d&ejj;Z?eG d'd( d(ej;Z@G d)d* d*ej;e,e-ZAdS )+zInference-only PLaMo3 model.    )Iterable)islice)AnyN)nn)PretrainedConfig)	Attention)support_torch_compile)
VllmConfig)$get_tensor_model_parallel_world_size)get_pp_group)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)DEFAULT_VOCAB_PADDING_SIZEParallelLMHeadVocabParallelEmbedding)LoaderFunctioncomposed_weight_loaderdefault_weight_loader)SupportsLoRA
SupportsPP)AutoWeightsLoaderextract_layer_index'make_empty_intermediate_tensors_factorymake_layersmaybe_prefix)set_weight_attrs)IntermediateTensorsc                   @   s   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< eed	B  ed
< eed< e	ee
f ed< eed< eed< eed< d	S )Plamo3Configplamo3
model_typehidden_sizenum_hidden_layersrms_norm_epsnum_attention_headshead_dimnum_key_value_headsNinterleaved_sliding_windowsliding_window_patternrope_parametersrope_local_thetaintermediate_size
vocab_size)__name__
__module____qualname__r%   str__annotations__intfloatlistdictr    r;   r;   ^/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/plamo3.pyr#   3   s   
 r#   offsetreturnc                    s   t t fddS )Nc                    s   |   S Nr;   )xr=   r;   r<   <lambda>L   s    z(rms_norm_weight_loader.<locals>.<lambda>)r   r   rA   r;   rA   r<   rms_norm_weight_loaderI   s   
rC   c                	       sN   e Zd Z		ddededB deddf fddZd	ejdejfd
dZ	  Z
S )DenseMLPN configquant_configprefixr>   c                    sl   t    |j| _|j| _t| j| jgd d| d|dd| _t | _t| j| jd| d|dd| _	d S )N   Fz.gate_up_proj)biasrH   rG   return_biasz
.down_proj)
super__init__r&   r0   r   gate_up_projr   actr   	down_proj)selfrF   rG   rH   	__class__r;   r<   rM   Q   s(   

zDenseMLP.__init__hidden_statesc                 C   s   |  |}| |}| |S r?   )rN   rO   rP   )rQ   rT   hr;   r;   r<   forwardl   s   


zDenseMLP.forward)NrE   )r2   r3   r4   r#   r   r5   rM   torchTensorrV   __classcell__r;   r;   rR   r<   rD   P   s    rD   c                
       sZ   e Zd Zdddededdf fddZd	ejd
ejdejdB dedejf
ddZ	  Z
S )Plamo3AttentionMixerrE   rH   vllm_configrH   r>   Nc             	      s  t    |jj}|j}|j| _t }|j| _| j| dks J | j| | _	|j
| _| j|kr9| j| dks8J n	|| j dksBJ td| j| | _|j| _| j	| j | _| j| j | _| jd | _t|j| j| j| jd|| dd| _t| j| j |jd|| dd| _t|}|j| }|dk}	||jv r|j| }
n|j}
|	rtd	|jd
}
|j}t|jdrt|jjtrt ||jj}t!| j||
d| _"t#| j|j$d| _%t&| j%j'dt(ddi t#| j|j$d| _)t&| j)j'dt(ddi t*| j	| j| j| j|j+|j,| | dd| _-d S )Nr      g      Fz	.qkv_proj)rJ   rG   rH   z.o_projsliding_attentiondefault)	rope_type
rope_thetamax_model_len)max_positionr.   epsweight_loader      ?rA   z.attn)num_kv_headscache_configper_layer_sliding_windowrH   ).rL   rM   model_config	hf_configrG   r&   r
   r)   total_num_heads	num_headsr+   total_num_kv_headsmaxrh   r*   q_sizekv_sizescalingr   qkv_projr   o_projr   layer_typesr.   r:   r/   max_position_embeddingshasattr
isinstancerb   r7   minr   
rotary_embr   r(   q_normr!   weightrC   k_normr   ri   r,   attn)rQ   r\   rH   kwargsrF   rG   tp_size	layer_idx
layer_type
is_slidingr.   rc   rR   r;   r<   rM   s   s   

	


zPlamo3AttentionMixer.__init__	positionsrT   residualr   c                 K   s   |  |\}}|j| j| j| jgdd\}}}	|j}
||
d d |
d | j | jf }| j||
}|j}||d d |d | j | jf }| j	||}| 
|||\}}| |||	}| |\}}|S )N)dim)rt   splitrq   rr   shapereshaper*   r|   forward_nativer~   r{   r   ru   )rQ   r   rT   r   r   qkv_qkvq_shapek_shapeattn_outputoutputr;   r;   r<   rV      s    &&zPlamo3AttentionMixer.forward)r2   r3   r4   r	   r5   rM   rW   rX   r   rV   rY   r;   r;   rR   r<   rZ   r   s     XrZ   c                       sj   e Zd Z	ddedededdf fddZd	ejd
ejdejdB dede	ejejdB f f
ddZ
  ZS )Plamo3DecoderLayerrE   r\   rH   r   r>   Nc                    s   t    |jj}|j}t|| dd| _t||| dd| _t	|j
|jd| _t| jjdtddi t	|j
|jd| _t| jjdtd	di t	|j
|jd| _t| jjdtddi t	|j
|jd| _t| jjdtd
di d S )Nz.mixerr\   rH   z.mlp)rF   rG   rH   rd   rf   rg   rA   g?gWfѷ?)rL   rM   rk   rl   rG   rZ   mixerrD   mlpr   r&   r(   pre_mixer_normr!   r}   rC   post_mixer_normpre_mlp_normpost_mlp_norm)rQ   r\   rH   r   rF   rG   rR   r;   r<   rM      s<   
zPlamo3DecoderLayer.__init__r   rT   r   c                 K   sn   |d u r|}|  |}n|  ||\}}| j|||d}| |}| ||\}}| |}| |}||fS Nr   rT   r   )r   r   r   r   r   r   )rQ   r   rT   r   r   r;   r;   r<   rV     s   


zPlamo3DecoderLayer.forwardrE   )r2   r3   r4   r	   r5   r   rM   rW   rX   tuplerV   rY   r;   r;   rR   r<   r      s,    $r   c                       s`   e Zd Zddededdf fddZdejd	ejd
ejdB deejejdB f fddZ	  Z
S )Plamo3DecoderrE   r\   rH   r>   Nc                    s@   t     jjj}t| fdd| dd\| _| _| _d S )Nc                    s   t  | dS )Nr[   )r   r[   r\   r;   r<   rB   &  s    z(Plamo3Decoder.__init__.<locals>.<lambda>.layersr[   )	rL   rM   rk   rl   r'   r   start_layer	end_layerlayers)rQ   r\   rH   r'   rR   r   r<   rM      s   


zPlamo3Decoder.__init__r   rT   r   c                 C   s2   t | j| j| jD ]}||||d\}}q	||fS r   )r   r   r   r   )rQ   r   rT   r   layerr;   r;   r<   rV   *  s   zPlamo3Decoder.forwardr   )r2   r3   r4   r	   r5   rM   rW   rX   r   rV   rY   r;   r;   rR   r<   r     s    
r   c                       sv   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdejde	dB dejdB dejf
ddZ
  ZS )Plamo3ModelrE   r[   r\   rH   c                   s   t    |jj}|| _|j| _|j| _|j| _t	| j|j
|j| dd| _tddg|j
| _t|| dd| _t|j
|jd| _t| jjdtd	d
i d S )Nz.embed_tokens)org_num_embeddingsrH   rT   r   r   r[   rd   rf   rg   rA   )rL   rM   rk   rl   rF   pad_token_idpadding_idxr1   org_vocab_sizer   r&   embed_tokensr   make_empty_intermediate_tensorsr   r   r   r(   normr!   r}   rC   )rQ   r\   rH   rF   rR   r;   r<   rM   ;  s*   

zPlamo3Model.__init__	input_idsr>   c                 C   s
   |  |S r?   )r   rQ   r   r;   r;   r<   embed_input_idsT  s   
zPlamo3Model.embed_input_idsNr   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }| j|||d\}}t  js6t||dS | ||\}}|S )NrT   r   r   )rT   r   )r   is_first_rankr   r   is_last_rankr"   r   )rQ   r   r   r   r   rT   r   r   r;   r;   r<   rV   W  s"   

zPlamo3Model.forwardNN)r2   r3   r4   r	   r5   rM   rW   rX   r   r"   rV   rY   r;   r;   rR   r<   r   9  s     r   c                       s   e Zd ZdgdgdZdddededd	f fd
dZdejdejfddZ					ddejdejde
d	B dejd	B dejf
ddZdejdejd	B fddZdeeeejf  fddZ  ZS )Plamo3ForCausalLMrt   rN   )rt   rN   rE   r[   r\   rH   r>   Nc                   s   t    |jj| _|| _|j| _|j| _t|t|dd| _	| jj
| _
| jj
| _| j
d d d }t|| jj| jj
t| dd| _| jjrP| j| j	j| _t| j| jj
| _| j	j| _d S )Nmodelr         z.lm_head)r   padding_sizerH   )rL   rM   rk   rl   rF   r\   scheduler_configr   r    r   r1   unpadded_vocab_sizer   r&   r   lm_headtie_word_embeddingstie_weightsr   r   logits_processorr   )rQ   r\   rH   num_embeddingsrR   r;   r<   rM   z  s2   





zPlamo3ForCausalLM.__init__r   c                 C   s   | j |S r?   )r   r   r   r;   r;   r<   r     s   z!Plamo3ForCausalLM.embed_input_idsr   r   r   c                 C   s   |  ||||}|S r?   )r   )rQ   r   r   r   r   rT   r;   r;   r<   rV     s   zPlamo3ForCausalLM.forwardrT   c                 C   s   |  | j|}|S r?   )r   r   )rQ   rT   logitsr;   r;   r<   compute_logits  s   z Plamo3ForCausalLM.compute_logitsweightsc                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   rF   r   load_weights)rQ   r   loaderr;   r;   r<   r     s
   
zPlamo3ForCausalLM.load_weightsr   )r2   r3   r4   packed_modules_mappingr	   r5   rM   rW   rX   r   r"   rV   r   r   r   r   rY   r;   r;   rR   r<   r   t  s2      

$r   )B__doc__collections.abcr   	itertoolsr   typingr   rW   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   vllm.distributedr
   vllm.distributed.parallel_stater   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   r   -vllm.model_executor.model_loader.weight_utilsr   r   r   %vllm.model_executor.models.interfacesr   r    vllm.model_executor.models.utilsr   r   r   r   r    vllm.model_executor.utilsr!   vllm.sequencer"   r#   r8   rC   ModulerD   rZ   r   r   r   r   r;   r;   r;   r<   <module>   sB   "p=: