o
    -i5                     @   sl  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- G dd dej.Z/G dd dej.Z0G dd dej.Z1eG dd dej.Z2G dd dej.e'Z3dS ) zCInference-only persimmon model compatible with HuggingFace weights.    )Iterable)isliceN)nn)PersimmonConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
get_act_fn)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sD   e Zd Z		ddededB def fddZdejfd	d
Z	  Z
S )PersimmonMLPN configquant_configprefixc                    sR   t    t|j|j|| dd| _t|j|j|| dd| _t|j	| _
d S )Nz.dense_h_to_4hr!   r"   z.dense_4h_to_h)super__init__r   hidden_sizeintermediate_sizedense_h_to_4hr   dense_4h_to_hr   
hidden_actact)selfr    r!   r"   	__class__ a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/persimmon.pyr%   @   s   
zPersimmonMLP.__init__returnc                 C   s*   |  |\}}| |}| |\}}|S N)r(   r+   r)   )r,   hidden_states_r/   r/   r0   forwardU   s   
zPersimmonMLP.forward)Nr   )__name__
__module____qualname__r   r   strr%   torchTensorr5   __classcell__r/   r/   r-   r0   r   ?   s    r   c                	       s   e Zd Z			ddededB dedB def fddZd	ej	d
ej	fddZ
d	ej	d
ej	fddZdej	dej	d
ej	fddZ  ZS )PersimmonAttentionNr   r    cache_configr!   r"   c                    s.  t    || _t }|j| _|j| _| j| | _| j| j | _|j	| _	d| _
| j| j | jks2J | j| dks;J t| j| j| jd|| dd| _t| j| j | jd|| dd| _|j| _| jrst| j| _t| j| _t| j| j	|jd| _| jd | _t| j| j| j||| dd	| _d S )
NTr   z.query_key_value)biasr!   r"   z.dense)max_positionrope_parametersg      z.attn)scaler>   r!   r"   )r$   r%   r    r   r&   num_attention_headstotal_num_heads	num_headshead_dimmax_position_embeddings	is_causalr   query_key_valuer   denseqk_layernormis_qk_layernormr   	LayerNormq_layernormk_layernormr   rA   
rotary_embscalingr   attn)r,   r    r>   r!   r"   tensor_parallel_world_sizer-   r/   r0   r%   ]   sX   

zPersimmonAttention.__init__xr1   c                 C   s   |j d }||| j| jS Nr   shapeviewrE   rF   r,   rT   
seq_lengthr/   r/   r0   _split_heads   s   
zPersimmonAttention._split_headsc                 C   s   |j d }||| j| j S rU   rV   rY   r/   r/   r0   _merge_heads   s   
zPersimmonAttention._merge_headsposition_idsr3   c           
      C   s   |  |\}}|jddd\}}}| jr2| |}| |}| |}| |}| |}| |}| |||\}}| |||}| 	|\}	}|	S )N   )chunksdim)
rI   chunkrL   r[   rN   rO   r\   rP   rR   rJ   )
r,   r]   r3   qkvr4   qkvattn_outputoutputr/   r/   r0   r5      s   





zPersimmonAttention.forwardNNr   )r6   r7   r8   r   r   r   r9   r%   r:   r;   r[   r\   r5   r<   r/   r/   r-   r0   r=   \   s,    9r=   c                	       sZ   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	fddZ
  ZS )PersimmonDecoderLayerNr   r    r>   r!   r"   c                    sl   t    |j| _t|||| dd| _t||| dd| _tj|j|j	d| _
tj|j|j	d| _d S )Nz
.self_attn)r    r>   r!   r"   z.mlpr#   eps)r$   r%   r&   r=   	self_attnr   mlpr   rM   layer_norm_epsinput_layernormpost_attention_layernorm)r,   r    r>   r!   r"   r-   r/   r0   r%      s&   
zPersimmonDecoderLayer.__init__r]   r3   r1   c                 C   sL   |}|  |}| j||d}|| }|}| |}| |}|| }|}|S )N)r]   r3   )rp   rm   rq   rn   )r,   r]   r3   residualoutputsr/   r/   r0   r5      s   


zPersimmonDecoderLayer.forwardri   )r6   r7   r8   r   r   r   r9   r%   r:   r;   r5   r<   r/   r/   r-   r0   rj      s(    rj   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )PersimmonModelr   r"   vllm_configr"   c                   s   t    |jj|j |jj| _| _tjj	| _
tj fdd| dd\| _| _| _tjj	jd| _tdgj	| _d S )Nc                    s   t  | dS )Nru   )rj   ru   r>   r    r!   r/   r0   <lambda>  s    z)PersimmonModel.__init__.<locals>.<lambda>z.layersru   rk   r3   )r$   r%   model_config	hf_configr>   r!   
vocab_sizer    r   r&   embed_tokensr   num_hidden_layersstart_layer	end_layerlayersr   rM   ro   final_layernormr   make_empty_intermediate_tensors)r,   rv   r"   r-   rw   r0   r%      s(   

zPersimmonModel.__init__	input_idsr1   c                 C   s
   |  |S r2   )r|   r,   r   r/   r/   r0   embed_input_ids  s   
zPersimmonModel.embed_input_idsN	positionsintermediate_tensorsinputs_embedsc                 C   sz   t  jr|d ur|}n| |}n
|d usJ |d }t| j| j| jD ]}|||}q$t  js6td|iS | 	|}|S )Nr3   )
r
   is_first_rankr   r   r   r~   r   is_last_rankr   r   )r,   r   r   r   r   r3   layerr/   r/   r0   r5     s   
zPersimmonModel.forwardweightsc                 C   s   t | jdd}t }|D ]V\}}t|| rq|| }d|v rSt|dd }| jj}|d urS|j}	||	d | |ddf |	|d d   }|	||d }|
|	}t|dt}
|
|| || q|S )	NF)remove_duplicaterI   
output_dimr^   r_   r   weight_loader)dictnamed_parameterssetr   getattrr    rC   rW   rX   	transposereshaper   add)r,   r   params_dictloaded_paramsnameloaded_weightparamr   rE   loaded_weight_shaper   r/   r/   r0   load_weights&  s0   



zPersimmonModel.load_weightsr2   )r6   r7   r8   r	   r9   r%   r:   r;   r   r   r5   r   tupler   r   r<   r/   r/   r-   r0   rt      s     
,rt   c                
       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdejde	dB dejdB fddZ
dejdejdB fddZdeeeejf  dee fddZ  ZS )PersimmonForCausalLMr   ru   rv   r"   c                   sj   t    |jj}|| _|j| _t|t|dd| _t	|j|j
dt|dd| _t|j| _| jj| _d S )Nmodel)rv   r"   Flm_head)r?   r"   )r$   r%   ry   rz   r    r{   rt   r   r   r   r&   r   r   logits_processorr   )r,   rv   r"   r    r-   r/   r0   r%   G  s    

zPersimmonForCausalLM.__init__r   r1   c                 C   s   | j |S r2   )r   r   r   r/   r/   r0   r   Z  s   z$PersimmonForCausalLM.embed_input_idsNr   r   r   c                 C   s   | j ||||d}|S )N)r   r   r   r   )r   )r,   r   r   r   r   r3   r/   r/   r0   r5   ]  s   zPersimmonForCausalLM.forwardr3   c                 C   s   |  | j|}|S r2   )r   r   )r,   r3   logitsr/   r/   r0   compute_logitsl  s   z#PersimmonForCausalLM.compute_logitsr   c                 C   s   t | }||S r2   )r   r   )r,   r   loaderr/   r/   r0   r   s  s   
z!PersimmonForCausalLM.load_weights)NN)r6   r7   r8   r	   r9   r%   r:   r;   r   r   r5   r   r   r   r   r   r<   r/   r/   r-   r0   r   F  s(    

,r   )4__doc__collections.abcr   	itertoolsr   r:   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   %vllm.model_executor.layers.activationr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   utilsr   r   r   r   r   Moduler   r=   rj   rt   r   r/   r/   r/   r0   <module>   s4   	^7T