o
    -iC                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1 G dd dej2Z3G dd dej2Z4G dd dej2Z5eG dd dej2Z6G dd  d ej2e*e+Z7dS )!zEInference-only IBM Granite model compatible with HuggingFace weights.    )Iterable)isliceN)nn)GraniteConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parametermake_layersmaybe_prefixc                       sN   e Zd Z			ddededededB ded	ed
df fddZdd Z  Z	S )
GraniteMLPNF hidden_sizeintermediate_size
hidden_actquant_configbiasprefixreturnc                    sh   t    t||gd ||| dd| _t||||| dd| _|dkr.td| dt | _d S )	N   .gate_up_proj)
input_sizeoutput_sizesr'   r&   r(   z
.down_projr,   output_sizer'   r&   r(   siluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr#   r$   r%   r&   r'   r(   	__class__ _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/granite.pyr2   E   s(   
	
zGraniteMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r3   r6   r4   )r7   xgate_up_r:   r:   r;   forwardc   s   
zGraniteMLP.forward)NFr"   )
__name__
__module____qualname__intstrr   boolr2   r@   __classcell__r:   r:   r8   r;   r!   D   s(    r!   c                       sv   e Zd Z					ddedededed	ed
edB dededB deddf fddZ	de
jde
jde
jfddZ  ZS )GraniteAttention    NFr"   configr#   	num_headsnum_kv_headsmax_position_embeddingsr&   r'   cache_configr(   r)   c
              	      sV  t    || _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr/| j|
 dks.J n	|
| j dks8J td| j|
 | _t	|dd | _
| j
d u rT| j| j | _
| j| j
 | _| j| j
 | _|j| _|| _t|| j
| j| j|||	 dd| _t| j| j
 ||||	 dd| _t| j
||jd| _t| j| j
| j| j|||	 d	d
| _d S )Nr   r   head_dim	.qkv_proj)r#   	head_sizetotal_num_headstotal_num_kv_headsr'   r&   r(   z.o_projr.   )max_positionrope_parametersz.attn)rL   rN   r&   r(   )r1   r2   r#   r   rR   rK   rS   maxrL   getattrrO   q_sizekv_sizeattention_multiplierscalingrM   r   qkv_projr   o_projr   rU   
rotary_embr   attn)r7   rJ   r#   rK   rL   rM   r&   r'   rN   r(   tp_sizer8   r:   r;   r2   k   s`   


	
zGraniteAttention.__init__	positionshidden_statesc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)r\   splitrX   rY   r^   r_   r]   )
r7   ra   rb   qkvr?   qkvattn_outputoutputr:   r:   r;   r@      s    zGraniteAttention.forward)rI   NFNr"   )rA   rB   rC   r   rD   r   rF   r   rE   r2   torchTensorr@   rG   r:   r:   r8   r;   rH   j   sD    	
ErH   c                       sh   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	de
ej	ej	f fddZ  ZS )GraniteDecoderLayerNr"   rJ   rN   r&   r(   r)   c                    s   t    |j| _|j| _t|dd}t|ddpt|dd}t|| j|jt|d|j||||| dd	| _t| j|j	|j
|t|d	d| d
d| _t|j|jd| _t|j|jd| _d S )NrM   rI   attention_biasFr'   num_key_value_headsz
.self_attn)	rJ   r#   rK   rL   rM   r&   r'   rN   r(   mlp_biasz.mlp)r#   r$   r%   r&   r'   r(   eps)r1   r2   r#   residual_multiplierrW   rH   num_attention_heads	self_attnr!   r$   r%   mlpr   rms_norm_epsinput_layernormpost_attention_layernorm)r7   rJ   rN   r&   r(   rM   ro   r8   r:   r;   r2      s@   

zGraniteDecoderLayer.__init__ra   rb   c                 C   sT   |}|  |}| j||d}||| j  }|}| |}| |}||| j  }|S )N)ra   rb   )ry   rv   rt   rz   rw   )r7   ra   rb   residualr:   r:   r;   r@      s   


zGraniteDecoderLayer.forward)NNr"   )rA   rB   rC   r   r   r   rE   r2   rl   rm   tupler@   rG   r:   r:   r8   r;   rn      s,    +rn   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )GraniteModelr"   r(   vllm_configr(   c                   s   t    |jj|j |j| _| _t js j	r+t j
r+tjjd| _nt | _tj fdd| dd\| _| _| _t j
rUtjjd| _d S t | _d S )N)r&   c                    s   t  | dS )N)rJ   rN   r&   r(   )rn   r~   rN   rJ   r&   r:   r;   <lambda>  s    z'GraniteModel.__init__.<locals>.<lambda>z.layersr~   rr   )r1   r2   model_config	hf_configrN   r&   rJ   r
   is_first_ranktie_word_embeddingsis_last_rankr   
vocab_sizer#   embed_tokensr   r   num_hidden_layersstart_layer	end_layerlayersr   rx   norm)r7   r   r(   r8   r   r;   r2      s2   


zGraniteModel.__init__	input_idsr)   c                 C   s
   |  |S r<   )r   r7   r   r:   r:   r;   embed_input_ids#  s   
zGraniteModel.embed_input_idsNra   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}|| jj9 }n
|d usJ |d }t| j| j| jD ]}|||}q*t  j	s<t
d|iS | |}|S )Nrb   )r
   r   r   rJ   embedding_multiplierr   r   r   r   r   r   r   )r7   r   ra   r   r   rb   layerr:   r:   r;   r@   &  s    

zGraniteModel.forwardweightsc                 C   sF  g d}t |  }t }|D ]\}}| jd urA| j| }rA|| }t|dt}	| dkr2|n|d }|	|| || q|D ].\}
}}||vrMqC|	||
}|
dr]||vr]qCt|| rcqC|| }|j}	|	|||  n)|
dr|||vr|qt||}|d u rqt|| rq|| }t|dt}	|	|| || q|S )N))rP   z.q_projrg   )rP   z.k_projrh   )rP   z.v_projri   )r+   z
.gate_projr   )r+   z.up_projr   weight_loaderr   z.bias)dictnamed_parameterssetr&   get_cache_scalerW   r   rd   addreplaceendswithr   r   r   )r7   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_idr:   r:   r;   load_weightsE  sL   






zGraniteModel.load_weightsr<   )rA   rB   rC   r	   rE   r2   rl   rm   r   r   r@   r   r|   r   r   rG   r:   r:   r8   r;   r}      s     #
,r}   c                       s   e Zd Zg dddgdZdddZdd	d
edef fddZdej	dej	fddZ
		d#dej	dej	dedB dej	dB dej	eB f
ddZdej	dej	dB fddZdedejdejdefddZd eeeej	f  dee fd!d"Z  ZS )$GraniteForCausalLM)q_projk_projv_proj	gate_projup_proj)r\   r3   input_embeddingsoutput_embeddings)r   lm_headr"   r~   r   r(   c                   s   t    |jj}|j}|| _|| _t|t|dd| _t	 j
rRt|j|j|t|dd| _|jr8| jjj| j_t|dd}t|drH||j }t|j|d| _d S t | _d S )	Nmodel)r   r(   r   )r&   r(   logit_scaleg      ?logits_scaling)scale)r1   r2   r   r   r&   rJ   r}   r    r   r
   r   r   r   r#   r   r   r   weightrW   hasattrr   r   logits_processorr   )r7   r   r(   rJ   r&   r   r8   r:   r;   r2     s0   



zGraniteForCausalLM.__init__r   r)   c                 C   s   | j |S r<   )r   r   r   r:   r:   r;   r     s   z"GraniteForCausalLM.embed_input_idsNra   r   r   c                 C   s   |  ||||}|S r<   )r   )r7   r   ra   r   r   model_outputr:   r:   r;   r@     s   zGraniteForCausalLM.forwardrb   c                 C   s   |  | j|}|S r<   )r   r   )r7   rb   logitsr:   r:   r;   compute_logits  s   z!GraniteForCausalLM.compute_logits
batch_sizedtypedevicec                 C   s    t dtj|| jjf||diS )Nrb   )r   r   )r   rl   zerosrJ   r#   )r7   r   r   r   r:   r:   r;   make_empty_intermediate_tensors  s   z2GraniteForCausalLM.make_empty_intermediate_tensorsr   c                 C   s(   | j jrdgnd }t| |d}||S )Nzlm_head.)skip_prefixes)rJ   r   r   r   )r7   r   r   loaderr:   r:   r;   r     s   
zGraniteForCausalLM.load_weights)NN)rA   rB   rC   packed_modules_mappingembedding_modulesr	   rE   r2   rl   rm   r   r   r@   r   rD   r   r   r   r   r|   r   r   rG   r:   r:   r8   r;   r     sF     

,r   )8__doc__collections.abcr   	itertoolsr   rl   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   r    Moduler!   rH   rn   r}   r   r:   r:   r:   r;   <module>   s8   	&SA 