o
    پiM                     @   sr  d Z ddlZddlmZmZmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, e-e.Z/G dd de	j0Z1G dd de	j0Z2G dd de	j0Z3G dd de	j0Z4G dd de	j0Z5e5gZ6dS )zAInference-only Granite model compatible with HuggingFace weights.    N)AnyDictIterableOptionalTuple)nn)GraniteConfig)$get_tensor_model_parallel_world_size)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefix)get_exception_tracebackc                       sH   e Zd Z		ddedededee deddf fd	d
Zdd Z  Z	S )
GraniteMLPN hidden_sizeintermediate_size
hidden_actquant_configprefixreturnc                    sh   t    t||gd d|td|d| _t||d|td|d| _|dkr.td| dt | _	d S )	N   Fgate_up_projbiasr!   r"   	down_projsiluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r   r%   r   r(   
ValueErrorr
   act_fn)selfr   r   r    r!   r"   	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/granite.pyr+   4   s(   

zGraniteMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r%   r-   r(   )r.   xgate_up_r1   r1   r2   forwardR   s   
zGraniteMLP.forwardNr   )
__name__
__module____qualname__intstrr   r   r+   r7   __classcell__r1   r1   r/   r2   r   3   s"    r   c                       s   e Zd Z							ddeded	ed
edededeeee	f  de
dedee deddf fddZdejdejdedejfddZ  ZS )GraniteAttentionr   '  NT    r   configr   	num_headsnum_kv_headslayer_id
rope_thetarope_scalingrope_is_neox_stylemax_position_embeddingsr!   r"   r#   c              
      sR  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|d| j| j | _
| j| j
 | _| j| j
 | _|j| _|| _|	| _t|| j
| j| jd|
td|d| _t| j| j
 |d|
td|d| _t| j
| j
|	|||d| _t| j| j
| j| j||
td	|d
| _d S )Nr      head_dimFqkv_projr&   o_proj)
rotary_dimmax_positionbaserG   is_neox_styleattn)rD   rE   r!   r"   )r*   r+   r   r	   total_num_headsrC   total_num_kv_headsmaxrD   getattrrK   q_sizekv_sizeattention_multiplierscalingrF   rI   r   r   rL   r   rM   r   
rotary_embr   rR   )r.   rB   r   rC   rD   rE   rF   rG   rH   rI   r!   r"   tp_sizer/   r1   r2   r+   Z   sh   

	
zGraniteAttention.__init__	positionshidden_statesforward_batchc                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )Ndim)rL   splitrW   rX   r[   rR   rM   )r.   r]   r^   r_   qkvr6   qkvattn_outputoutputr1   r1   r2   r7      s    zGraniteAttention.forward)r   r@   NTrA   Nr   )r9   r:   r;   r   r<   floatr   r   r=   r   boolr   r+   torchTensorr   r7   r>   r1   r1   r/   r2   r?   Y   sT    	
Kr?   c                       sr   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )GraniteDecoderLayerr   Nr   rB   rE   r!   r"   r#   c           	         s   t    |j| _|j| _t|dd}t|dd }|d ur(t|dd r(|j|d< t|dd}t|dd}t|| j|j|j||||||t	d	|d
| _
t| j|j|j|t	d|d| _t|j|jd| _t|j|jd| _d S )NrF   r@   rG    original_max_position_embeddingsrH   TrI   rA   	self_attn)rB   r   rC   rD   rE   rF   rG   rH   rI   r!   r"   mlp)r   r   r    r!   r"   eps)r*   r+   r   residual_multiplierrV   ro   r?   num_attention_headsnum_key_value_headsr   rp   r   r   r    rq   r   rms_norm_epsinput_layernormpost_attention_layernorm)	r.   rB   rE   r!   r"   rF   rG   rH   rI   r/   r1   r2   r+      sH   

zGraniteDecoderLayer.__init__r]   r^   r_   residualc                 C   sf   |d u r|}|  |}n|  ||\}}| j|||d| j }| ||\}}| || j }||fS )N)r]   r^   r_   )rx   rp   rt   ry   rq   )r.   r]   r^   r_   rz   r1   r1   r2   r7      s   
zGraniteDecoderLayer.forward)r   Nr   )r9   r:   r;   r   r<   r   r   r=   r+   rl   rm   r   r   r7   r>   r1   r1   r/   r2   rn      s4    -rn   c                       sb   e Zd Z		ddedee deddf fddZ	dd	ej	d
ej	de
dej	dej	f
ddZ  ZS )GraniteModelNr   rB   r!   r"   r#   c                    sj   t     | _ j| _ j| _t j j| _t	
 fddt jD | _t j jd| _d S )Nc              	      s(   g | ]}t  |td | dqS )zlayers.r!   r"   )rn   r   ).0irB   r"   r!   r1   r2   
<listcomp>  s    z)GraniteModel.__init__.<locals>.<listcomp>rr   )r*   r+   rB   pad_token_idpadding_idx
vocab_sizer   r   embed_tokensr   
ModuleListrangenum_hidden_layerslayersr   rw   norm)r.   rB   r!   r"   r/   r   r2   r+      s   
zGraniteModel.__init__	input_idsr]   r_   input_embedsc           
      C   sl   |d u r
|  |}n|}d }|| jj9 }tt| jD ]}| j| }|||||\}}q| ||\}}	|S r3   )r   rB   embedding_multiplierr   lenr   r   )
r.   r   r]   r_   r   r^   rz   r~   layerr6   r1   r1   r2   r7     s   

zGraniteModel.forwardr8   r3   )r9   r:   r;   r   r   r   r=   r+   rl   rm   r   r7   r>   r1   r1   r/   r2   r{      s0    r{   c                       s   e Zd Z		ddedee deddf fddZe	 			d d
ej
dej
dedej
dedefddZdd Zdd Zdeeeej
f  fddZ	d!dedededeej
 fddZ  ZS )"GraniteForCausalLMNr   rB   r!   r"   r#   c                    s   t    || _|| _t||td|d| _t|j|j	|td|d| _
| jjr0| j
| jj t|dr;d|j }nd }t||d| _ttjdd| _g d	| _d S )
Nmodelr|   lm_headlogits_scalingg      ?)logit_scaleT)pooling_type	normalize)	.qkv_projz.q_projre   )r   z.k_projrf   )r   z.v_projrg   ).gate_up_projz
.gate_projr   )r   z.up_projrJ   )r*   r+   rB   r!   r{   r   r   r   r   r   r   tie_word_embeddingstie_weightsr   hasattrr   r   logits_processorr   r   LASTpoolerstacked_params_mapping)r.   rB   r!   r"   r   r/   r1   r2   r+   2  s(   

zGraniteForCausalLM.__init__Fr   r]   r_   r   get_embeddingc                 C   s6   |  ||||}|s| ||| j|}|S | ||S r3   )r   r   r   r   )r.   r   r]   r_   r   r   r^   logits_processor_outputr1   r1   r2   r7   \  s   	
zGraniteForCausalLM.forwardc                 C   sT   | j D ]\}}}}||v r|||d td  |f  S q|d td  dfS )Nz.weightrJ   )r   replacer   )r.   name
param_nameweight_nameshard_id	num_shardr1   r1   r2    get_module_name_from_weight_namen  s   z3GraniteForCausalLM.get_module_name_from_weight_namec                 C   s   t |  }t|S r3   )dictnamed_parametersr   )r.   params_dictr1   r1   r2   get_num_paramsw  s   z!GraniteForCausalLM.get_num_paramsweightsc                 C   s  g d}t |  }|D ]w\}}d|v sd|v rqd|v s!d|v r"q|dr,||vr,qd|v r5| jjr5q|D ](\}}}||vrAq7|||}|drQ||vrQq7|| }	|	j}
|
|	||  n#|drj||vrjq|d	rt||vrtq|| }	t|	d
t	}
|
|	| qd S )Nr   zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerlm_head.weightz.biasz	.kv_scaleweight_loader)
r   r   
startswithrB   r   r   endswithr   rV   r   )r.   r   r   r   r   loaded_weightr   r   r   paramr   r1   r1   r2   load_weights{  s<   	
zGraniteForCausalLM.load_weightsd   rJ   r   truncate_sizer\   c              	      s  z|dkr"| j jr"td | jjj t	j
  d| W S |}d}| jD ]\}}}||v r<|||}|} nq)t|  }	|	| }
|dur|dv r| j j| }| j j| }| j j| j j }|dkrnd}|| }n|dkr{|| }|| }n|dkr|| | }|| }|
jd|| n-|d	v r| j j}|| }|dkrd}|}n|d
kr|}|}|
jd|| n|
j n|
j |d
krd|v sd|v r fddt|D }t	j|  t	j|d
d   t	j
  d| W S  ty   td| dt   Y dS w )zGet the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.

        Only used for unit test with an unoptimized performance.
        For optimized performance, please use torch.save and torch.load.
        r   zTword embedding is tied for this model, return embed_tokens.weight as lm_head.weight.N)re   rf   rg   re   r   rf   rg   )r   rJ   rJ   rM   r(   c                    s   g | ]}t  qS r1   )rl   
zeros_like)r}   r6   weightr1   r2   r     s    z:GraniteForCausalLM.get_weights_by_name.<locals>.<listcomp>ra   zError getting weights by name z in GraniteForCausalLM: )rB   r   loggerinfor   r   r   cputorl   float32numpytolistr   r   r   r   ru   rv   r   datanarrowr   r   distributed
all_gathercat	Exceptionerrorr   )r.   r   r   r\   mapped_namemapped_shard_idr   r   r   r   r   rC   rD   rK   offsetsizer   
slice_sizegathered_weightsr1   r   r2   get_weights_by_name  sx   


"z&GraniteForCausalLM.get_weights_by_namer8   )NF)r   rJ   )r9   r:   r;   r   r   r   r=   r+   rl   no_gradrm   r   rk   r   r7   r   r   r   r   r   r<   r   r>   r1   r1   r/   r2   r   1  sR    *	5r   )7__doc__loggingtypingr   r   r   r   r   rl   r   transformersr   sglang.srt.distributedr	   sglang.srt.layers.activationr
   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   r   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   sglang.utilsr   	getLoggerr9   r   Moduler   r?   rn   r{   r   
EntryClassr1   r1   r1   r2   <module>   s8   
&ZJ4 
I