o
    پi{>                     @   s  d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ej*e( ddd Z+G dd dej,Z-G dd dej,Z.G dd dej,Z/G dd dej,Z0G dd dej,Z1G dd  d ej,Z2G d!d" d"e2Z3e2e3gZ4dS )#zPyTorch Cohere model.    )IterableOptionalTupleN)nn)	Parameter)Cohere2ConfigCohereConfigPretrainedConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
SiluAndMul)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)RadixAttention)get_rope)VocabParallelEmbedding)ForwardBatch)default_weight_loadermaybe_remap_kv_scale_name)
add_prefixget_compiler_backendset_weight_attrs)backendc                 C   sh   | j }| tj} | jddd}| | djddd}| | t||  } |tj|  } | |S )NT)keepdim   )dtypetotorchfloat32meanpowrsqrt)hidden_statesweightvariance_epsiloninput_dtyper#   variance r+   N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/commandr.pylayer_norm_funcG   s   
r-   c                       s<   e Zd Zd fdd	ZdddZdedejfd	d
Z  Z	S )	LayerNormNh㈵>c                    s8   t    tt|| _|| _t| jd| j	i d S )Nweight_loader)
super__init__r   r   r!   onesr'   r(   r   r0   )selfparam_shapeeps	__class__r+   r,   r2   S   s   
zLayerNorm.__init__c                 C   s   t || j| j}||fS N)r-   r'   r(   )r4   r&   	residualsr+   r+   r,   forwardY   s   
zLayerNorm.forwardparamloaded_weightc                 C   sf   t  }| dkrdnd }|j}|d ur$|j| }|| }||||}|j|jks,J || d S )N   r   )r
   dimdatashapenarrowcopy_)r4   r<   r=   tp_rank	shard_dim
param_data
shard_size	start_idxr+   r+   r,   r0   _   s   
zLayerNorm.weight_loader)Nr/   r9   )
__name__
__module____qualname__r2   r;   r   r!   Tensorr0   __classcell__r+   r+   r7   r,   r.   R   s    
r.   c                       s8   e Zd Z		d	dee def fddZdd Z  ZS )
	CohereMLPN quant_configprefixc                    sn   t    || _|j| _|j| _t| j| jgd d|td|d| _t| j| jd|td|d| _	t
 | _d S )Nr   Fgate_up_projbiasrP   rQ   	down_proj)r1   r2   confighidden_sizeintermediate_sizer   r   rR   r   rU   r   act_fnr4   rV   rP   rQ   r7   r+   r,   r2   m   s&   

zCohereMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S r9   )rR   rY   rU   )r4   xgate_up_r+   r+   r,   r;      s   
zCohereMLP.forwardNrO   )	rI   rJ   rK   r   r   strr2   r;   rM   r+   r+   r7   r,   rN   l   s    rN   c                	       sb   e Zd Z			ddededee def fdd	Zd
d Z	de
jde
jdede
jfddZ  ZS )CohereAttentionr   NrO   rV   layer_idrP   rQ   c                    s  t    t }|| _|j| _|j| _|j| _| j| | _| j| j | _	|j
| _| j|kr7| j| dks6J n	|| j dks@J td| j| | _| j| j	 | _| j| j	 | _| j	d | _t|dd pht|dd| _|j| _t|dd | _t|dd	| _t| j| j	| j| jd	|td
|d| _t| j| j	 | jd	|td|d| _t| j	| j	| j| j| jd	d| _t|t| _t|t | _!| j!r|j"| dkr|j#| _$nd| _$t%| j| j	| j| j|| j$|td|d| _&| jrt'| j| j	f|j(d| _)t'| j| j	f|j(d| _*d S d S )Nr   r>   g      model_max_lengthmax_position_embeddingsi    rope_scalinguse_qk_normFqkv_projrS   o_proj)
rotary_dimmax_positionbaserd   is_neox_stylesliding_attentionr   attn)num_kv_headsra   sliding_window_sizerP   rQ   r5   r6   )+r1   r2   r   rV   attention_dropoutrW   num_attention_headstotal_num_heads	num_headshead_dimnum_key_value_headstotal_num_kv_headsmaxrn   q_sizekv_sizescalinggetattrrc   
rope_thetard   re   r   r   rf   r   rg   r   
rotary_emb
isinstancer   v1r   v2layer_typessliding_windowro   r   rm   r.   layer_norm_epsq_normk_norm)r4   rV   ra   rP   rQ   tp_sizer7   r+   r,   r2      s   


	
	


zCohereAttention.__init__c                 C   s   |j g |jd d d| jR  }|j g |jd d d| jR  }| |\}}| |\}}|j g |jd d dR  }|j g |jd d dR  }||fS )Nr   )viewrA   ru   r   r   )r4   qkr]   r+   r+   r,   _apply_qk_norm   s   $$zCohereAttention._apply_qk_norm	positionsr&   forward_batchreturnc                 C   s   |  |\}}|j| j| j| jgdd\}}}| jr"| ||\}}| js*| jdkr3| |||\}}| 	||||}	| 
|	\}
}|
S )Nr   )r?   r   )rf   splitry   rz   re   r   r   ro   r~   rm   rg   )r4   r   r&   r   qkvr]   r   r   vattn_outputoutputr+   r+   r,   r;      s    zCohereAttention.forwardr   NrO   )rI   rJ   rK   r	   intr   r   r_   r2   r   r!   rL   r   r;   rM   r+   r+   r7   r,   r`      s.    W	r`   c                       sn   e Zd Z			ddededee def fdd	Zd
e	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )CohereDecoderLayerr   NrO   rV   ra   rP   rQ   c                    sV   t    |j| _t|||td|d| _t||td|d| _t|j|j	d| _
d S )N	self_attn)ra   rP   rQ   mlprP   rQ   rp   )r1   r2   rW   r`   r   r   rN   r   r.   r   input_layernorm)r4   rV   ra   rP   rQ   r7   r+   r,   r2     s    
zCohereDecoderLayer.__init__r   r&   r   residualr   c                 C   sB   |}|  ||\}}| j|||d}| |}|| | }||fS )N)r   r&   r   )r   r   r   )r4   r   r&   r   r   hidden_states_attentionhidden_states_mlpr+   r+   r,   r;     s   
zCohereDecoderLayer.forwardr   )rI   rJ   rK   r	   r   r   r   r_   r2   r!   rL   r   r   r;   rM   r+   r+   r7   r,   r     s0    r   c                       sT   e Zd Z		ddedee def fddZdej	d	ej	d
e
dej	fddZ  ZS )CohereModelNrO   rV   rP   rQ   c                    sb   t     | _ j| _t j j| _t fddt	 j
D | _t j jd| _d S )Nc              	      s(   g | ]}t  |td | dqS )zlayers.r   )r   r   ).0irV   rQ   rP   r+   r,   
<listcomp>@  s    z(CohereModel.__init__.<locals>.<listcomp>rp   )r1   r2   rV   
vocab_sizer   rW   embed_tokensr   
ModuleListrangenum_hidden_layerslayersr.   r   normrZ   r7   r   r,   r2   3  s   
zCohereModel.__init__	input_idsr   r   r   c           	      C   sR   |  |}d }tt| jD ]}| j| }|||||\}}q| ||\}}|S r9   )r   r   lenr   r   )	r4   r   r   r   r&   r   r   layerr]   r+   r+   r,   r;   N  s   


zCohereModel.forwardr^   )rI   rJ   rK   r	   r   r   r_   r2   r!   rL   r   r;   rM   r+   r+   r7   r,   r   2  s&    r   c                	       s|   e Zd Z		ddedee deddf fddZe	 d	ej
d
ej
dedej
fddZdeeeej
f  fddZ  ZS )CohereForCausalLMNrO   rV   rP   rQ   r   c                    sN   t    || _|| _t|dd | _t|| jd| _t||t	d|d| _
d S )Nlogit_scale)r   model)rQ   )r1   r2   rV   rP   r|   r   r   logits_processorr   r   r   rZ   r7   r+   r,   r2   c  s   
zCohereForCausalLM.__init__r   r   r   c                 C   s"   |  |||}| ||| j j|S r9   )r   r   r   )r4   r   r   r   r&   r+   r+   r,   r;   r  s   zCohereForCausalLM.forwardweightsc                 C   s   g d}t |  }t }|D ]\\}}|D ](\}}}	||vrq|||}|dr/||vr/q|| }
|
j}||
||	  n(d|v rCq|drM||vrMqt||}|d u rWq|| }
t|
dt}||
| |	| qd S )N))rf   q_projr   )rf   k_projr   )rf   v_projr   )rR   	gate_projr   )rR   up_projr>   z.biaszlm_head.weightr0   )
dictnamed_parameterssetreplaceendswithr0   r   r|   r   add)r4   r   stacked_params_mappingparams_dictloaded_paramsnamer=   
param_name
shard_nameshard_idr<   r0   r+   r+   r,   load_weights  s4   

zCohereForCausalLM.load_weightsr^   )rI   rJ   rK   r	   r   r   r_   r2   r!   no_gradrL   r   r;   r   r   r   rM   r+   r+   r7   r,   r   b  s.    $r   c                   @   s   e Zd ZdS )Cohere2ForCausalLMN)rI   rJ   rK   r+   r+   r+   r,   r     s    r   )5__doc__typingr   r   r   r!   torch.utils.checkpointr   torch.nn.parameterr   transformersr   r   r	   sglang.srt.distributedr
   r   sglang.srt.layers.activationr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.utilsr   r   r   compiler-   Moduler.   rN   r`   r   r   r   r   
EntryClassr+   r+   r+   r,   <module>   s8   %

"s10J