o
    
۾i3D                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ d dlmZ d dlmZ d dlmZm Z  d dl!m"Z"m#Z# d dl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- G dd dej.Z/G dd deZ0G dd dej.Z1G dd dej.Z2G dd dej.Z3G dd dej.Z4G dd  d ej.Z5G d!d" d"ej.e'Z6dS )#    )Iterable)isliceN)
DbrxConfig)CacheConfig
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)	Attention)FusedMoE)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sJ   e Zd ZdZ	ddedejdB f fddZdejdejfd	d
Z	  Z
S )
DbrxRouterzXA Router implementation for DBRX that returns logits for each expert
    per token.
    Nconfigparams_dtypec                    s@   t    t | _|jj| _|j| _t| j| jd|d d| _	d S )NF)biasr    quant_config)
super__init__r	   tp_size
ffn_configmoe_num_expertsnum_total_expertsd_modelr   layer)selfr   r    	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/dbrx.pyr$   4   s   

zDbrxRouter.__init__hidden_statesreturnc                 C   s   |  |\}}|S N)r*   )r+   r0   router_logits_r.   r.   r/   forwardE   s   zDbrxRouter.forwardr2   )__name__
__module____qualname____doc__r   torchdtyper$   Tensorr5   __classcell__r.   r.   r,   r/   r   /   s    r   c                	       s^   e Zd Z			ddededB dejdB def fddZd	e	j
d
ejdedefddZ  ZS )DbrxExpertsN r   r"   r    prefixc                    sT   t  j|jj|jj|j|jj|dd|t |d
 || _|j| _| jjj| j	 | _
d S )NT)
num_expertstop_khidden_sizeintermediate_sizer    reduce_resultsrenormalizer"   r%   r@   )r#   r$   r&   r'   	moe_top_kr)   ffn_hidden_sizer	   r   r%   rD   r+   r   r"   r    r@   r,   r.   r/   r$   K   s   zDbrxExperts.__init__paramloaded_weightweight_name
param_namec           	      C   s  t  }|j}| j}t|| |d | }|drS|drCt|d| j| j | jg}|d d |d d f |d d d|d d f< n|drQ||d d df< n|}|dr|drt|d| j| j | jg}|d d |d d f |d d |d| d d f< n|dr||d d df< n||d d < |d	r|drt|d| j| j | jg	dd}|d d d d |f |d d < d S ||d d < d S d S )
Nr   w1weightr   weight_scalev1   w2)
r   datarD   sliceendswithr:   reshaper%   r)   	transpose)	r+   rJ   rK   rL   rM   tp_rank
param_data
shard_sizeshardr.   r.   r/   weight_loaderc   sJ   

.


 


"zDbrxExperts.weight_loaderNNr?   )r6   r7   r8   r   r   r:   r;   strr$   nn	Parameterr<   r^   r=   r.   r.   r,   r/   r>   J   s,    r>   c                	       sZ   e Zd ZdZ			ddededB dejdB def fdd	Z	d
ej
dej
fddZ  ZS )DbrxMoEzA tensor-parallel MoE implementation for DBRX.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    Nr?   r   r"   r    r@   c                    sT   t    |j| _|d u rt }|| _t|| j| _t||| j| dd| _	d S )Nz.experts)r   r"   r    r@   )
r#   r$   r)   r:   get_default_dtyper    r   routerr>   expertsrI   r,   r.   r/   r$      s   
zDbrxMoE.__init__r0   r1   c                 C   s4   |j }|d| j}| |}| ||}||S )NrP   )shapeviewr)   re   rf   )r+   r0   
orig_shaper3   final_hidden_statesr.   r.   r/   r5      s
   

zDbrxMoE.forwardr_   )r6   r7   r8   r9   r   r   r:   r;   r`   r$   r<   r5   r=   r.   r.   r,   r/   rc      s    
rc   c                	       Z   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	fddZ
  ZS )DbrxAttentionNr?   r   cache_configr"   r@   c              	      sn  t    |j| _|j| _| j| j | _|jj| _|jj	| _	dt
|jjd}|j| _t| j| j| j| jd|| dd| _t| j| jd|| dd| _t| j| j|dd| _t }|| _| j| d	ksgJ | j| | _| j|kr|| j| d	ks{J n	|| j d	ksJ td
| j| | _| j| j | _| j| j | _| jd | _t| j| j| j| j||| dd| _d S )Ndefault)	rope_type
rope_thetaFz.Wqkv)r!   r"   r@   z	.out_projT)max_positionrope_parametersis_neox_styler   r   g      .attn)num_kv_headsrm   r"   r@   )r#   r$   r)   n_headstotal_num_headshead_dimattn_config
kv_n_headstotal_num_kv_headsclip_qkvintrp   max_seq_lenrq   r   Wqkvr   out_projr   
rotary_embr	   r%   	num_headsmaxru   q_sizekv_sizescalingr
   attn)r+   r   rm   r"   r@   rr   tp_world_sizer,   r.   r/   r$      sh   



	
zDbrxAttention.__init__position_idsr0   r1   c           	      C   s~   |  |\}}| jd ur|j| j | jd |j| j| j| jgdd\}}}| |||\}}| |||}| |\}}|S )N)minr   rP   )dim)	r   r|   clamp_splitr   r   r   r   r   )	r+   r   r0   qkvr4   qkvattn_outputr.   r.   r/   r5      s   
 zDbrxAttention.forwardr_   r6   r7   r8   r   r   r   r`   r$   r:   r<   r5   r=   r.   r.   r,   r/   rl      s(    Erl   c                	       rk   )DbrxFusedNormAttentionNr?   r   rm   r"   r@   c                    sJ   t    |j| _t|||| dd| _t| j| _t| j| _d S )Nrt   r@   )	r#   r$   r)   rl   r   ra   	LayerNormnorm_1norm_2r+   r   rm   r"   r@   r,   r.   r/   r$     s   
zDbrxFusedNormAttention.__init__r   r0   r1   c                 C   s:   |}|  |}| j||d}|| }|}| |}||fS N)r   r0   )r   r   r   )r+   r   r0   residualxr.   r.   r/   r5     s   

zDbrxFusedNormAttention.forwardr_   r   r.   r.   r,   r/   r     s(    r   c                	       rk   )	DbrxBlockNr?   r   rm   r"   r@   c                    s<   t    t|||| dd| _t||| dd| _d S )Nz.norm_attn_normr   z.ffn)r#   r$   r   norm_attn_normrc   ffnr   r,   r.   r/   r$   1  s
   
zDbrxBlock.__init__r   r0   r1   c                 C   s(   | j ||d\}}| |}|| }|S r   )r   r   )r+   r   r0   r   r.   r.   r/   r5   >  s   

zDbrxBlock.forwardr_   r   r.   r.   r,   r/   r   0  s(    r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )	DbrxModelr?   r   vllm_configr@   c                   s   t    |jj|j |j| _tjj| _	t
j fdd| dd\| _| _| _tjjdd| _|  D ]}t|drRt|jtjrR|dd  q>tdgj| _d S )	Nc                    s   t  | dS )Nr   )r   r   rm   r   r"   r.   r/   <lambda>[  s    z$DbrxModel.__init__.<locals>.<lambda>z.blocksr   gh㈵>)epsr!   r0   )r#   r$   model_config	hf_configrm   r"   r   
vocab_sizer)   wter   n_layersstart_layer	end_layerblocksra   r   norm_fmoduleshasattr
isinstancer!   rb   register_parameterr   make_empty_intermediate_tensors)r+   r   r@   moduler,   r   r/   r$   M  s,   

zDbrxModel.__init__	input_idsr1   c                 C   s
   |  |S r2   )r   r+   r   r.   r.   r/   embed_input_idsg  s   
zDbrxModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc                 C   sv   t  jr|d ur|}n| |}n|sJ |d }t| j| j| jD ]}|||}q"t  js4td|iS | 	|}|S )Nr0   )
r   is_first_rankr   r   r   r   r   is_last_rankr   r   )r+   r   r   r   r   r0   blockr.   r.   r/   r5   j  s   
zDbrxModel.forwardweightsc                 C   s:  dd dD }t | jdd}t }|D ]\}}| jd urF| j| }rF|| }t|dt}	| dkr7|n|d }|	|| || q|	drO|d	 }|D ]$\}
}||vrZqQ|
||
}t|| rfqQ|| }|j}	|	||||  nt|| r|qt||}|d u rq|| }t|dt}	|	|| || q|S )
Nc                 S   s&   g | ]}|d v r
dndd| fqS ))rN   rR   w13rT   zmlp.r.   ).0rL   r.   r.   r/   
<listcomp>  s    z*DbrxModel.load_weights.<locals>.<listcomp>)rN   rR   rT   F)remove_duplicater^   r   )rN   rT   rR   _weight)dictnamed_parameterssetr"   get_cache_scalegetattrr   r   addrW   replacer   r^   r   )r+   r   expert_params_mappingparams_dictloaded_paramsnamerK   
scale_namerJ   r^   rM   rL   r.   r.   r/   load_weights  sL   







zDbrxModel.load_weightsr2   )r6   r7   r8   r   r`   r$   r:   r<   r   r   r5   r   tupler   r   r=   r.   r.   r,   r/   r   L  s     
,r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
dejdejdB fddZdeeeejf  dee fddZ  ZS )DbrxForCausalLMr?   r   r   r@   c                   s|   t    |jj}|j}|| _|jrtd|| _t|t	|dd| _
t|j|j|t	|dd| _t|j| _| j
j| _d S )Nz5tie_word_embeddings is not supported for Dbrx models.transformer)r   r@   lm_head)r"   r@   )r#   r$   r   r   r"   r   tie_word_embeddings
ValueErrorr   r   r   r   r   r)   r   r   logits_processorr   )r+   r   r@   r   r"   r,   r.   r/   r$     s&   

zDbrxForCausalLM.__init__r   r1   c                 C   s   | j |S r2   )r   r   r   r.   r.   r/   r     s   zDbrxForCausalLM.embed_input_idsN	positionsr   r   c                 C   s   |  ||||}|S r2   )r   )r+   r   r   r   r   r0   r.   r.   r/   r5     s   zDbrxForCausalLM.forwardr0   c                 C   s   |  | j|}|S r2   )r   r   )r+   r0   logitsr.   r.   r/   compute_logits  s   zDbrxForCausalLM.compute_logitsr   c                 C   s   t | }||S r2   )r   r   )r+   r   loaderr.   r.   r/   r     s   
zDbrxForCausalLM.load_weights)NN)r6   r7   r8   r   r`   r$   r:   r<   r   r   r5   r   r   r   r   r   r=   r.   r.   r,   r/   r     s,    

,r   )7collections.abcr   	itertoolsr   r:   torch.nnra   transformersr   vllm.configr   r   vllm.distributedr   r   r	   $vllm.model_executor.layers.attentionr
   $vllm.model_executor.layers.fused_moer   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   utilsr   r   r   r   r   Moduler   r>   rc   rl   r   r   r   r   r.   r.   r.   r/   <module>   s4   	I'U!h