o
    -iXI                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1 G dd dej2Z3G dd dej2Z4G dd dej2Z5edddddd G d!d" d"ej2Z6G d#d$ d$ej2e+Z7dS )%z>Inference-only Ouro model compatible with HuggingFace weights.    )Iterable)AnyN)nn)PretrainedConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)$get_tensor_model_parallel_world_size)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)AttentionType   )SupportsLoRA)AutoWeightsLoaderextract_layer_index'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sH   e Zd Z		ddededededB deddf fd	d
Zdd Z  ZS )OuroMLPN hidden_sizeintermediate_size
hidden_actquant_configprefixreturnc                    sh   t    t||gd d|| dd| _t||d|| dd| _|dkr.td| dt | _d S )	N   Fz.gate_up_projbiasr%   r&   z
.down_projsiluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr"   r#   r$   r%   r&   	__class__ \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/ouro.pyr-   H   s(   

zOuroMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r.   r1   r/   )r2   xgate_up_r5   r5   r6   forwarde   s   
zOuroMLP.forward)Nr!   )	__name__
__module____qualname__intstrr   r-   r;   __classcell__r5   r5   r3   r6   r    G   s"    r    c                       s   e Zd Zddddejdfdededededed	edB d
edB de	de	de
e	ef dB ddf fddZdejdejdedejfddZ  ZS )OuroAttentioni   Nr!   configr"   	num_headsnum_kv_headsmax_positioncache_configr%   r&   	attn_typedual_chunk_attention_configr'   c                    s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|
| _t|dd}|j}t|| j	| j| jd|| dd| _t| j| j	 |d|| d	d| _t| j	||j|
d
| _t | _t|D ]:}t|}|| | }|d| d| }| jt| j| j	| jf| j|||	| dd|
r||
dni  qd S )Nr   r   g      total_ut_steps   Fz	.qkv_projr)   z.o_proj)rF   rope_parametersrI   zlayers.z.attn)rE   rG   r%   rH   r&   )	layer_idxrI   )r,   r-   r"   r
   total_num_headsrD   total_num_kv_headsmaxrE   head_dimq_sizekv_sizescalingrI   getattrnum_hidden_layersr   qkv_projr   o_projr   rL   
rotary_embr   
ModuleListattnranger   replaceappendr   )r2   rC   r"   rD   rE   rF   rG   r%   r&   rH   rI   tp_sizerJ   total_layersut_stepbase_layer_idxunique_layer_idxunique_prefixr3   r5   r6   r-   m   s   

	

zOuroAttention.__init__	positionshidden_states
current_utc                 C   sd   |  |\}}|j| j| j| jgdd\}}}| |||\}}| j| |||}	| |	\}
}|
S )N)dim)rW   splitrR   rS   rY   r[   rX   )r2   re   rf   rg   qkvr:   qkvattn_outputoutputr5   r5   r6   r;      s    zOuroAttention.forward)r<   r=   r>   r   DECODERr   r?   r   r   r@   dictr   r-   torchTensorr;   rA   r5   r5   r3   r6   rB   l   sN    	
\rB   c                       sz   e Zd Z			ddededB dedB deddf
 fdd	Z	dd
ej	dej	de
dej	dB deej	ej	f f
ddZ  ZS )OuroDecoderLayerNr!   rC   rG   r%   r&   r'   c                    s   t    |j| _t|dd }t|ddrtj}ntj}t|| j|j|j	|j
||| d||d
| _t| j|j|j|| dd| _t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d S )	NrI   	is_causalTz
.self_attn)
rC   r"   rD   rF   rE   rG   r%   r&   rH   rI   z.mlp)r"   r#   r$   r%   r&   eps)r,   r-   r"   rU   r   rq   ENCODER_ONLYrB   num_attention_headsmax_position_embeddingsnum_key_value_heads	self_attnr    r#   r$   mlpr   rms_norm_epsinput_layernorminput_layernorm_2post_attention_layernormpost_attention_layernorm_2)r2   rC   rG   r%   r&   rI   rH   r3   r5   r6   r-      sF   
zOuroDecoderLayer.__init__re   rf   rg   residualc                 C   sn   |d u r|}|  |}n|  ||\}}| j|||d}| |}| ||\}}| |}| |}||fS )N)re   rf   rg   )r   r}   r   r   r~   r   )r2   re   rf   rg   r   r5   r5   r6   r;     s   


zOuroDecoderLayer.forward)NNr!   r7   )r<   r=   r>   r   r   r   r@   r-   rs   rt   r?   tupler;   rA   r5   r5   r3   r6   ru      s6    4ru   rh   )	input_idsre   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s   e Zd Zdeddededeej f fddZ	de
jd	e
jfd
dZ		dde
jde
jdedB de
jdB d	e
jeB f
ddZdeeee
jf  d	ee fddZ  ZS )	OuroModelr!   )r&   decoder_layer_typevllm_configr&   r   c                   s  t    |jj|j |j jd ur)tdr)jj	ks)J d
jj	| _| _j| _tjj| dd| _pDttj	 fdd| dd\| _| _| _td	d
gj| _tjjd| _tjddd| _t| jdd| _d S )Nmax_window_layerszSliding window for some but all layers is not supported. This model uses sliding window but `max_window_layers` = {} is less than `num_hidden_layers` = {}. Please open an issue to discuss this feature.z.embed_tokensr%   r&   c                    s    | dS )N)rC   rG   r%   r&   r5   r&   rG   rC   r   r%   r5   r6   <lambda>R  s    z$OuroModel.__init__.<locals>.<lambda>z.layersr   rf   r   rw   r   T)r*   rJ   rK   )r,   r-   model_config	hf_configrG   r%   sliding_windowhasattrr   rV   formatrC   
vocab_sizer   r"   embed_tokensru   r   start_layer	end_layerlayersr   make_empty_intermediate_tensorsr   r   normr   early_exit_gaterU   rJ   )r2   r   r&   r   r3   r   r6   r-   (  sF   


zOuroModel.__init__r   r'   c                 C   s
   |  |S r7   )r   r2   r   r5   r5   r6   embed_input_idsc  s   
zOuroModel.embed_input_idsNre   r   r   c           
      C   sj   |d ur|}n|  |}t| jD ]!}d }| j| j| j D ]}|||||\}}q| ||\}}	q|S r7   )r   r\   rJ   r   r   r   r   )
r2   r   re   r   r   rf   rg   r   layerr:   r5   r5   r6   r;   f  s   

zOuroModel.forwardweightsc                 C   st  g d}t | jdd}t }|D ]\}}d|v rq| jd urH| j| }rH|| }t|dt}	| dkr9|n|d }|	|| || q|D ]D\}
}}||vrTqJ|	||
}|
drd||vrdqJ|
drst||}|d u rsqJ|| }t|dt}	|	tkr|	|| n|	|||  n#|
dr||vrqt||}|d u rq|| }t|dt}	|	|| || q|S )	N))rW   q_projrl   )rW   k_projrm   )rW   v_projrn   )r.   	gate_projr   )r.   up_projr   F)remove_duplicatezrotary_emb.inv_freqweight_loaderr   z.biasscale)rr   named_parameterssetr%   get_cache_scalerU   r   ri   addr]   endswithr   )r2   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   
param_nameweight_nameshard_idr5   r5   r6   load_weights{  sT   






zOuroModel.load_weightsNN)r<   r=   r>   ru   r	   r@   typer   Moduler-   rs   rt   r   r   r;   r   r   r   r   rA   r5   r5   r3   r6   r     s2    ;
,r   c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ			ddejdejde
dB dejdB deje
B f
ddZdejdejdB fddZdeeeejf  dee fddZ  ZS )OuroForCausalLM)r   r   r   r   r   )rW   r.   r!   r   r   r&   c                   s   t    |jj}|j}|| _|| _t|t|dd| _|j	r%| jj
| _nt|j|j|t|dd| _t|j| _| jj| _d S )Nmodel)r   r&   lm_headr   )r,   r-   r   r   r%   rC   r   r   r   tie_word_embeddingsr   r   r   r   r"   r   logits_processorr   )r2   r   r&   rC   r%   r3   r5   r6   r-     s&   

zOuroForCausalLM.__init__r   r'   c                 C   s   | j |S r7   )r   r   r   r5   r5   r6   r     s   zOuroForCausalLM.embed_input_idsNre   r   r   c                 C   s   |  ||||}|S r7   )r   )r2   r   re   r   r   rf   r5   r5   r6   r;     s   zOuroForCausalLM.forwardrf   c                 C   s   |  | j|}|S r7   )r   r   )r2   rf   logitsr5   r5   r6   compute_logits  s   zOuroForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   rC   r   r   )r2   r   loaderr5   r5   r6   r     s
   
zOuroForCausalLM.load_weightsr   )r<   r=   r>   packed_modules_mappingr	   r@   r-   rs   rt   r   r   r;   r   r   r   r   r   rA   r5   r5   r3   r6   r     s6    

,r   )8__doc__collections.abcr   typingr   rs   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   vllm.v1.attention.backendr   
interfacesr   utilsr   r   r   r   r   r   r    rB   ru   r   r   r5   r5   r5   r6   <module>   sF   	%kH 