o
    
۾i                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZ eeZG dd deZG dd dejZG dd dejZG dd deZdS )zBInference-only FlexOlmo model compatible with HuggingFace weights.    N)nn)
VllmConfig)$get_tensor_model_parallel_world_size)init_logger)FusedMoE)RMSNorm)ReplicatedLinear)OlmoeAttentionOlmoeForCausalLM)FlexOlmoConfigc                       s,   e Zd Zdddedef fddZ  ZS )FlexOlmoAttention prefixvllm_configr   c                   sZ   t  j||d |jj}t|tsJ t| j| j |j	d| _
t| j| j |j	d| _d S )Nr   r   eps)super__init__model_config	hf_config
isinstancer   r   total_num_kv_headshead_dimrms_norm_epsk_normtotal_num_headsq_normselfr   r   r   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/flex_olmo.pyr   !   s   zFlexOlmoAttention.__init__)__name__
__module____qualname__r   strr   __classcell__r#   r#   r!   r$   r       s    $r   c                       sF   e Zd ZdZdddedef fddZdejd	ejfd
dZ	  Z
S )FlexOlmoMoEa	  A tensor-parallel MoE implementation for FlexOlmo that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    r   r   r   r   c                   s   t    |jj}t|tsJ t }t|j|j	ddd | dd| _
t|j	|j|j|jddd || dtjd
| _|j| _d S )NFz.gate)biasreturn_biasquant_configr   Tz.experts)
num_expertstop_khidden_sizeintermediate_sizereduce_resultsrenormalizer-   tp_sizer   router_logits_dtype)r   r   r   r   r   r   r   r   r0   r.   gater   num_experts_per_tokr1   torchfloat32expertsr/   )r    r   r   r   r4   r!   r#   r$   r   8   s2   
	zFlexOlmoMoE.__init__hidden_statesreturnc                 C   sJ   |j }|j d }|d|}| |}| j|  | d}||S )N)r;   router_logits)shapeviewr6   r:   detachclonefloat)r    r;   
orig_shape
hidden_dimr>   final_hidden_statesr#   r#   r$   forwardY   s   



zFlexOlmoMoE.forward)r%   r&   r'   __doc__r   r(   r   r8   TensorrG   r)   r#   r#   r!   r$   r*   /   s    !r*   c                       sd   e Zd Zdddededdf fddZd	ejd
ejdejdB deejejdB f fddZ	  Z
S )FlexOlmoDecoderLayerr   r   r   r   r<   Nc                   sp   t    |jj}t|tsJ t|| dd| _t|j	|j
d| _t|j	|j
d| _t|| dd| _d S )Nz
.self_attnr   r   z.mlp)r   r   r   r   r   r   r   	self_attnr   r0   r   post_attention_layernormpost_feedforward_layernormr*   mlpr   r!   r#   r$   r   l   s   

zFlexOlmoDecoderLayer.__init__	positionsr;   residualc                 C   sJ   |}|  ||}| |}|| }|}| |}| |}|| }|d fS )N)rK   rL   rN   rM   )r    rO   r;   rP   r#   r#   r$   rG   }   s   


zFlexOlmoDecoderLayer.forward)r%   r&   r'   r   r(   r   r8   rI   tuplerG   r)   r#   r#   r!   r$   rJ   k   s     rJ   c                       s<   e Zd ZdZdeddededeej	 f fddZ
  ZS )	FlexOlmoForCausalLMFr   )r   
layer_typer   r   rS   c                   s   t  j|||d d S )N)r   r   rS   )r   r   )r    r   r   rS   r!   r#   r$   r      s   zFlexOlmoForCausalLM.__init__)r%   r&   r'   fall_back_to_pt_during_loadrJ   r   r(   typer   Moduler   r)   r#   r#   r!   r$   rR      s    rR   )rH   r8   r   vllm.configr   vllm.distributedr   vllm.loggerr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr    vllm.model_executor.models.olmoer	   r
   vllm.transformers_utils.configsr   r%   loggerr   rV   r*   rJ   rR   r#   r#   r#   r$   <module>   s    <&