o
    eit-                     @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- e.e/Z0G dd de"Z1G dd de#Z2G dd deZ3G dd de Z4G dd de'Z5G d d! d!e+Z6G d"d# d#ej7Z8G d$d% d%e!Z9eG d&d' d'eZ:eG d(d) d)e)Z;G d*d+ d+e(eZ<g d,Z=dS )-zPyTorch OLMoE model.    )CallableN)nn   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask)MoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_grouped_mm_availablelogging)OutputRecorder   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)MixtralExpertsMixtralForCausalLMMixtralModel)Qwen2MoeTopKRouter   )OlmoeConfigc                       s   e Zd Zd fdd	Z  ZS )OlmoeRMSNormh㈵>c                    s   t  || d S N)super__init__)selfhidden_sizeeps	__class__ e/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/olmoe/modular_olmoe.pyr%   .   s   zOlmoeRMSNorm.__init__)r"   )__name__
__module____qualname__r%   __classcell__r+   r+   r)   r,   r!   -   s    r!   c                   @      e Zd ZdS )OlmoeRotaryEmbeddingNr-   r.   r/   r+   r+   r+   r,   r2   2       r2   c                   @   r1   )OlmoeMLPNr3   r+   r+   r+   r,   r5   6   r4   r5   c                       s   e Zd ZddededB f fddZ		ddejdeejejf dejdB d	e	dB d
ej
dB dee deejejdB eej dB f fddZ  ZS )OlmoeAttentionNconfig	layer_idxc                    sB   t  || t|j|jd| _t|j|j |j |jd| _d S )Nr(   )	r$   r%   r!   r'   rms_norm_epsq_normnum_attention_headsnum_key_value_headsk_normr&   r7   r8   r)   r+   r,   r%   ;   s
   zOlmoeAttention.__init__hidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc                 K   s  |j d d }g |d| jR }| | |}	| | |}
| |}| jjd urP|	j	| jj | jjd |
j	| jj | jjd |j	| jj | jjd |	j
| dd}	|
j
| dd}
|j
| dd}|\}}t|	|
||\}	}
|d ur|||d}||
|| j|\}
}t| jjt}|| |	|
||f| jsdn| j| jt| jdd d|\}}|jg |dR   }| |}||fS )	N)minmaxr   r   )sincosrD           sliding_window)dropoutscalingrM   )shapehead_dimr;   q_projr>   k_projv_projr7   clip_qkvclamp_view	transposer   updater8   r   get_interface_attn_implementationr   trainingattention_dropoutrO   getattrreshape
contiguouso_proj)r&   r@   rA   rB   rC   rD   rE   input_shapehidden_shapequery_states
key_statesvalue_statesrK   rJ   cache_kwargsattention_interfaceattn_outputattn_weightsr+   r+   r,   forwardB   sH   	
	

zOlmoeAttention.forwardr#   )NN)r-   r.   r/   r    intr%   torchTensortupler   
LongTensorr   r   rk   r0   r+   r+   r)   r,   r6   :   s&    r6   c                   @   r1   )OlmoeExpertsNr3   r+   r+   r+   r,   rq   w   r4   rq   c                   @   r1   )OlmoeTopKRouterNr3   r+   r+   r+   r,   rr   {   r4   rr   c                       s2   e Zd Z fddZdejdejfddZ  ZS )OlmoeSparseMoeBlockc                    s"   t    t|| _t|| _d S r#   )r$   r%   rr   gaterq   expertsr&   r7   r)   r+   r,   r%      s   

zOlmoeSparseMoeBlock.__init__r@   rF   c           	      C   sD   |j \}}}|d|}| |\}}}| ||||||}|S )NrG   )rP   rW   rt   ru   r_   )	r&   r@   
batch_sizesequence_length
hidden_dim_top_k_weightstop_k_indexfinal_hidden_statesr+   r+   r,   rk      s   zOlmoeSparseMoeBlock.forward)r-   r.   r/   r%   rm   rn   rk   r0   r+   r+   r)   r,   rs      s    rs   c                       s&   e Zd Zdedef fddZ  ZS )OlmoeDecoderLayerr7   r8   c                    sV   t  || |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
d S )N)r7   r8   r9   )r$   r%   r'   r6   	self_attnrs   mlpr!   r:   input_layernormpost_attention_layernormr?   r)   r+   r,   r%      s   
zOlmoeDecoderLayer.__init__)r-   r.   r/   r    rl   r%   r0   r+   r+   r)   r,   r~      s    r~   c                   @   s`   e Zd ZU eed< dZdZdgZdgZdZ	dZ
eeddeedZe ZdZe d	d
 ZdS )OlmoePreTrainedModelr7   modelTr~   rC   r   )index)router_logitsr@   
attentionsc                 C   sn   t | | t|tr#tj|jd| jjd tj|j	d| jjd d S t|t
r5tj|jd| jjd d S d S )NrL   )meanstd)r   _init_weights
isinstancerq   initnormal_gate_up_projr7   initializer_range	down_projrr   weight)r&   moduler+   r+   r,   r      s   

z"OlmoePreTrainedModel._init_weightsN)r-   r.   r/   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar   rr   r~   r6   _can_record_outputsr   _can_compile_fullgraph_supports_attention_backendrm   no_gradr   r+   r+   r+   r,   r      s"   
 
r   c                       s   e Zd Zdef fddZ							ddejdB dejdB dejdB dedB d	ej	dB d
e
dB dejdB dee defddZ  ZS )
OlmoeModelr7   c                    sd   t    t j j| j| _t fddt	 j
D | _t j jd| _t d| _d S )Nc                    s   g | ]}t  |qS r+   )r~   ).0r8   r7   r+   r,   
<listcomp>   s    z'OlmoeModel.__init__.<locals>.<listcomp>r9   r   )r$   r%   r   	Embedding
vocab_sizer'   padding_idxembed_tokens
ModuleListrangenum_hidden_layerslayersr!   r:   normr2   
rotary_embrv   r)   r   r,   r%      s   zOlmoeModel.__init__N	input_idsrB   position_idsrC   inputs_embeds	use_cacherD   rE   rF   c              
   K   s   |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|}| ||}| jd | jj D ]}||f||
||||d|}qb| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )device)r7   r   rB   rD   rC   r   )rA   rB   r   rC   r   rD   )last_hidden_staterC   )
ValueErrorr   r7   r   get_seq_lengthrm   arangerP   r   	unsqueezer	   r   r   r   r   r
   )r&   r   rB   r   rC   r   r   rD   rE   past_seen_tokenscausal_maskr@   rA   decoder_layerr+   r+   r,   rk      sR   

	
zOlmoeModel.forward)NNNNNNN)r-   r.   r/   r    r%   rm   rp   rn   r   FloatTensorboolr   r   r
   rk   r0   r+   r+   r)   r,   r      s8    	
r   c                       s0   e Zd ZddiZ fddZ fddZ  ZS )OlmoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                    s"   t  | t|| _|j| _d S r#   )r$   r%   r   r   num_expertsrv   r)   r+   r,   r%     s   
zOlmoeForCausalLM.__init__c                    s   t  jdi |S )u  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OlmoeForCausalLM

        >>> model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924")
        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'Hey, are you conscious? Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m'
        ```
        Nr+   )r$   rk   )r&   super_kwargsr)   r+   r,   rk     s   zOlmoeForCausalLM.forward)r-   r.   r/   _tied_weights_keysr%   rk   r0   r+   r+   r)   r,   r      s    r   )r   r   r   )>__doc__collections.abcr   rm   r    r   r   cache_utilsr   r   
generationr   masking_utilsr	   modeling_outputsr
   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.output_capturingr   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   mixtral.modeling_mixtralr   r   r   qwen2_moe.modeling_qwen2_moer   configuration_olmoer    
get_loggerr-   loggerr!   r2   r5   r6   rq   rr   Modulers   r~   r   r   r   __all__r+   r+   r+   r,   <module>   sB    
=
H#