o
    eiI                  
   @   s,  d Z ddlZddlm  mZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- e.e/Z0			d5dej1e2ej1 B dB de3dB dej1dB dej1e3B fddZ4eG dd dej5Z6G dd dej5Z7G dd dej5Z8G d d! d!e*Z9G d"d# d#e+Z:G d$d% d%e#Z;G d&d' d'eZ<G d(d) d)e)Z=G d*d+ d+e(Z>G d,d- d-e$Z?G d.d/ d/e&Z@G d0d1 d1e'ZAG d2d3 d3e%ZBg d4ZCdS )6zPyTorch Mixtral model.    N)nn   )initialization)ACT2FN)CacheDynamicCache)use_experts_implementation)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsis_grouped_mm_availablelogging)OutputRecorder   )	MistralAttentionMistralForCausalLMMistralForQuestionAnswering MistralForSequenceClassificationMistralForTokenClassificationMistralModelMistralPreTrainedModelMistralRMSNormMistralRotaryEmbedding   )MixtralConfiggate_logitsnum_expertsattention_maskreturnc                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
ng|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||fd| }tj|| ddtj|dd }
t|	|
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS  )to).0
layer_gatecompute_devicer$   i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mixtral/modular_mixtral.py
<listcomp>W   s    z,load_balancing_loss_func.<locals>.<listcomp>dim)
isinstancetupledevicetorchcatr   
functionalsoftmaxtopkone_hotmeanfloatshapeexpandreshaper%   sum	unsqueeze)r    r!   top_kr"   concatenated_gate_logitsrouting_weights_selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthnum_hidden_layersexpert_attention_mask router_per_expert_attention_maskoverall_lossr$   r(   r*   load_balancing_loss_func5   s>   



rM   c                       sH   e Zd ZdZdef fddZdejdejdejdejfd	d
Z  Z	S )MixtralExpertsz2Collection of expert weights stored as 3D tensors.configc                    sn   t    |j| _|j| _|j| _t	t
| jd| j | j| _t	t
| j| j| j| _t|j | _d S )Nr   )super__init__num_local_expertsr!   hidden_size
hidden_dimintermediate_sizeintermediate_dimr   	Parameterr2   emptygate_up_proj	down_projr   
hidden_actact_fnselfrO   	__class__r$   r*   rQ      s   
 zMixtralExperts.__init__hidden_statestop_k_indextop_k_weightsr#   c                 C   s  t |}t  % t jjj|| jd}|ddd}t |j	ddd
 }W d    n1 s1w   Y  |D ]O}|d }|| jkrDq8t || \}}	||	 }
tj|
| j| jddd\}}| || }tj|| j| }|||	|d f  }|d|	||j q8|S )N)num_classesr   r   r   )r.   r,   r.   )r2   
zeros_likeno_gradr   r4   r7   r!   permutegreaterr=   nonzerowherelinearrY   chunkr\   rZ   
index_add_r%   dtype)r^   ra   rb   rc   final_hidden_statesrD   
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statesr$   r$   r*   forward   s$   


"zMixtralExperts.forward)
__name__
__module____qualname____doc__r   rQ   r2   Tensorry   __classcell__r$   r$   r_   r*   rN      s    	rN   c                       s$   e Zd Z fddZdd Z  ZS )MixtralTopKRouterc                    s>   t    |j| _|j| _|j| _t	t
| j| j| _d S N)rP   rQ   num_experts_per_tokr?   rR   r!   rS   rT   r   rW   r2   rX   weightr]   r_   r$   r*   rQ      s
   
zMixtralTopKRouter.__init__c                 C   sh   | d| j}t|| j}tjjj|	 dd}tj
|| jdd\}}||jddd }|}|||fS )Nr.   r,   T)r-   keepdim)r<   rT   Frl   r   r2   r   r4   r5   r9   r6   r?   r=   )r^   ra   router_logitsrouter_top_valuerouter_indicesrouter_scoresr$   r$   r*   ry      s   
zMixtralTopKRouter.forward)rz   r{   r|   rQ   ry   r   r$   r$   r_   r*   r      s    r   c                       s<   e Zd Z fddZdejdeejejf fddZ  ZS )MixtralSparseMoeBlockc                    s2   t    |j| _|j| _t|| _t|| _	d S r   )
rP   rQ   r   r?   router_jitter_noisejitter_noiser   rv   rN   expertsr]   r_   r$   r*   rQ      s
   

zMixtralSparseMoeBlock.__init__ra   r#   c                 C   s   |j \}}}| jr| jdkr|t|d| j d| j 9 }|d|j d }| |\}}}| |||}|	|||}|S )Nr   g      ?r.   )
r:   trainingr   r2   
empty_likeuniform_viewrv   r   r<   )r^   ra   rG   rH   rT   rB   rc   rb   r$   r$   r*   ry      s   "zMixtralSparseMoeBlock.forward)	rz   r{   r|   rQ   r2   r~   r0   ry   r   r$   r$   r_   r*   r      s    (r   c                   @      e Zd ZdS )MixtralRMSNormNrz   r{   r|   r$   r$   r$   r*   r          r   c                   @   r   )MixtralRotaryEmbeddingNr   r$   r$   r$   r*   r      r   r   c                   @   r   )MixtralAttentionNr   r$   r$   r$   r*   r      r   r   c                       s   e Zd Zdedef fddZ					ddejdeejejf dB dejdB d	ej	dB d
e
dB dej	dB dee dejfddZ  ZS )MixtralDecoderLayerrO   	layer_idxc                    sP   t    |j| _t||| _t|| _t|j|jd| _	t|j|jd| _
d S )N)eps)rP   rQ   rS   r   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernorm)r^   rO   r   r_   r$   r*   rQ      s   

zMixtralDecoderLayer.__init__Nra   position_embeddingsr"   position_idspast_key_valuescache_positionkwargsr#   c           
   	   K   s\   |}|  |}| jd||||||d|\}}	|| }|}| |}| |}|| }|S )N)ra   r   r"   r   r   r   r$   )r   r   r   r   )
r^   ra   r   r"   r   r   r   r   residualrB   r$   r$   r*   ry      s$   


	

zMixtralDecoderLayer.forward)NNNNN)rz   r{   r|   r   intrQ   r2   r~   r0   
LongTensorr   r   r   ry   r   r$   r$   r_   r*   r      s0    	r   c                   @   s6   e Zd Ze ZeeddeedZ	e
 dd ZdS )MixtralPreTrainedModelr   )index)r   ra   
attentionsc                 C   sj   t | | | jj}t|tr#tj|jd|d tj|j	d|d d S t|t
r3tj|jd|d d S d S )Ng        )r8   std)r   _init_weightsrO   initializer_ranger/   rN   initnormal_rY   rZ   r   r   )r^   moduler   r$   r$   r*   r     s   

z$MixtralPreTrainedModel._init_weightsN)rz   r{   r|   r   _can_compile_fullgraphr   r   r   r   _can_record_outputsr2   rg   r   r$   r$   r$   r*   r     s    
r   c                   @   st   e Zd Z							ddejdB dejdB dejdB dedB dejdB dedB dejdB d	e	e
 d
efddZdS )MixtralModelN	input_idsr"   r   r   inputs_embeds	use_cacher   r   r#   c              
   K   s  |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}| jj
d u rNtnt}
|
| j|||||d}|}| j||d}| jd | jj D ]}||f||||||d|}qm| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embeds)rO   r   r   )r1   )rO   r   r"   r   r   r   )r   )r"   r   r   r   r   r   )last_hidden_stater   )
ValueErrorr   rO   embed_tokensget_seq_lengthr2   aranger:   r1   r>   sliding_windowr	   r
   
rotary_emblayersrI   normr   )r^   r   r"   r   r   r   r   r   r   past_seen_tokensmask_functioncausal_maskra   r   decoder_layerr$   r$   r*   ry     sT   

	
zMixtralModel.forward)NNNNNNN)rz   r{   r|   r2   r   r~   r   FloatTensorboolr   r   r   ry   r$   r$   r$   r*   r     s6    	
r   c                       s   e Zd ZddiZ fddZ										ddejdB dejdB d	ejdB d
edB dej	dB dejdB de
dB de
dB dejdB deejB dee defddZ  ZS )MixtralForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                    s2   t  | t|| _|j| _|j| _|j| _d S r   )rP   rQ   r   modelrouter_aux_loss_coefrR   r!   r   r]   r_   r$   r*   rQ   ^  s
   
zMixtralForCausalLM.__init__Nr   r   r"   r   r   r   labelsr   output_router_logitsr   logits_to_keepr   r#   c                 K   s   |dur|n| j j}| jd||||||||	d|}|j}t|
tr)t|
 dn|
}| |dd|ddf }d}|durK| j||| j	fi |}d}|rht
|j| j| j|}|durh|| j||j 7 }t||||j|j|j|jdS )a~  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MixtralForCausalLM

        >>> model = MixtralForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r   r"   r   r   r   r   r   r   )lossaux_losslogitsr   ra   r   r   r$   )rO   r   r   r   r/   r   slicelm_headloss_function
vocab_sizerM   r   r!   r   r   r%   r1   r   r   ra   r   )r^   r   r"   r   r   r   r   r   r   r   r   r   outputsra   slice_indicesr   r   r   r$   r$   r*   ry   e  sN   &	zMixtralForCausalLM.forward)
NNNNNNNNNr   )rz   r{   r|   _tied_weights_keysrQ   r2   r   r~   r   r   r   r   r   r   r   ry   r   r$   r$   r_   r*   r   [  sL    		
r   c                   @   r   ) MixtralForSequenceClassificationNr   r$   r$   r$   r*   r     r   r   c                   @   r   )MixtralForTokenClassificationNr   r$   r$   r$   r*   r     r   r   c                   @   r   )MixtralForQuestionAnsweringNr   r$   r$   r$   r*   r     r   r   )r   r   r   r   r   r   )Nr   N)Dr}   r2   torch.nn.functionalr   r4   r    r   r   activationsr   cache_utilsr   r   integrationsr   masking_utilsr	   r
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.output_capturingr   mistral.modeling_mistralr   r   r   r   r   r   r   r   r   configuration_mixtralr   
get_loggerrz   loggerr~   r0   r   rM   ModulerN   r   r   r   r   r   r   r   r   r   r   r   r   __all__r$   r$   r$   r*   <module>   sZ   ,

R'(>_