o
    ei_                     @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3 e"4e5Z6G dd de-Z7G dd de.Z8G dd dej9Z:G dd dej9Z;G d d! d!ej9Z<G d"d# d#ej9Z=G d$d% d%ej9Z>G d&d' d'e)Z?e G d(d) d)e,Z@e G d*d+ d+e+ZAG d,d- d-e@eZBG d.d/ d/ee@ZCg d0ZDdS )1zPyTorch JetMoe model.    )CallableN)nn)
functional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask) GenericForSequenceClassification)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )LlamaDecoderLayer)MixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralRotaryEmbeddingapply_rotary_pos_embeager_attention_forwardload_balancing_loss_func   )JetMoeConfigc                   @      e Zd ZdS )JetMoeRMSNormN__name__
__module____qualname__ r*   r*   g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/jetmoe/modular_jetmoe.pyr%   4       r%   c                   @   r$   )JetMoeRotaryEmbeddingNr&   r*   r*   r*   r+   r-   8   r,   r-   c                       s6   e Zd Zdedededdf fddZdd	 Z  ZS )
JetMoeParallelExpertsnum_experts
input_sizeoutput_sizereturnNc                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the JetMoeParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
super__init__r   	Parametertorchemptyweightr/   r0   r1   )selfr/   r0   r1   	__class__r*   r+   r4   =   s
   

zJetMoeParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ]}|t|| | j|  qtj|dd}|S )a  
        Forward pass of the JetMoeParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   dim)	splitranger/   appendFlinearr8   r6   cat)r9   inputsexpert_size
input_listoutput_listiresultsr*   r*   r+   forwardT   s   zJetMoeParallelExperts.forwardr'   r(   r)   intr4   rJ   __classcell__r*   r*   r:   r+   r.   <   s    r.   c                       s2   e Zd Zdededef fddZdd Z  ZS )JetMoeTopKGatingr0   r/   top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.

        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        FbiasN)r3   r4   r/   r0   rO   r   Linearlayer)r9   r0   r/   rO   r:   r*   r+   r4   j   s
   
zJetMoeTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr"   r<   r   dtypedevicetrunc)rounding_mode)rS   floattopkrO   r6   softmaxtype_aszerossizer/   rU   rV   scatterlongsumtolistflattensortdiv)r9   hidden_stateslogitstop_k_logitstop_k_indicestop_k_gatesr]   gatesrE   top_k_experts_index_sorted_expertsbatch_indexbatch_gatesr*   r*   r+   rJ   ~   s   zJetMoeTopKGating.forwardrK   r*   r*   r:   r+   rN   i   s    rN   c                       s.   e Zd ZdZdef fddZdd Z  ZS )	JetMoeMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    s   t    |j| _|j| _t|j | _tj	
t| j| _t|j| j| jd | _t|j| j| j| _t| j|j|jd| _d S )Nr   r0   r/   rO   )r3   r4   hidden_sizer0   intermediate_sizer   activation_function
activationr6   r   r5   r7   rQ   r.   num_local_expertsinput_linearoutput_linearrN   num_experts_per_tokrouterr9   rr   r:   r*   r+   r4      s   
zJetMoeMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}	|| }
| |
|}|jddd}| |d |d  }| ||}||dddf  }tj|| | j	f|j
|jd}|d||}|||| j	}|| j }|S )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r   r<   r   r"   NrT   )r^   reshaper|   ry   chunkrw   rz   r6   r]   r0   rU   rV   	index_addviewrQ   )r9   layer_inputbszlengthemb_sizerm   ro   rp   rE   router_logitsexpert_inputsrf   chunked_hidden_statesexpert_outputsr]   layer_outputr*   r*   r+   rJ      s   
zJetMoeMoE.forward)r'   r(   r)   __doc__r#   r4   rJ   rM   r*   r*   r:   r+   rq      s    rq   c                       s>   e Zd ZdZdef fddZdd Zdd Zd	d
 Z  Z	S )	JetMoeMoAz
    A Sparsely gated mixture of attention layer with pairs of query- and output-projections as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    rr   c                    s   t    |j| _|j| _|j|j | _|j| _	t
jt
| j| _t| j| j| j| _t| j| j| j| _t| j| j| j	d| _d S )Nrs   )r3   r4   rx   r/   rt   r0   kv_channelsnum_key_value_headsr{   rO   r6   r   r5   r7   rQ   r.   ry   rz   rN   r|   r}   r:   r*   r+   r4      s   
zJetMoeMoA.__init__c                 C   s   |  \}}}|d|}| |\}}}}}	||||f}
|| }| ||}tj|| | j | jf|j|j	d}|
d||}|||| jd}||	|
fS )z
        Map inputs to attention experts according to routing decision and compute query projection inside each experts.
        r~   rT   r   )r^   r   r|   ry   r6   r]   rO   rt   rU   rV   r   r   )r9   r   r   r   r   rn   ro   rp   rE   r   	topo_infor   r   r]   r   r*   r*   r+   map   s   
zJetMoeMoA.mapc                 C   s   |  \}}}}|d|}|\}}}	}
|| }| ||
}||	dddf  }tj|| | jf|j|jd}|d||}|	||| j}|| j
 }|S )zu
        Compute output projection inside each attention experts and merge the outputs of different experts.
        r~   NrT   r   )r^   r   rz   r6   r]   r0   rU   rV   r   r   rQ   )r9   r   r   r   r   krt   rn   ro   rp   rE   r   r   r]   r   r*   r*   r+   reduce  s   
zJetMoeMoA.reducec                 C   s   t d)Nz-This module doesn't support call and forward.)NotImplementedError)r9   r   r*   r*   r+   rJ     s   zJetMoeMoA.forward)
r'   r(   r)   r   r#   r4   r   r   rJ   rM   r*   r*   r:   r+   r      s    r   c                       s   e Zd ZdZddededB f fddZ				ddejdejdB d	ej	dB d
e
dB dej	dB deejejdB eej dB f fddZ  ZS )JetMoeAttentionzH
    Multi-headed attention from 'Attention Is All You Need' paper.
    Nrr   	layer_idxc                    s   t    || _|| _d| _|du rtd| jj d d| _	|j
| _|j| _|j|j | _|j| _|j| _|j| _| jd | _t|| _tjj|j| jd dd	| _dS )
z
        Initialize the JetMoeAttention module.

        Args:
            config:
                Configuration object with model hyperparameters.
            layer_idx:
                Index of the layer in the model.
        TNzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r"   g      r   FrP   )r3   r4   rr   r   	is_causalloggerwarning_oncer;   r'   num_key_value_groupsr{   rO   attention_dropoutr   r   kv_projection_sizenum_attention_heads	num_headshead_dimscalingr   expertsr6   r   rR   rt   kv_projr9   rr   r   r:   r*   r+   r4   %  s$   


 zJetMoeAttention.__init__rf   attention_maskposition_embeddingspast_key_valuescache_positionr2   c                 K   sp  |j d d }g |d| jR }| j|\}	}
}| |jddd\}}|	|dd}	||dd}||dd}|\}}t|	|||\}	}|d urc|||d}|	||| j
|\}}t| jjt}|d| jdd}|d| jdd}|| |	|||f| jsdn| j| jd|\}}|jg || jdR  }| j||}|jg |dR  }|||
fS )Nr~   r   r<   r"   )sincosr           )dropoutr   )shaper   r   r   r   r   r   	transposer   updater   r   get_interfacerr   _attn_implementationr    repeatrO   trainingr   r   r   )r9   rf   r   r   r   r   kwargsinput_shapehidden_shapequery_statesr   r   
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightsr*   r*   r+   rJ   F  sB   	

zJetMoeAttention.forwardN)NNNN)r'   r(   r)   r   r#   rL   r4   r6   Tensor
LongTensorr   tuplerJ   rM   r*   r*   r:   r+   r      s(    $r   c                       s   e Zd ZddededB f fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfddZ  ZS )JetMoeDecoderLayerNrr   r   c                    sD   t  || t|j| _t||| _t|j| _t|| _	| `
d S r   )r3   r4   r%   rt   input_layernormr   self_attentionpost_attention_layernormrq   mlp	self_attnr   r:   r*   r+   r4   |  s   
zJetMoeDecoderLayer.__init__Frf   r   position_idsr   	use_cacher   r   r   r2   c              
   K   s`   |}	|  |}| jd|||||||d|\}}
}
|	| }|}	| |}| |}|	| }|S )N)rf   r   r   r   r   r   r   r*   )r   r   r   r   )r9   rf   r   r   r   r   r   r   r   residualrm   r*   r*   r+   rJ     s&   



zJetMoeDecoderLayer.forwardr   )NNNFNN)r'   r(   r)   r#   rL   r4   r6   r   r   r   boolr   r   r   rJ   rM   r*   r*   r:   r+   r   {  s6    	
r   c                   @   sn   e Zd ZU eeddeeddgeeedddZee	d< dZ
dZd	gZd
gZdZdZdZe dd ZdS )JetMoePreTrainedModelr   )index   r"   )r   rf   
attentionsrr   modelFr   r   Tc                 C   sR   t | | t|trtj|jd| jjd dS t|t	t
B r't|j dS dS )zInitialize the weights.r   )meanstdN)r   _init_weights
isinstancer.   initnormal_r8   rr   initializer_ranger   rq   zeros_rQ   )r9   moduler*   r*   r+   r     s   
z#JetMoePreTrainedModel._init_weightsN)r'   r(   r)   r   r   rN   r   _can_record_outputsr#   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr6   no_gradr   r*   r*   r*   r+   r     s   
 
r   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )JetMoeModelrr   c                    sp   t     j| _ j| _t j j| j| _t	 fddt
 jD | _ j| _t j jd| _d S )Nc                    s   g | ]}t  |qS r*   )r   ).0r   rr   r*   r+   
<listcomp>  s    z(JetMoeModel.__init__.<locals>.<listcomp>)eps)r3   r4   pad_token_idpadding_idx
vocab_sizer   	Embeddingrt   embed_tokens
ModuleListr?   num_hidden_layerslayersr   r%   rms_norm_epsnormr}   r:   r   r+   r4     s   zJetMoeModel.__init__N	input_idsr   r   r   inputs_embedsr   r   r   r2   c              
   K   s   |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|}| ||}| jd | jj D ]}||f||
||||d|}qb| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r"   )rV   )rr   r   r   r   r   r   )r   r   r   r   r   r   )last_hidden_stater   )
ValueErrorr	   rr   r   get_seq_lengthr6   aranger   rV   	unsqueezer   
rotary_embr   r   r   r   )r9   r   r   r   r   r   r   r   r   past_seen_tokenscausal_maskrf   r   decoder_layerr*   r*   r+   rJ     sR   

	
zJetMoeModel.forward)NNNNNNN)r'   r(   r)   r#   r4   r   r   r   r6   r   r   r   FloatTensorr   r   r   r   rJ   rM   r*   r*   r:   r+   r     s>    	
r   c                       s   e Zd ZddiZ fddZee										ddejdB d	ej	dB d
ejdB de
dB dejdB dejdB dedB dejdB deej	B dedB defddZ  ZS )JetMoeForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                    s`   t  | t|| _|j| _|j| _tj|j|jdd| _	|j
| _
|j| _|j| _|   d S )NFrP   )r3   r4   r   r   r   aux_loss_coefr   rR   rt   lm_headtie_word_embeddingsrx   r/   r{   	post_initr}   r:   r*   r+   r4     s   
zJetMoeForCausalLM.__init__Nr   Fr   r   r   r   r   labelsr   r   logits_to_keepoutput_router_logitsr2   c                 K   s   | j d||||||||
d|}|j}t|	trt|	 d n|	}| |d d |d d f }d }|d urC| j||fd| jji|}d }|
r`t	|j
| j| j|}|d ur`|| j||j 7 }t||||j|j|j|j
dS )N)r   r   r   r   r   r   r   r  r   )lossaux_lossrg   r   rf   r   r   r*   )r   r   r   rL   slicer   loss_functionrr   r   r!   r   r/   r{   r   torV   r   r   rf   r   )r9   r   r   r   r   r   r  r   r   r  r  r   outputsrf   slice_indicesrg   r  r  r*   r*   r+   rJ     sX   	zJetMoeForCausalLM.forward)
NNNNNNNNr   F)r'   r(   r)   _tied_weights_keysr4   r   r   r6   r   r   r   r   r   rL   r   rJ   rM   r*   r*   r:   r+   r     sL    	
r   c                   @   r$   )JetMoeForSequenceClassificationNr&   r*   r*   r*   r+   r  _  s    r  )r   r   r   r  )Er   collections.abcr   r6   r   torch.nnr   rA    r   r   activationsr   cache_utilsr   r	   
generationr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   llama.modeling_llamar   mixtral.modeling_mixtralr   r   r   r   r   r    r!   configuration_jetmoer#   
get_loggerr'   r   r%   r-   Moduler.   rN   rq   r   r   r   r   r   r   r  __all__r*   r*   r*   r+   <module>   sH   $	
-1:L[+NP