"""PyTorch AFMoE model."""

from collections.abc import Callable

import torch
from torch import nn

from ... import initialization as init
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import MoeModelOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from ..gpt_oss.modeling_gpt_oss import GptOssRMSNorm
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaForCausalLM,
    LlamaRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeMLP
from .configuration_afmoe import AfmoeConfig


logger = logging.get_logger(__name__)


class AfmoeRotaryEmbedding(LlamaRotaryEmbedding):
    pass


class AfmoeRMSNorm(GptOssRMSNorm):
    pass


class AfmoeMLP(Qwen2MoeMLP):
    pass


class AfmoeTokenChoiceRouter(nn.Module):
    """
    Token-choice top-K router for MoE routing.

    This router assigns each token to the top-K experts based on sigmoid scores, matching the released checkpoints.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.num_experts = config.num_experts
        self.route_scale = config.route_scale
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
zAfmoeTokenChoiceRouter.__init__hidden_statesexpert_biasc           	      C   s   |j \}}}|d|}t| |tj}tj|| | jdd\}}|j	d|d}|j
dddd }|| }|| j }||fS )Nr   )kdim)r@   indexT)r@   keepdimg#B;)shapeviewtorchsigmoidr7   tofloat32topkr2   gathersumr4   )	r9   r<   r=   _
hidden_dimscoresselected_experts
top_scoresdenominatorr%   r%   r&   forwardJ   s   
zAfmoeTokenChoiceRouter.forward)	r"   r#   r$   __doc__r/   rE   TensorrR   __classcell__r%   r%   r:   r&   r*   ;   s    r*   c                       sH   e Zd ZdZdef fddZdejdejdejdejfd	d


class AfmoeExperts(nn.ModuleList):
    """
    Container holding the routed experts.

    This mirrors the Experts pattern used across other MoE models to ease checkpoint conversion.
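
    The forward pass uses a sort-based dispatch: every (token, expert) assignment is
    sorted by expert id so that each expert runs once over one contiguous slice of the
    dispatched tokens, and the weighted expert outputs are scatter-added back into
    token order.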
    r0   c                    sB   t    |j| _|j| _t| jD ]}| t||jd qd S )N)intermediate_size)	r.   r/   r1   r2   r3   rangeappendr)   moe_intermediate_size)r9   r0   rL   r:   r%   r&   r/   _   s   
zAfmoeExperts.__init__r<   rO   routing_weightsreturnc                 C   sh  |j \}}}|dkr||d|S |d|}|j d }tj|j d |jtjd|}	|d}
|d}tj	|
dd}|	| }	|
| }
|| }|
d|	}t|}tj|
dd\}}d}t| | D ]!\}}|dkrqqh|| }||| }| | |}||||< |}qh|tj|d |j}t|}|	d|}|d|| ||||S )z
        Args:
            hidden_states: (batch, seq, hidden)
            selected_experts: (batch, seq, top_k)
            routing_weights: (batch, seq, top_k)
        r   r>   )devicedtypeT)stable)return_counts)rC   	new_zerosrD   rE   aranger]   longrepeat_interleavereshapeargsortindex_select
zeros_likeunique_consecutiveziptolistrG   rH   	unsqueezer^   	expand_asscatter_add_)r9   r<   rO   r[   
batch_sizeseq_lenrM   hidden_states_flatr2   token_indicesexpert_indicessortingdispatched_tokensexpert_outputsunique_expertscountsstart	expert_idcountendexpert_inputexpert_outputweighted_outputs
aggregatedscatter_indicesr%   r%   r&   rR   f   sB   	




zAfmoeExperts.forward)
r"   r#   r$   rS   r   r/   rE   rT   rR   rU   r%   r%   r:   r&   rV   X   s    rV   c                       s(   e Zd ZdZ fddZdd Z  ZS )AfmoeMoEz
    Mixture of Experts (MoE) module for AFMoE.

    This module implements a sparse MoE layer with both shared experts (always active) and
    routed experts (activated based on token-choice routing).
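
    For hidden states `x`, the layer computes `shared_experts(x) + sum_k w_k * expert_{e_k}(x)`,
    where `(w_k, e_k)` are the routing weights and expert indices produced by the
    token-choice router for each token.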
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.router = AfmoeTokenChoiceRouter(config)
        self.shared_experts = AfmoeMLP(
            config, intermediate_size=config.moe_intermediate_size * config.num_shared_experts
        )
        self.experts = AfmoeExperts(config)
        # Frozen bias used only for expert selection in the router.
        self.expert_bias = nn.Parameter(torch.zeros(config.num_experts), requires_grad=False)

    def forward(self, hidden_states):
        batch_size, seq_len, _ = hidden_states.shape

        top_scores, selected_experts = self.router(hidden_states, self.expert_bias)
        top_scores = top_scores.view(batch_size, seq_len, self.config.num_experts_per_tok)
        selected_experts = selected_experts.view(batch_size, seq_len, self.config.num_experts_per_tok)

        shared_output = self.shared_experts(hidden_states)
        routed_output = self.experts(hidden_states, selected_experts, top_scores)
        return shared_output + routed_output


class AfmoeAttention(LlamaAttention):
    """
    Multi-headed attention module with optional sliding window and gating.

    This attention mechanism supports both full attention and sliding window attention,
    and includes Q/K normalization and gating of the output. It inherits from [`LlamaAttention`] to minimize the amount
    of custom logic we need to maintain.
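
    Rotary position embeddings are only applied on sliding-window (local) layers, and the
    attention output is gated elementwise with `sigmoid(gate_proj(hidden_states))` before
    the output projection.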
    r0   	layer_idxc                    st   t  || |j| dk| _| jr|jnd | _t| j|jd| _t| j|jd| _	t
j|j|j| j dd| _d S )Nsliding_attentionepsFr,   )r.   r/   layer_typesis_local_attentionsliding_windowr(   head_dimrms_norm_epsq_normk_normr   r5   r6   num_attention_heads	gate_projr9   r0   r   r:   r%   r&   r/      s    zAfmoeAttention.__init__Nr<   position_embeddingsattention_maskpast_key_valuecache_positionkwargsr\   c                 K   sN  |j d d }g |d| jR }| ||}	| ||}
| ||}| |}| |	dd}	| 	|
dd}
|dd}| j
rV|\}}t|	|
||\}	}
|d urid|i}||
|| j|\}
}t| jjt}|| |	|
|f|| js}dn| j| j| jd|\}}|jg |dR   }|t| }| |}||fS )Nr>   r   r   r   g        )r   dropoutscalingr   )rC   r   q_projrD   k_projv_projr   r   	transposer   r   r   updater   r   get_interfacer0   _attn_implementationr   trainingattention_dropoutr   r   
contiguousrE   rF   o_proj)r9   r<   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesgate_statescossincache_kwargsattention_interfaceoutputattn_weightsattn_outputr%   r%   r&   rR      sF   	
	

zAfmoeAttention.forward)NN)r"   r#   r$   rS   r   intr/   rE   rT   tupler   
LongTensorr   r   rR   rU   r%   r%   r:   r&   r      s(    r   c                       s   e Zd ZdZdedef fddZ						ddejdejdB d	ej	dB d
e
dB dedB dej	dB deejejf dB dee dejfddZ  ZS )AfmoeDecoderLayerz
    AFMoE decoder layer with dual normalization.

    This layer applies self-attention followed by either a dense MLP or MoE block,
    with dual normalization (pre and post) around each component.
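
    Concretely, `h = x + post_attention_layernorm(attn(input_layernorm(x)))` is followed by
    `out = h + post_mlp_layernorm(mlp(pre_mlp_layernorm(h)))`.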
    r0   r   c                    s   t    |j| _|| _t||d| _|j| | _t|j|j	d| _
t|j|j	d| _t|j|j	d| _t|j|j	d| _||jk| _| jrMt|| _d S t|| _d S )N)r0   r   r   )r.   r/   r6   r   r   	self_attnr   attention_typer(   r   input_layernormpost_attention_layernormpre_mlp_layernormpost_mlp_layernormnum_dense_layersmoe_enabledr   mlpr)   r   r:   r%   r&   r/     s   
zAfmoeDecoderLayer.__init__Nr<   r   position_idsr   	use_cacher   r   r   r\   c              
   K   sr   |}	|  |}| jd|||||||d|\}}
| |}|	| }|}	| |}| |}| |}|	| }|S )N)r<   r   r   r   r   r   r   r%   )r   r   r   r   r   r   )r9   r<   r   r   r   r   r   r   r   residualrL   r%   r%   r&   rR     s*   






zAfmoeDecoderLayer.forward)NNNNNN)r"   r#   r$   rS   r   r   r/   rE   rT   r   r   boolr   r   r   FloatTensorrR   rU   r%   r%   r:   r&   r      s8    	
r   c                       s`   e Zd ZU dZeed< dZdgZdgZe	e
dZg dZdZdZdZdZdZ fd	d
Z  ZS )AfmoePreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
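
    In addition to the common initialization, the router gate weights and the MoE expert
    bias are zero-initialized (see `_init_weights` below).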
    """

    config: AfmoeConfig
    base_model_prefix = "model"
    _no_split_modules = ["AfmoeDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _can_record_outputs = {
        "hidden_states": AfmoeDecoderLayer,
        "attentions": AfmoeAttention,
    }
    _keep_in_fp32_modules = [
        "input_layernorm",
        "post_attention_layernorm",
        "pre_mlp_layernorm",
        "post_mlp_layernorm",
        "q_norm",
        "k_norm",
        "norm",
        "expert_bias",
    ]
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        if isinstance(module, AfmoeTokenChoiceRouter):
            init.zeros_(module.gate.weight)
        elif isinstance(module, AfmoeMoE):
            init.zeros_(module.expert_bias)


@auto_docstring
class AfmoeModel(AfmoePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AfmoeDecoderLayer`]

    Args:
        config: AfmoeConfig
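
    Example (illustrative, randomly initialized; assumes the AFMoE classes are exported from
    `transformers`):

    ```python
    >>> import torch
    >>> from transformers import AfmoeConfig, AfmoeModel

    >>> config = AfmoeConfig()
    >>> model = AfmoeModel(config)
    >>> input_ids = torch.randint(0, config.vocab_size, (1, 8))
    >>> outputs = model(input_ids=input_ids)
    >>> outputs.last_hidden_state.shape[-1] == config.hidden_size
    True
    ```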
    r0   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r%   )r   ).0r   r0   r%   r&   
<listcomp>x  s    z'AfmoeModel.__init__.<locals>.<listcomp>r   r   F)r.   r/   pad_token_idpadding_idx
vocab_sizer   	Embeddingr6   embed_tokens
ModuleListrX   num_hidden_layerslayersr(   r   r   r    
rotary_embgradient_checkpointing	post_initr8   r:   r   r&   r/   q  s   zAfmoeModel.__init__N	input_idsr   inputs_embedsr   r   r   r   r   r\   c              
   K   sD  |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| }
tse| j||||d}tdi |tdi |d}
|}| jjrs|| jjd  }| ||}| jD ]}||f|
|j |||||d	|}q|| |}t||r|d
S d d
S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r]   )r0   r   r   r   r   )full_attentionr   g      ?)r   r   r   r   r   r   )last_hidden_stater   r%   )
ValueErrorr   r0   r   get_seq_lengthrE   rb   rC   r]   rl   r   dictr	   r
   mup_enabledr6   r   r   r   r   r   )r9   r   r   r   r   r   r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsr<   r   decoder_layerr%   r%   r&   rR     sb   



zAfmoeModel.forward)NNNNNNN)r"   r#   r$   rS   r   r/   r   r   r   rE   r   rT   r   r   r   r   r   r   r   rR   rU   r%   r%   r:   r&   r   h  s@    	


class AfmoeForCausalLM(LlamaForCausalLM, AfmoePreTrainedModel, GenerationMixin):
    def __init__(self, config):
        AfmoePreTrainedModel.__init__(self, config)
        self.model = AfmoeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()


__all__ = ["AfmoeForCausalLM", "AfmoeModel", "AfmoePreTrainedModel"]