import torch
import torch.nn.functional as F

from nemo.collections.nlp.modules.common.megatron.adapters.parallel_adapters import (
    AdapterName,
    Lora4HtoHAdapterConfig,
    LoraHto4HAdapterConfig,
    MLPInfusedAdapterConfig,
)
from nemo.collections.nlp.modules.common.megatron.fused_bias_geglu import fused_bias_geglu
from nemo.collections.nlp.modules.common.megatron.fused_bias_gelu import fused_bias_gelu
from nemo.collections.nlp.modules.common.megatron.fused_layer_norm import get_layer_norm
from nemo.collections.nlp.modules.common.megatron.layer_norm_1p import LayerNorm1P
from nemo.collections.nlp.modules.common.megatron.module import MegatronModule
from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults, ApproxGELUActivation, erf_gelu
from nemo.collections.nlp.modules.common.megatron.utils import openai_gelu as openai_gelu_func
from nemo.collections.nlp.modules.common.megatron.utils import squared_relu
from nemo.core import adapter_mixins

try:
    from apex.normalization import MixedFusedRMSNorm
    from apex.transformer import parallel_state, tensor_parallel

    HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
    HAVE_APEX = False

    # Fake the missing apex classes so attribute access fails gracefully.
    ModelType = AttnMaskType = AttnType = LayerType = ApexGuardDefaults()

try:
    from megatron.core import ModelParallelConfig, parallel_state, tensor_parallel
    from megatron.core.parallel_state import get_tensor_model_parallel_world_size

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    ModelParallelConfig = ApexGuardDefaults
    HAVE_MEGATRON_CORE = False


class ParallelMLP(MegatronModule, adapter_mixins.AdapterModuleMixin):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
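
    A rough functional equivalent, ignoring tensor parallelism, fusion,
    dropout, adapters, and normformer:

        output = dense_4h_to_h(activation(dense_h_to_4h(input)))

    Example construction (illustrative only -- the argument values below are
    hypothetical, and Megatron tensor-parallel state must already be
    initialized):

        mlp = ParallelMLP(
            config=model_parallel_config,
            init_method=torch.nn.init.xavier_normal_,
            output_layer_init_method=torch.nn.init.xavier_normal_,
            hidden_size=1024,
            ffn_hidden_size=4096,
        )
        output, output_bias = mlp(hidden_states)  # hidden_states: [s, b, h]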
    TFgelupre_ln	layernormh㈵>        configc              	      s&  t t| j|d |
| _|| _|| _|| _|| _|| _|
| _|| _	|| _
| tjtjtjg g d}|
|vrAtd|
 d| |
dv | _tj|| jrP|d n||d|d|d	| _|
d
v rktj|||d|d|d	| _|
dv | _|
dv }|r|rtd|
 d| jr|	r| jrtd|
 d|r|std|| _|rt| _n0|
dv rtj| _n'|
dkrt| _n|	rt| _n|
dv rtj| _n|
dv rtj| _n|
dkrt | _tj!|||d|d|d| _"|dkr|dkrt#|t$  ||| _d S |dkrt%|t$  ||j&d| _d S t'|t$  || _d S d S )Nr   )	r   gegluregluswiglusquared-relu
fast-geglufast-swiglu
fast-regluapprox-geluzActivation z* not supported. Supported activations are )r"   r#   r$      FT)r   gather_outputinit_methodskip_bias_addbias)r   r   r    )r   r   r    r"   r$   r#   )r   r    z'Cannot use bias_activation_fusion with z. activation. Please turn bias gelu fusion off.zVCannot use onnx_safe with specificed activation function and bias_activation_fusion : z Please turn onnx safe off.zkCannot use bias_activation_fusion without bias terms. Please set bias=True or bias_activation_fusion=False.)r   r   r"   r%   )r   r$   )r    r#   r!   r   input_is_parallelr(   r)   r*   
normformerr   layernorm1p)sequence_parallel_enabled)(superr   __init__
activationr*   transformer_block_typenormalizationlayernorm_epsilonpersist_layer_normdropoutdtypeset_accepted_adapter_typesr   _target_r   r   
ValueErrorfast_glu_activationr   ColumnParallelLineardense_h_to_4hdense_h_to_4h_2glu_activation_familybias_activation_fusionopenai_gelu_funcactivation_funcFr   r   r   relusilur   RowParallelLineardense_4h_to_hr   r   r	   sequence_parallelr   )selfr   r(   output_layer_init_methodhidden_sizeffn_hidden_sizer8   rA   r   	onnx_safer2   r*   r3   r4   r5   r6   r7   supported_activations"bias_activation_fusion_unavailable	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/nlp/modules/common/megatron/mlp.pyr1   E   s   













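
    # Note on the *GLU family (added explanatory comment): these activations
    # compute activation(W_1 x) * (W_2 x). The 'fast-*' variants fuse W_1 and
    # W_2 into the single dense_h_to_4h projection and split its output in
    # half with torch.chunk in forward(), while the plain 'geglu'/'reglu'/
    # 'swiglu' variants run the separate dense_h_to_4h_2 projection instead.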
zParallelMLP.__init__c                 C   s  |  |\}}|  r#| tj}|r#| jtj d r#||}|| }| jr?tj|ddd\}}|d ur>tj|ddd\}}n| j	rL| jsL| 
|\}}| jrg| jdkrZt||}n<| jdv rft||||}n/| j	r| js|d ur}| || ||  }n| || }n|d ur| || }n| |}| jdkrtj|| j| jd}| tj}|r||}| jd	kr| |}| |\}	}
|  r| tj}|r| jtj d r||}|	| }	|	|
fS )
Nenabledr&   dimr   )r   r"   r   )ptrainingr-   )r>   is_adapter_availableget_adapter_moduler   LORA_Hto4H_ADAPTERadapter_cfgr<   torchchunkr@   r?   rA   r2   r   r   rC   r7   rD   rZ   MLP_INFUSEDr3   r4   rH   LORA_4HtoH_ADAPTER)rJ   hidden_statesintermediate_parallelbias_parallellora_dense_h_to_4h_adapterlora_intermediate_parallelintermediate_parallel_2bias_parallel_2infused_adapteroutputoutput_biaslora_dense_4h_to_h_adapterlora_outputrS   rS   rT   forward   sX   





zParallelMLP.forward)
__name__
__module____qualname____doc__r_   float32r   r1   ro   __classcell__rS   rS   rQ   rT   r   =   s$     r   c                       sT   e Zd ZdZ											dd	ef fd
dZdd ZedddZ  Z	S )	SwitchMLPzCTop-1 MoE
    
    Curently supports Sinkhorn based expert routing.TFr   r   r   r   r   r   c                    s   t t| j|d || _tj| _tj|||d|d|d| _||||||||	|
||||||d t	j
 fddt|D | _d S )Nr   Fr+   )r   r(   rK   rL   rM   rA   r   rN   r2   r*   r3   r4   r5   r6   r7   c                    s   g | ]	}t d i  qS )rS   )r   ).0_mlp_argsrS   rT   
<listcomp>O  s    z&SwitchMLP.__init__.<locals>.<listcomp>)r0   rv   r1   num_expertssinkhorn
route_algor   rG   routerr_   nn
ModuleListrangeexperts)rJ   r   r|   r(   rK   rL   rM   rA   r   rN   r2   r*   r3   r4   r5   r6   rI   r7   rQ   ry   rT   r1     s:   $zSwitchMLP.__init__c                 C   s  |j }| |\}}|d| j}| jrOt  | | j	tj
d}tj|dd\}}W d    n1 s8w   Y  t|}|t|d|f }nt|}tj|dd\}}t|d}|d|d }|dk }||d d f }	| jd |	\}
}||
}tj||
jd}tj||jd}|
||d d f< |||d d f< t| jD ]2\}}|dkrq||k }||d d f }	||	\}
}||
}|
||d d f< |||d d f< q|| }|| }||}||}||fS )NrV   )r8      rW   r   )shaper   viewr|   rZ   r_   no_gradr~   detachtort   maxsigmoidarangesize	unsqueezenonzeror   	expand_as
empty_liker8   	enumerate)rJ   rc   hidden_shaperouterx   
norm_routemax_indmax_problocal_indiceshiddenrk   rl   output_totaloutput_bias_total
expert_numexpertrS   rS   rT   ro   Q  sL   






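
    # The Sinkhorn iteration below alternately rescales the rows and columns
    # of exp(cost) until both marginals are approximately uniform. Using the
    # balanced matrix only for expert selection discourages the router from
    # collapsing all tokens onto a few experts.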
zSwitchMLP.forward-C6?c                 C   s   t |}t j|d|j|jd}t j|d|j|jd}d}d}|}||krbd|d d t || d|  }d|d d t |d| d|  }t t 	|| }|}||ks)|| |d S )z$Megatron-LMs sinkhorn implementationr   )devicer8   r   g:0yE>g    eA)
r_   exponesr   r   r8   sumr   meanabs)clscosttold0d1epserrord1_oldrS   rS   rT   r}     s   
&,zSwitchMLP.sinkhorn)TFFr   Tr   r   r   FFr   )r   )
rp   rq   rr   rs   r   r1   ro   classmethodr}   ru   rS   rS   rQ   rT   rv     s&    5/rv   )1r_   torch.nn.functionalr   
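

if __name__ == "__main__":
    # Illustrative smoke test of the routing normalization only (an added
    # sketch, not part of the library: ParallelMLP/SwitchMLP need an
    # initialized Megatron parallel state, so they are not constructed here).
    logits = torch.randn(16, 4)  # [num_tokens, num_experts]
    balanced = SwitchMLP.sinkhorn(logits)
    # Each row should sum to roughly 1/num_tokens and each column to roughly
    # 1/num_experts, i.e. the expert assignment mass is balanced.
    print("column sums:", balanced.sum(dim=0))
    print("row sums:", balanced.sum(dim=1))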