o
    ߥi                     @   s*  d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZmZmZmZmZmZmZmZmZ d dlm Z  d dl!m"Z# d d	lm$Z$ d
Z%d
Z&e'dZ(G dd deZ)G dd deZ*G dd deZ+dd Z,dddZ-dd Z.dd Z/dS )    N)AnyDictListOptionalTuple)utils)	fsdp_wrap)FairseqEncoderFairseqEncoderDecoderModelFairseqIncrementalDecoderregister_modelregister_model_architecture)	AdaptiveSoftmax	BaseLayerFairseqDropoutLayerDropModuleList	LayerNormPositionalEmbeddingSinusoidalPositionalEmbeddingTransformerDecoderLayerTransformerEncoderLayer)checkpoint_wrapper)quant_noise)Tensori   g    חAc                       s  e Zd ZdZ fddZedd Zedd Zed+d	d
Z	edd Z
edd Z				d,dededee dee fddZejj	d+deeeeeeee  f  f dedeeeef  fddZ	d-deeee f deeeeee f f defd d!Z	d-deeee f deeeeee f f defd"d#Zd$eeef fd%d&Zdeeeee f  fd'd(Zdeeeeee f f fd)d*Z  ZS ).
CanmtModela%  

    Args:
        encoder (TransformerEncoder): the encoder
        decoder (TransformerDecoder): the decoder

    The CanmtModel provides the following named architectures and
    command-line arguments:

    .. argparse::
        :ref: fairseq.models.transformer_parser
        :prog:
    c                    s0   t  || || _d| _|| _|| _|| _d S )NT)super__init__argssupports_align_argsencoderdecodersecond_decoder)selfr   r   r    r!   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/canmt/canmt_model.pyr   -   s   
zCanmtModel.__init__c                 C   s  | j dt dd | j dtddd | j dtdd	d | j d
dtddd | j dtddd | j dtddd | j dtddd | j dtddd | j dtddd | j dddd | j dddd | j dtdd d | j d!tdd"d | j d#tdd$d | j d%tdd&d | j d'tdd(d | j d)dd*d | j d+dd,d | j d-tdd.d | j d/dd0d | j d1dd2d | j d3d4dd5d6 | j d7d8d9d:f | j d;tdd<d | j d=dd>d | j d?dd@d | j dAddBd | j dCddDd | j dEd4ddFd6 | j dGd4ddHd6 | j dItddJdKdL | j dMtddJdNdL | j dOdPdQdR | j dSdPdQdR | j dTtddJdUdL | j dVtddWdXdL | j dYtddJdZdL | j d[tdtd\dL dPS )]z+Add model-specific arguments to the parser.z--activation-fnzactivation function to use)choiceshelpz	--dropoutDzdropout probability)typemetavarr(   z--attention-dropoutz)dropout probability for attention weightsz--activation-dropoutz--relu-dropoutz,dropout probability after activation in FFN.z--encoder-embed-pathSTRz%path to pre-trained encoder embeddingz--encoder-embed-dimNzencoder embedding dimensionz--encoder-ffn-embed-dimz#encoder embedding dimension for FFNz--encoder-layersznum encoder layersz--encoder-attention-headsznum encoder attention headsz--encoder-normalize-before
store_truez)apply layernorm before each encoder block)actionr(   z--encoder-learned-posz0use learned positional embeddings in the encoderz--decoder-embed-pathz%path to pre-trained decoder embeddingz--decoder-embed-dimzdecoder embedding dimensionz--decoder-ffn-embed-dimz#decoder embedding dimension for FFNz--decoder-layersznum decoder layersz--decoder-attention-headsznum decoder attention headsz--decoder-learned-posz0use learned positional embeddings in the decoderz--decoder-normalize-beforez)apply layernorm before each decoder blockz--decoder-output-dimzPdecoder output dimension (extra linear layer if different from decoder embed dimz"--share-decoder-input-output-embedz)share decoder input and output embeddingsz--share-all-embeddingszWshare encoder, decoder and output embeddings (requires shared dictionary and embed dim)z --no-token-positional-embeddingsFz?if set, disables positional embeddings (outside self attention))defaultr/   r(   z--adaptive-softmax-cutoffEXPRzacomma separated list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion)r+   r(   z--adaptive-softmax-dropoutz6sets adaptive softmax dropout for the tail projectionsz--layernorm-embeddingzadd layernorm to embeddingz--no-scale-embeddingzif True, dont scale embeddingsz--checkpoint-activationszicheckpoint activations at each layer, which saves GPU memory usage at the cost of some additional computez--offload-activationszUcheckpoint activations at each layer, then save to gpu.Sets --checkpoint-activations.z--no-cross-attentionzdo not perform cross-attentionz--cross-self-attentionzperform cross+self-attentionz--encoder-layerdropr   z!LayerDrop probability for encoder)r*   r+   r0   r(   z--decoder-layerdropz!LayerDrop probability for decoderz--encoder-layers-to-keepNz=which layers to *keep* when pruning as a comma-separated list)r0   r(   z--decoder-layers-to-keepz--quant-noise-pqz0iterative PQ quantization noise at training timez--quant-noise-pq-block-size   z1block size of quantization noise at training timez--quant-noise-scalarzBscalar quantization noise and scalar quantization at training timez--min-params-to-wrapad  minimum number of params for a layer to be wrapped with FSDP() when training with --ddp-backend=fully_sharded. Smaller values will improve memory efficiency, but may make torch.distributed communication less efficient due to smaller input sizes. This option is set to 0 (i.e., always wrap) when --checkpoint-activations or --offload-activations are passed.)add_argumentr   get_available_activation_fnsfloatstrintDEFAULT_MIN_PARAMS_TO_WRAP)parserr%   r%   r&   add_args5   s  
zCanmtModel.add_argsc                 C   sr  t | |jrt|jd|_|jrt|jd|_t|dddu r't|_	t|dddu r2t
|_|j|j}}|jrk||krDtd|j|jkrNtd|jr[|j|jkr[td| |||j|j}|}d|_n| |||j|j}| |||j|j}t|d	d
rd|_| |||}| |||}| |||}	|jst|dt}
t||
d}t||
d}| ||||	S )zBuild a new model instance.,max_source_positionsNmax_target_positionsz3--share-all-embeddings requires a joined dictionaryzP--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dimz?--share-all-embeddings not compatible with --decoder-embed-pathToffload_activationsFmin_params_to_wrapmin_num_params)base_architectureencoder_layers_to_keeplensplitencoder_layersdecoder_layers_to_keepdecoder_layersgetattrDEFAULT_MAX_SOURCE_POSITIONSr<   DEFAULT_MAX_TARGET_POSITIONSr=   	vocab_src	vocab_tgtshare_all_embeddings
ValueErrorencoder_embed_dimdecoder_embed_dimdecoder_embed_pathencoder_embed_pathbuild_embedding share_decoder_input_output_embedcheckpoint_activationsbuild_encoderbuild_decoderr8   r   )clsr   tasksrc_dicttgt_dictencoder_embed_tokensdecoder_embed_tokensr   r    r!   r?   r%   r%   r&   build_model   sh   zCanmtModel.build_modelNc           	      C   s<   t |}| }t|||}|rt|}t||| |S N)rD   pad	Embeddingr   parse_embeddingload_embedding)	rY   r   
dictionary	embed_dimpathnum_embeddingspadding_idxemb
embed_dictr%   r%   r&   rT   9  s   
zCanmtModel.build_embeddingc                 C   s   t |||S r`   )TransformerEncoder)rY   r   r[   embed_tokensr%   r%   r&   rW   E  s   zCanmtModel.build_encoderc                 C   s   t |||t|dddS )Nno_cross_attentionF)no_encoder_attn)TransformerDecoderrI   )rY   r   r\   rm   r%   r%   r&   rX   I  s   
zCanmtModel.build_decoderTFreturn_all_hiddensfeatures_onlyalignment_layeralignment_headsc	              
   C   s   | j |||d}	| j||	|||||d}
| j|d|d||||d}|d d }|d d }|g|gd	}| j|||||d|d}|
||fS )
z
        Run the forward pass for an encoder-decoder model.

        Copied from the base class, but without ``**kwargs``,
        which are not supported by TorchScript.
        )src_lengthsrq   )encoder_outrr   rs   rt   ru   rq   NT)rv   rr   full_context_alignmentrs   rt   ru   rq      
last_layerself_attn_padding_mask)rv   encoder_padding_mask)r   r    r!   )r"   
src_tokensru   prev_output_tokensprev_src_tokensrq   rr   rs   rt   rv   decoder_outdecoder_out_redecoder_out_tensordecoder_paddingdecoder_kvssrc_outr%   r%   r&   forwardR  sN   


	zCanmtModel.forward
net_output	log_probssamplec                 C   s   |  |||S )z@Get normalized probabilities (or log probs) from a net's output.)get_normalized_probs_scriptable)r"   r   r   r   r%   r%   r&   get_normalized_probs  s   zCanmtModel.get_normalized_probs      ?encoder_outsincremental_statestemperaturec                 C   s  d }|}| j j|||d}d }t|}|dkrN|d d urNt|d tr)|d }n|d d }	t|	tr7|	}n|	d ur?|	d }|d urN|d d dd d f }|d d d dd d d f ||dkred n|d f}
| j|
dd d}|d d dd d f }|d d }|||fS )	N)rv   incremental_staterx   attnr   Tr   r   ry   )r    r   rD   
isinstancer   div_r   r"   tokensr   r   r   rv   r   r   decoder_lenattn_holderdecoder_out_tupleprobsr   r%   r%   r&   forward_decoder  s8   

"
zCanmtModel.forward_decoderc                 C   s  d }|}| j j||d}d }t|}|dkrM|d d urMt|d tr(|d }n|d d }	t|	tr6|	}n|	d ur>|	d }|d urM|d d dd d f }|d d d dd d d f ||dkrdd n|d f}
| j|
dd d}|d d dd d f }|d d }||||fS )	N)rv   rx   r   r   r   Tr   ry   )r!   r   rD   r   r   r   r   r   r%   r%   r&   forward_decoder_src  s4   

"zCanmtModel.forward_decoder_src	net_inputc                 C   s   dd |  D }| j|S )Nc                 S   s.   i | ]\}}|d kr|dkr|dkr||qS )r}   r~   sourcesr%   ).0kvr%   r%   r&   
<dictcomp>  s
    z.CanmtModel.forward_encoder.<locals>.<dictcomp>)itemsr   forward_torchscript)r"   r   encoder_inputr%   r%   r&   forward_encoder  s   zCanmtModel.forward_encoderc                 C   s   |dusJ | j ||S )  
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        N)r   reorder_encoder_out)r"   r   	new_orderr%   r%   r&   r     s   zCanmtModel.reorder_encoder_outc                 C   s   | j || d S r`   )r    #reorder_incremental_state_scripting)r"   r   r   r%   r%   r&   reorder_incremental_state  s   z$CanmtModel.reorder_incremental_stater`   )TFNN)r   ) __name__
__module____qualname____doc__r   staticmethodr:   classmethodr_   rT   rW   rX   boolr   r7   r   torchjitexportr   r   r   r6   r   r   r5   r   r   r   r   r   __classcell__r%   r%   r#   r&   r      sv    
 J
9

	
<
-
%r   c                       s   e Zd ZdZ fddZdd Z	ddeej fdd	Z			
	ddeej de
deej fddZ		
	ddeej de
deej fddZejjdeeee f fddZdd Zdd Z  ZS )rl   aI  
    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
    is a :class:`TransformerEncoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): encoding dictionary
        embed_tokens (torch.nn.Embedding): input embedding
    c                    sp   _ t | dtdg t jjj	d_
 j_|j}|j_ j_|_ jr4dnt|_ jsHt j|j jdnd _t dd}t ddr_t||d	_nd _ jsz jd
krzttj||dd j j _!nd _!jdkrt"jd_#nt$g _#j#% fddt& j'D  t(j#_) j*rt||d	_+d S d _+d S )Nversion   module_namer   learnedr   Flayernorm_embeddingr   r   bias        pc                    s   g | ]}  qS r%   )build_encoder_layer)r   ir   r"   r%   r&   
<listcomp>=  s    
z/TransformerEncoder.__init__.<locals>.<listcomp>),r   r   r   register_bufferr   r   r   dropoutr$   r   dropout_moduleencoder_layerdropembedding_dimri   r<   rm   no_scale_embeddingmathsqrtembed_scaleno_token_positional_embeddingsr   encoder_learned_posembed_positionsrI   r   r   adaptive_inputquant_noise_pqapply_quant_noise_nnLinearquant_noise_pq_block_sizer   r   layers
ModuleListextendrangerF   rD   
num_layersencoder_normalize_before
layer_norm)r"   r   re   rm   rf   r   r#   r   r&   r     sX   
	

zTransformerEncoder.__init__c                 C   sT   t |}t|dd}|rt|dd}t||d}|s t|dtnd}t||d}|S NrV   Fr>   )offload_to_cpur?   r   r@   )r   rI   r   r8   r   )r"   r   layer
checkpointr   r?   r%   r%   r&   r   G  s   z&TransformerEncoder.build_encoder_layerNtoken_embeddingc                 C   sr   |d u r	|  |}| j|  }}| jd ur|| | }| jd ur&| |}| |}| jd ur5| |}||fS r`   )rm   r   r   r   r   r   )r"   r|   r   xembedr%   r%   r&   forward_embeddingS  s   






z$TransformerEncoder.forward_embeddingFru   rq   token_embeddingsc                 C   s   |  ||||S )  
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`
            return_all_hiddens (bool, optional): also return all of the
                intermediate hidden states (default: False).
            token_embeddings (torch.Tensor, optional): precomputed embeddings
                default `None` will recompute embeddings

        Returns:
            dict:
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
                  of shape `(batch, src_len, embed_dim)`
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(src_len, batch, embed_dim)`.
                  Only populated if *return_all_hiddens* is True.
        )forward_scriptable)r"   r|   ru   rq   r   r%   r%   r&   r   c  s   zTransformerEncoder.forwardc                 C   s   | | j}|jjdkp| }| ||\}}|r&|d|d|  }|dd}g }	|r5|		| | j
D ]}
|
||r@|ndd}|rQ|	dusLJ |		| q8| jdur\| |}|g|g|g|	g g dS )r   xlarx   r   r   N)r{   rv   r{   encoder_embeddingencoder_statesr|   ru   )eqri   devicer*   anyr   	unsqueezetype_as	transposeappendr   r   )r"   r|   ru   rq   r   r{   has_padsr   r   r   r   r%   r%   r&   r     s>   




z%TransformerEncoder.forward_scriptablerv   c                 C   s$  t |d dkrg }n|d d d|g}t |d dkr!g }n|d d d|g}t |d dkr7g }n|d d d|g}t |d dkrMg }n|d d d|g}t |d dkrcg }n|d d d|g}|d }t |dkrt|D ]\}	}
|
d|||	< q|||||||d	S )
r   rv   r   rx   r{   r   r|   ru   r   r   )rD   index_select	enumerate)r"   rv   r   new_encoder_outnew_encoder_padding_masknew_encoder_embeddingr|   ru   r   idxstater%   r%   r&   r     sB   z&TransformerEncoder.reorder_encoder_outc                 C       | j du r| jS t| j| j jS )z.Maximum input length supported by the encoder.N)r   r<   minmax_positionsr"   r%   r%   r&   r    
   
z TransformerEncoder.max_positionsc                 C   s   t | jtr#d|}||v rtd| ||= td|d|< t| jD ]}| j	| 
|d|| q(d|}t||tdgd dk r\d	| _d
| _tdg||< |S )@Upgrade a (possibly old) state dict for new versions of fairseq.{}.embed_positions.weightszdeleting {0}rx    {}.embed_positions._float_tensorz{}.layers.{}
{}.versionr      NF)r   r   r   formatprintr   FloatTensorr   r   r   upgrade_state_dict_namedr   itemgetr   r   	normalize)r"   
state_dictnameweights_keyr   version_keyr%   r%   r&   r    s&   


"z+TransformerEncoder.upgrade_state_dict_namedr`   )NFN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r6   r   r   r  r  r   r%   r%   r#   r&   rl     s@    
5

$
G<rl   c                       sz  e Zd ZdZ		d  fdd	Zdd Zd!dd	Z								d"d
eee	e
e f  deee	ee	ee f f  dededee dee dee defddZ				d#d
eee	e
e f  deee	ee	ee f f  dedee dee f
ddZ	 				d#d
eee	e
e f  deee	ee	ee f f  dedee dee f
ddZdd Zdd Zdd Zdd Z  ZS )$rp   a  
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    FNc           	         s   _ t | dtdg td_t j	j
jd_ j_ j_|j} j}|_ j_|j_ j_|_ jrHdnt|_ jsf jdkrfttj ||dd j j!_"nd _"||krtt ||ddnd _# j$st%j|j j&dnd _'t( d	d}t( d
drt)||d_*nd _*t( dd_+jdkrt,jd_-nt.g _-j-/ fddt0 j1D  t2j-_3 j4rt( ddst)||d_5nd _5|jkr j6st |jddnd _7d _8|_9j9d u r: || d S d S )Nr   r   r   r   r   Fr   r   r   r   r   cross_self_attentionr   r   c                    s   g | ]}  qS r%   )build_decoder_layer)r   _r   ro   r"   r%   r&   r   r  s    
z/TransformerDecoder.__init__.<locals>.<listcomp>no_decoder_final_norm);r   r   r   r   r   r   empty_future_maskr   r   r$   r   r   decoder_layerdroprU   share_input_output_embedr   rQ   rf   decoder_output_dimoutput_embed_dimri   r=   rm   r   r   r   r   r   r   r   r   r   r   r   project_in_dimr   r   decoder_learned_posr   rI   r   r   r  r   r   r   r   r   rH   rD   r   decoder_normalize_beforer   tie_adaptive_weightsproject_out_dimadaptive_softmaxoutput_projectionbuild_output_projection)	r"   r   re   rm   ro   r%  input_embed_dimrf   r   r#   r  r&   r   3  s   
	

zTransformerDecoder.__init__c              	   C   s   |j d ur#tt|| jtj|j td|j|jr|nd |j	|j
d| _n7| jr@tj| jjjd | jjjd dd| _| jj| j_ntj| jt|dd| _tjj| jjd| jd d t|d	d}t|D ]}| j|d |j |d  t| qdd S )
N)r*   )r   adaptive_inputsfactortie_projrx   r   Fr         ࿩meanstdbase_layers)adaptive_softmax_cutoffr   rD   r  r   eval_str_listr7   adaptive_softmax_dropoutr"  adaptive_softmax_factortie_adaptive_projr$  r  r   r   rm   weightshaper%  initnormal_rI   r   r   insertrH   r   )r"   r   re   rm   num_base_layersr   r%   r%   r&   r&    sD   


z*TransformerDecoder.build_output_projectionc                 C   sV   t ||}t|dd}|rt|dd}t||d}|s!t|dtnd}t||d}|S r   )r   rI   r   r8   r   )r"   r   ro   r   r   r   r?   r%   r%   r&   r    s   
z&TransformerDecoder.build_decoder_layerrv   r   rr   rw   rs   rt   ru   rq   c
                 C   s0   | j ||||||d\}
}|s| |
}
|
|fS )a\  
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention, should be of size T x B x C
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        )rv   r   rw   rs   rt   )extract_featuresoutput_layer)r"   r}   rv   r   rr   rw   rs   rt   ru   rq   r   extrar%   r%   r&   r     s    
	
zTransformerDecoder.forwardc                 C   s   |  ||||||S r`   )extract_features_scriptable)r"   r}   rv   r   rw   rs   rt   r%   r%   r&   r;    s   
z#TransformerDecoder.extract_featuresc                 C   s  |  \}}|du r| jd }d}	d}
|dur8t|d dkr8|d d }	|	  d |ks8J d| d|	j |durJt|d dkrJ|d d }
d}| jdurX| j||d}|durt|ddd	df }|durt|ddd	df }| j| | }| jdur| |}| jdur| |}|dur||7 }| j	dur| 	|}| 
|}|dd}d}| js|| j r|| j}d}|g}t| jD ]?\}}|du r|s| |}nd}|||	|
|||t||kt||kd
\}}}|| |dur	||kr	| |}q|dur |dur|d| }|jdd}| jdur+| |}|}|dd}| jdur>| |}||g|||dfS )a  
        Similar to *forward* but only return features.

        Includes several features from "Jointly Learning to Align and
        Translate with Transformer Models" (Garg et al., EMNLP 2019).

        Args:
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).
            alignment_layer (int, optional): return mean alignment over
                heads at this layer (default: last layer).
            alignment_heads (int, optional): only average alignment over
                this many heads (default: all heads).

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        Nrx   rv   r   zExpected enc.shape == (t, z	, c) got r{   )r   r   )self_attn_maskrz   	need_attnneed_head_weights)dim)r   inner_statesry   rz   )sizer   rD   r6  r   r   rm   r   r  r   r   r   r  r   ri   r   r   r   buffered_future_maskr   r   r5   tor-  r   r#  )r"   r}   rv   r   rw   rs   rt   bsslenencpadding_mask	positionsr   rz   r   rC  r   r   r?  
layer_attnself_attn_hiddenry   r%   r%   r&   r>    s   


















z.TransformerDecoder.extract_features_scriptablec                 C   s   | j du r
| |S |S )z(Project features to the vocabulary size.N)r$  r%  )r"   featuresr%   r%   r&   r<  t  s   

zTransformerDecoder.output_layerc                 C   r   )z/Maximum output length supported by the decoder.N)r   r=   r   r  r  r%   r%   r&   r  |  r  z TransformerDecoder.max_positionsc                 C   sz   | d}| j ddks| jj|jkr| j d|k r+ttt||gd| _| j|| _| jd |d |f S )Nr   rx   )	rD  r  r   r   triur   fill_with_neg_infzerosrF  )r"   tensorrB  r%   r%   r&   rE    s   
z'TransformerDecoder.buffered_future_maskc                 C   sB  t | jtrd|}||v r||= td|d|< | d|vrD| jr,| d}n| d}||v rD|| || d< | jsD||= t| jD ]1}ddd	d
}|	 D ]$\}}dD ]}	d||||	}
|
|v rx||
 |d||||	< ||
= q[qUqId|}t
||tdgd dkrd| _d| _tdg||< |S )r  r  rx   r  z.output_projection.weightz.embed_tokens.weightz
.embed_outself_attn_layer_normencoder_attn_layer_normfinal_layer_norm)012)r5  r   z{}.layers.{}.layer_norms.{}.{}z{}.layers.{}.{}.{}r  r   r  NF)r   r   r   r	  r   r  r  r   r   r   r   r  r  r   r   r  )r"   r  r  r  embed_out_keyr   layer_norm_mapoldnewmr   r  r%   r%   r&   r    sV   


	"z+TransformerDecoder.upgrade_state_dict_named)FN)F)NNFFNNNF)NFNN)r   r   r   r   r   r&  r  r   r   r6   r   r   r   r7   r   r   r;  r>  r<  r  rE  r  r   r%   r%   r#   r&   rp   &  s    U
!

	

2





y
rp   c                 C   s@   t j| ||d}t jj|jd|d d t j|j| d |S )N)ri   r   r+  r,  )r   rb   r7  r8  r5  	constant_)rh   r   ri   r]  r%   r%   r&   rb     s   rb   Tc                 C   s4   t | ||}t j|j |rt j|jd |S )Nr   )r   r   r7  xavier_uniform_r5  r^  r   )in_featuresout_featuresr   r]  r%   r%   r&   r     s
   r   c                 C   sH  t | dd | _t | dd| _t | dd| _t | dd| _t | dd	| _t | d
d| _t | dd| _t | dd | _t | d| j| _	t | d| j| _
t | dd| _t | dd	| _t | dd| _t | dd| _t | dd| _t | dd| _t | dd| _t | dd| _t | dd | _t | dd| _t | dd| _t | dd| _t | d d| _t | d!d| _t | d"d| _t | d#d| _t | d$| j	| _t | d%| j	| _t | d&d| _t | d'd| _t | d(d| _t | d)d| _ t | d*d| _!| j!rd+| _ t | d,d | _"t | d-d | _#t | d.d| _$t | d/d| _%t | d0d| _&t | d1d	| _'t | d2d| _(d S )3NrS   rP   i   encoder_ffn_embed_dimi   rF      encoder_attention_headsr2   r   Fr   rR   rQ   decoder_ffn_embed_dimrH   decoder_attention_headsr!  r   attention_dropoutr   activation_dropoutactivation_fnrelur   g?r0  r2  r   rU   rN   r   r   rn   r  r  decoder_input_dimr   r   r"  rV   r>   TrC   rG   r   r  r   r   quant_noise_scalar))rI   rS   rP   rb  rF   rd  r   r   rR   rQ   re  rH   rf  r!  r   rg  rh  ri  r   r0  r2  rU   rN   r   r   rn   r  r  rk  r   r   r"  rV   r>   rC   rG   r   r  r   r   rl  r   r%   r%   r&   rB     s   rB   c                 C   s   t | dd| _t | dd| _t | dd| _t | dd| _t | d	d
| _t | dd
| _t | dd| _t | dd| _t | dd| _	t | dd| _
t | dd| _t | dd| _t | dd| _t|  d S )NrP   i   rb  i   rd     rF      r   Tr!  rH   r   rQ   re  rf  rg  g{Gz?rh  r   )rI   rP   rb  rd  rF   r   r!  rH   rQ   re  rf  rg  rh  r   rB   rm  r%   r%   r&   transformer_deep  s$   rp  )T)0r   typingr   r   r   r   r   numpyr   torch.nnr   fairseqr   fairseq.distributedr   fairseq.modelsr	   r
   r   r   r   fairseq.modulesr   r   r   r   r   r   r   r   r   &fairseq.modules.checkpoint_activationsr   fairseq.modules.quant_noiser   r   r   rJ   rK   r7   r8   r   rl   rp   rb   r   rB   rp  r%   r%   r%   r&   <module>   s<   ,   l  !   
;