o
    ߥi                     @   s  d dl Z d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZ dgZejejejd	G d
d de
ZG dd deddZdd Zdd Zdd Zdd ZdMddZdNddZdOddZdd ZdMd d!ZdMd"d#ZdMd$d%Zi d&fd'd(Z i d)fd*d+Z!dddi d,fd-d.Z"dPd1d2Z#dQd4d5Z$d6d7 Z%	8		dRd9d:Z&d;d< Z'	=dSd>d=Z(			 		dTd?d@Z)dAdB Z*dCdD Z+dEdF Z,dZ-dGdH Z.G dIdJ dJej/j0j1j2Z3dKdL Z4dS )U    N)
namedtuple)Dict)Models)ModelTensor)MODELS)TasksCsanmtForTranslation)module_namec                       s   e Zd Z fddZ			d#deeef deeef deeef deeef deeef f
d	d
Zdeeef deeef fddZ	dd Z
d$ddZdd Zd%ddZi dfddZ		d&ddZdd Zdd Zi dfdd Zd!d" Z  ZS )'r	   c                    s.   t  j|g|R i | || _t| j dS )zK
        Args:
            params (dict): the model configuration.
        N)super__init__paramsprint)self	model_dirargskwargs	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/csanmt/translation.pyr      s   zCsanmtForTranslation.__init__Ninputlabelprefix
prefix_hitreturnc           	      C   st   |du r-t jjd | |||d| j\}}W d   n1 s#w   Y  ||dS | ||\}}||dS )a  return the result by the model

        Args:
            input: the preprocessed input source sequence
            label: the ground truth target data for model training
            prefix: the preprocessed input target prefix sequence for interactive translation
            prefix_hit: the preprocessed target prefix subword vector for interactive translation

        Returns:
            output_seqs: output sequence of target ids
        NNmtModel)
input_widsprefix_widsr   )output_seqsoutput_scores)train_oploss)tfcompatv1variable_scopebeam_searchr   transformer_model_train_fn)	r   r   r   r   r   r   r    r!   r"   r   r   r   __call__   s"   
zCsanmtForTranslation.__call__c                 C   s   dS )z
        Run the forward pass for a model.

        Args:
            input (Dict[str, Tensor]): the dict of the model inputs for the forward method

        Returns:
            Dict[str, Tensor]: output from the model forward pass
        Nr   )r   r   r   r   r   forwardA   s   
zCsanmtForTranslation.forwardc                 C   s  |d }|d }t jjjd|d t jd}|d rBt jjjdt jjjd t jjjd	||g|d
}W d    n1 s<w   Y  n#t jjd t jjjd	||g|d
}W d    n1 s`w   Y  t jjd|g}t j|t j	dd d d df }t 
||gd}	t jt |	dt jd}
|
d d d df }t j|ddgddggdd}t |t |	t j}||d  }|d dkrt|}t |t |d}t j||}t|d}|d dkrt jj||d d}t||||}||fS )Nsrc_vocab_sizehidden_size              ࿩dtypeshared_source_target_embeddingShared_EmbeddingreuseWeightsinitializerSource_Embeddingencoder_input_bias   r   tensorpaddingsconstant_values      ?position_info_typeabsolute   maskingresidual_dropoutrate)r#   r$   r%   random_normal_initializerfloat32r&   
AUTO_REUSEget_variable
zeros_likeint64concatcast	not_equalpadgatherint32add_timing_signalmultiplyexpand_dimsnnbias_addattention_biasdropouttransformer_encoder)r   featuresr   r+   r,   r7   src_embeddingsrc_biaseos_paddingsrc_seqsrc_maskshift_src_maskencoder_inputencoder_self_attention_biasencoder_outputr   r   r   encoding_graphM   sf   

 

z#CsanmtForTranslation.encoding_graphc                 C   s  |d }t jjjd|d t jd}d }|d r|d }d}n|dkr)|d }d	}n|d
kr4|d }d}ntdt jjj|t jjjd t jjjd||g|d}W d    n1 s[w   Y  t j	|t j
dd d d df }	t ||	gd}
t jt |
dt jd}|d d d df }t j|ddgddggdd}t |t |
t j}||d  }t |t |d}t|d}|d dkrt jj||d d}t||||}|S )Nr,   r-   r.   r/   r1   r+   Shared_Semantic_EmbeddingsourceSource_Semantic_Embeddingtargettrg_vocab_sizeTarget_Semantic_Embeddingzerror: no right name specified.r3   r5   r6   r:   r   r;   r<   r@   rC   rD   rE   rF   )r#   r$   r%   rH   rI   
ValueErrorr&   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rU   rV   rY   rW   rZ   transformer_semantic_encoder)r   r\   r   namer,   r7   scope
vocab_sizeembedding_matr_   	input_seq
input_maskshift_input_maskrc   rd   re   r   r   r   semantic_encoding_graph   s\   
 
z,CsanmtForTranslation.semantic_encoding_graphc                 C   s@   d}d}|d rd }d }| j |||d}| j |||d}||fS )Nrh   rj   r1   ro   )rv   )r   r\   labelsr   source_nametarget_namefeature_outputlabel_outputr   r   r    build_contrastive_training_graph   s   z5CsanmtForTranslation.build_contrastive_training_graph-q=c                    sl   |d  |d  d dksJ  fdd}g }|||}| ||| t| ks/J tj|ddS )Nnum_of_samplesetarC   r   c              	      s   ||  }t jt |t jt |ddd  t jt |dddt jt |ddd d  }g }t d D ]+}t jt j	|dd| d t jt j	|ddd  }| ||  }|
| q;|S )NrC   T)input_tensoraxiskeepdimsr   r-         ?)r#   mathdivideabs
reduce_min
reduce_maxrangerandomnormalshapeappend)x_vectory_vectorbias_vectorw_rRiomegasampleKepsilonr   r   r   get_samples   s2   
z7CsanmtForTranslation.MGMC_sampling.<locals>.get_samplesr   )extendlenr#   rN   )r   x_embeddingy_embeddingr   r   r   ALL_SAMPLESr   r   r   MGMC_sampling   s   
z"CsanmtForTranslation.MGMC_samplingc              
   C   s  |d }|d }t jjjd|d t jd}|d rBt jjjdt jjjd t jjjd	||g|d
}	W d    n1 s<w   Y  n#t jjd t jjjd	||g|d
}	W d    n1 s`w   Y  t j|t j	dd d d df }
t 
||
gd}t jt |dt jd}|d d d df }t j|ddgddggdd}t |	t |t j}||d 9 }tt j|dd d}t j|ddgddgddggdd d d dd d f }|d dkrt|}t jj|dd|d   d}t||||d d ||d\}}| ||}|d }d|d  t j|d t jd }t jt |t j|||d}t ||j}t jj|t |d| }t j|dt j|d }|S )Nrk   r,   r-   r.   r/   r1   r2   r3   r5   r6   Target_Embeddingr:   r   r;   r<   r@   r   causalr=   r>   rA   rB   r   rE   rF   
states_key
states_valembedding_augmentationr   
confidence)depthon_value	off_value)logitsrx   r   )r#   r$   r%   rH   rI   r&   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rY   r   rT   rW   rZ   transformer_decoder
predictionone_hotr0   !softmax_cross_entropy_with_logitsstop_gradient
reduce_sum)r   re   rd   rx   r   r   rk   r,   r7   trg_embeddingr_   trg_seqtrg_maskshift_trg_maskdecoder_inputdecoder_self_attention_biasdecoder_outputattention_weightsr   r   r   soft_targetsmaskxentropyr"   r   r   r   decoding_graph   s   
 



z#CsanmtForTranslation.decoding_graphc           
      C   s   |  ||\}}d }|d ur:|d ur:| |||}t||d ddg}t||d dddg}t||d dg}| j|||||d}	|	S )Nr   r:   )r   )rf   r   r#   tiler   )
r   r\   rx   r   feature_embeddinglabel_embeddingre   rd   r   r"   r   r   r   build_training_graph)  s0   z)CsanmtForTranslation.build_training_graphc                    s  t | j}tjjjd|d | jd | jd }tjjj }t| t	| jd || j}tj
|tjd}| jd dkrGtjjj|}n)| jd d	krdtjjjj|| jd
 | jd | jd d}ntjjjd t  t|| jd }dd tjtj dd k  fdd fddd tjtjdd k fddfddddkrt }	t}
n g}	g}
dkrdd tD }ndg}g }g }t|D ]\}}t| tjjjtjj |dkrdnd dg tdd |f M | |	| |
| | j\}}| |	| |
| | j||}|| tjjjd! || d"d tjj! D }|j"||dd#}|| W d    n	1 sVw   Y  W d    n	1 sfw   Y  W d    n	1 svw   Y  qt#|t$| }t%|}|d$krt&t'| \}}t(||\}}t'||}|j)|tjjj d%}||fW  d    S 1 sw   Y  d S )&Nr   r6   num_gpusgradient_clip_normlearning_rate)valuer0   	optimizersgdadam
adam_beta1
adam_beta2adam_epsilon)r   beta1beta2r   zoptimizer not supportedupdate_cyclec                 S   s6   | }t |D ]}tj|| gdd}q|d |f }|S )Nr   r   )r   r#   rN   )inputsr   outputsr   r   r   r   	fill_gpuse  s
   zBCsanmtForTranslation.transformer_model_train_fn.<locals>.fill_gpusr   r   c                      s
    S Nr   r   )r\   r   r   r   r   <lambda>n     
 zACsanmtForTranslation.transformer_model_train_fn.<locals>.<lambda>c                          S r   r   r   )r\   r   r   r   o      )predtrue_fnfalse_fnc                      s
    S r   r   r   )r   rx   r   r   r   r   r  r   c                      r   r   r   r   )rx   r   r   r   s  r   c                 S   s   g | ]}d | qS )zgpu:%dr   ).0dr   r   r   
<listcomp>}  s    zCCsanmtForTranslation.transformer_model_train_fn.<locals>.<listcomp>zcpu:0Tr3   z%s_%dGPUzmle_loss_{}c                 S   s$   g | ]}d |j vrd|j vr|qS )Semantic_Embeddingmini_xlm_encoderrw   )r   vr   r   r   r     s    

)var_listcolocate_gradients_with_opsr-   )global_step)*get_initializerr   r#   r$   r%   r&   trainget_global_stepr   get_learning_rate_decayconvert_to_tensorrI   GradientDescentOptimizerAdamOptimizerlogginginfosysexitMultiStepOptimizercondr   shard_featuresr   	enumeratedeviceget_variable_scope
name_scoper}   r   r   summaryscalarformattrainable_variablescompute_gradientsadd_nr   average_gradientslistzipclip_by_global_normapply_gradients)r   r\   rx   r7   r   r   r   r   optfeature_shardslabel_shardsdevicesmulti_gradssharded_lossesr   r   r{   r|   mle_losstrainable_vars_listgrads_and_vars
total_lossgradsr   _r!   r   )r\   r   rx   r   r   r(   H  s   










 

&z/CsanmtForTranslation.transformer_model_train_fnc           	      C   s   |d }|d }|d r5|d rdnd}t jjj|dd t jjd	}W d    n1 s/w   Y  n
t jjd
t|g}t j|dd d }t |d|g}t j||dd}t |t 	||ggd}|S )Nr,   rk   $shared_embedding_and_softmax_weightsr1   r2   r   Tr3   r5   Softmaxr   r;   transpose_br   )
r#   r$   r%   r&   rK   tgt_vocab_sizer   reshapematmulrN   )	r   r   r   r,   rk   embedding_scopeweightsr   r   r   r   r   r     s(   
zCsanmtForTranslation.predictionFc	              
   C   sz  |d }	|d }
t jjjd|
d t jd}|d rBt jjjdt jjjd t jjjd	|	|
g|d
}W d    n1 s<w   Y  n#t jjd t jjjd	|	|
g|d
}W d    n1 s`w   Y  t |t 	|t j
}||
d 9 }tt j|dd d}t j|ddgddgddggdd d d dd d f }|d dkrt|}|s|d d dd d d f }|d d d d dd d d f }t||||||||d\}}|s|d d dd d f }|d d dd d f }n|}|}|d r|d rdnd}t jjj|dd t jjd	}W d    n	1 sw   Y  n
t jjd|	|
g}t j||dd}t j|}||||fS )Nrk   r,   r-   r.   r/   r1   r2   r3   r5   r6   r   r@   r   r:   r   r   r   r;   rA   rB   r   r  Tr  r  )r#   r$   r%   rH   rI   r&   rJ   rK   rR   rO   rS   rY   r   rQ   rT   r   r  rW   log_softmax)r   re   r{   rd   r   r   r   r   	is_prefixrk   r,   r7   r   r   r   r   r   decoder_output_lastattention_weights_lastr  r  r   log_probr   r   r   inference_func  s   	

	

z#CsanmtForTranslation.inference_funcc           !         s$  
d 
d 
d 
d 	
d 
d |d }d|v r)|d }|d	 }nd }d }t |d
 t|}t|}|d urPt t|t j}t|}|
\d}
d r`d }j|
|dfddt	D fddt	D t	D ]}| 	t 
d d g | 	t 
d d g qfddt	D fddt	D d}|d urtt j|t dgd
gdd}	t |	d }t|	}
fddt	D }fddt	D }j|
||
dd\}}fddt	D fddt	D t|}t ||d d dd d f t |d d dd d f t jj }t |
d d d df t t t |dt jdgd}	t|	}	t j|	t dgd
gdd}	n	t dgd
}	t dgt jjgd   g}t |dg}t |}|	}t gt jj}t t gd
t j}t|	||ff|||fd}	
f
dd  fd!d"} fd#d$}t jd
d%d}tt 
g d&t 
d d gt 
d d gffd'dt	D fd(dt	D ft 
d d gt 
g d&t 
d d gfd}t j||||gt 
g |gdd)d*}|d }|jd
 }|jd }|jd
 }|jd }|jd } |	d d g |	d d g t jjt j|dd+||}t jjt j|dd+| |} |d d d d |d df }|| fS ),N	beam_sizerk   r,   num_decoder_layerslp_ratemax_decoded_trg_lenr   r   r   r   rh   r1   rw   c                       g | ]}t  d gdqS r   r-   r#   fillr   layer
batch_sizer,   r   r   r          z4CsanmtForTranslation.beam_search.<locals>.<listcomp>c                    r$  r%  r&  r(  r*  r   r   r   $  r,  c                       g | ]	}t |  qS r   tile_to_beam_sizer(  )r   r   r   r   r   -      c                    r-  r   r.  r(  )r   r   r   r   r   1  r0  r:   rC   r   r;   c                       g | ]}t  | qS r   merge_first_two_dimsr(  r   r   r   r   ;      
c                    r1  r   r2  r(  r   r   r   r   ?  r5  Tr   r  c                       g | ]
}t |  qS r   split_first_two_dimsr(  r+  r   step_states_keyr   r   r   N      
c                    r8  r   r9  r(  r+  r   step_states_valr   r   r   S  r=  r-   r   statefinishc           "   
      sd  |j d d \}}|j\t|}fddtD }fddtD }j	
|||dd\}}t|}t|d| }	fddtD fddtD td	tj	| d
 tj
d d }
|	|
 }t|d g}tjj|d d\}}| }| }t||}tj|d d d d d df t|dgdd}td d
gtdtj}tj||gdd}t|d}|tj	|tj
dtj
j  }tj|\} t| }t|  t| }tj|d d d d d df t|dgdd}td
gtdtj}tj||gdd} fddtD } fddtD }||
 }|j\}}}|dtj	|tj
d tj
j  }tj||gd
d}tj||gd
d}tj|\}}t||}td
gtdtj}tj||gdd}tj||gd
d} t| |} t|||f||f|| |fd}!| d
 |!fS )NrC   c                    r1  r   r2  r(  r4  r   r   r   }  r5  zOCsanmtForTranslation.beam_search.<locals>._beam_search_step.<locals>.<listcomp>c                    r1  r   r2  r(  r6  r   r   r     r5  Fr7  c                    r8  r   r9  r(  r;  r   r   r     r=  c                    r8  r   r9  r(  r>  r   r   r     r=        @r:   r/         @r;   )kr   r   c                    r-  r   	gather_2dr(  )alive_indicesnext_states_keyr   r   r     r0  c                    r-  r   rF  r(  )rH  next_states_valr   r   r     r0  r   r@  )r   rA  r3  r   r  r:  r#   rV   powrO   rI   r  rW   top_krG  rN   r'  constantrS   equalminrB  BeamSearchState)"timerA  seqs	log_probs	flat_seqsflat_states_keyflat_states_valstep_log_probsstep_attn_weightscurr_log_probslength_penaltycurr_scores
top_scorestop_indicesbeam_indicessymbol_indicescandidate_seqspad_seqsflagsalive_scoresalive_symbols
alive_seqsalive_states_keyalive_states_valalive_log_probsprev_fin_flagsprev_fin_seqsprev_fin_scoresstep_fin_scores	fin_flags
fin_scoresfin_indicesfin_seqs	new_state)
r+  r   re   rd   r{   r"  r!  r   r   rk   )rH  rI  rJ  r   r   r<  r?  r   _beam_search_stepx  s   












z;CsanmtForTranslation.beam_search.<locals>._beam_search_stepc                    s   |j d }|jd }|jd }tdtjtjd d  }|d d df | }tj|tj|tjd dd}dtjtj|ddtjd }|tjj| 7 }tj	t
||d	}	tt| t|	}
|
S )
Nr:   r   rC   rC  r/   rD  r   r   r   r   )r   rB  r#   rK  rO   rI   r   
reduce_anyrO  
reduce_allgreaterlogical_andlesslogical_not)tsrS  finished_flagsfinished_scoresmax_lpbest_alive_scoreworst_finished_scoreadd_maskbound_is_metr   )r"  r#  r   r   _is_finished  s8   


z6CsanmtForTranslation.beam_search.<locals>._is_finishedc                    s    | |}|S r   r   )rz  r{  outs)rr  r   r   _loop_fn  s   
z2CsanmtForTranslation.beam_search.<locals>._loop_fnrQ  NNNc                       g | ]}t d d d  gqS r   r#   TensorShaper(  r,   r   r   r     r,  c                    r  r   r  r(  r  r   r   r     r,  F)r   body	loop_varsshape_invariantsparallel_iterations	back_proprs  ) r#   r   r/  r3  rO   rS   rf   rv   r   	set_shaper  rN   r'  r  where	ones_likerI   rO  rV   argmaxr:  rM  r   rL   boolrP  
while_loopr   rB  r$   r%   rt  )!r   r\   r   	src_inputr   r   ry   r)  fixed_length	init_seqsrT  rU  rV  rW  rX  rS  init_log_probsinit_scoresrp  rn  rm  rA  r  r  rQ  r  r   final_statere  rc  final_flags
final_seqsfinal_scoresr   )rr  r+  r   re   rd   r{   r,   r"  r#  r!  r   r   r   r   r<  r?  rk   r   r'     s6  






"
u






z CsanmtForTranslation.beam_searchr  r   )r~   NN)__name__
__module____qualname__r   r   strr   r  r)   r*   rf   rv   r}   r   r   r   r(   r   r  r'   __classcell__r   r   r   r   r	      sB    





"#
20
%
O
e
Cc                   @   s   e Zd ZdS )rP  N)r  r  r  r   r   r   r   rP  6  s    rP  r@  c                 C   s0   t j| dd} dg| jj }||d< t | |S )z#Tiles a given tensor by beam_size. r:   r   )r#   rV   r   ndimsr   )r=   r   	tile_dimsr   r   r   r/  ;  s   r/  c                 C   sp   t | } | jjd u rt | S | j }t | }g }tt|D ]}|| }|d u r0|| }|| q"|S r   )r#   r   r   dimsas_listr   r   r   )xstatic_shapedynamic_shaperetr   dimr   r   r   infer_shapeD  s   



r  c                 C   s,   t | }|g|g |dd   }t| |S Nr:   )r  r#   r  )r=   dim_0dim_1r   	new_shaper   r   r   r:  W  s   r:  c                 C   s2   t | }|d  |d 9  < |d t| |S )Nr   r:   )r  popr#   r  )r=   r   r   r   r   r3  ]  s   
r3  c                 C   sd   t | d }t |d }t || | }t |||g}t j||gdd}t j| ||d}|S )z Gather the 2nd dimension given indices
    :param params: A tensor with shape [batch_size, M, ...]
    :param indices: A tensor with shape [batch_size, N]
    :param name: An optional string
    :return: A tensor with shape [batch_size, N, ...]
    r   r:   r;   r   rw   )r#   r   r   r  stack	gather_nd)r   indicesro   r+  
range_size	batch_posoutputr   r   r   rG  d  s   rG  Tc                 C   s  t jjj|d| g|d t| ttfs| g} dd | D }t| t|kr*tdt j	t 
| d d d |ggdd}d	d | D } g }|rht|}t 	| d
} ||g}	t jjd|	}
|t | |
 n&tt|D ]}|| |g}	d| }t jj||	}
|t | | |
 qnt |}|r|g}	t jjd|	}t j||}t ||}|W  d    S 1 sw   Y  d S )Nlinear)default_namevaluesr0   c                 S   s   g | ]}|  d  qS r;   )	get_shape)r   itemr   r   r   r   {  s    zlinear.<locals>.<listcomp>z inputs and input_size unmatched!r   r;   r   c                 S   s"   g | ]}t |d |jd  gqS r  )r#   r  r   )r   inpr   r   r   r     s   " r:   matrixz	matrix_%dbias)r#   r$   r%   r&   
isinstancer  tupler   RuntimeErrorrN   r   sumrK   r   r  r   r   rW   rX   r  )r   output_sizer  rN   r0   rp   
input_sizeoutput_shaperesultsr   r  r   ro   r  r   r   r   r  u  s@   

$r  ư>c           
      C   s   t jjj|d| g|dP |   d }t jjjd|gt  d}t jjjd|gt  d}t 	| dd}t 	t 
| | dd}| | t jj||  }	|	| | W  d    S 1 s_w   Y  d S )N
layer_norm)r  r  r4   r;   layer_norm_scaler6   layer_norm_offsetT)r#   r$   r%   r&   r  r  rK   ones_initializerzeros_initializerreduce_meansquarersqrt)
r   r   ro   r4   channel_sizescaleoffsetmeanvariancenorm_inputsr   r   r   r    s"   

$r  c                 C   s,   |r|dkr| S |dkrt | S td| )Nnoner  Unknown mode %s)r  rm   )r  moder   r   r   _layer_process  s
   r  c                 C   s(   |r|dk rt jj|d| d}| | S )Nr   r:   rF   )r#   rW   rZ   )r  y	keep_probr   r   r   _residual_fn  s   r  c              	   C   s   |d }d|d  }t jjj|d| |gd^ t jjd t||dd}t j|}W d    n1 s6w   Y  |rK|dk rKt jj|d| d	}t jjd
 t||dd}W d    n1 sdw   Y  | | W  d    S 1 sww   Y  d S )Nr,   r   relu_dropoutembedding_augmentation_layerr  r  input_layerTr:   rF   output_layerr#   r$   r%   r&   r  rW   relurZ   )r  r   r   ro   r,   r  hiddenr  r   r   r   r    s$   $r  c              	   C   s   |d }|d }d|d  }t jjj|d| gd\ t jjd t| |dd}t j|}W d    n1 s9w   Y  |rN|dk rNt jj|d	| d
}t jjd t||dd}W d    n1 sgw   Y  |W  d    S 1 sxw   Y  d S )Nfilter_sizer,   r   r  	ffn_layerr  r  Tr:   rF   r  r  )r  r   ro   r  r,   r  r  r  r   r   r   transformer_ffn_layer  s"   $r  encoderc                 C   s\  |d }|d }|d }|d }|d }	|d }
|d }| }t |d}t jjj|t jjjd	t t|D ]b}t jjd
| O |d dkrK|d nd }tt||
d ||||||	|dd
\}}t	||d| }t||}t
t||
|}t	||d| }t||}t ||}W d    n1 sw   Y  q5t||
W  d    S 1 sw   Y  d S )Nnum_encoder_layersr,   	num_headsrE   attention_dropoutlayer_preproclayer_postprocrC   r3   layer_%drA   relativemax_relative_disencoder_self_attentionr  ro   r   )r#   rV   r$   r%   r&   rJ   r   multihead_attentionr  r  r  rU   )rc   rd   r   r   ro   r  r,   r  rE   r  r  r  r  r)  r  owr   r   r   r[     sP   



$r[   r   c                 C   s  |d }|d }|d }|d }|d }	|d }
|d }| }t |d}t jjj|t jjjd	 t|D ]Z}t jjd
| G |d }tt||
d ||||||	|dd
\}}t	||d| }t||}t
t||
|}t	||d| }t||}t ||}W d    n1 sw   Y  q5t jjjdt jjjd	" t j|ddt j|dd }tt j|dd|dd}W d    n1 sw   Y  t||
W  d    S 1 sw   Y  d S )Nnum_semantic_encoder_layersr,   r  rE   r  r  r  rC   r3   r  r  r  r  r   pooling_layerr:   rs  r   T)r#   rV   r$   r%   r&   rJ   r   r  r  r  r  rU   r   r  )rc   rd   r   r   ro   r  r,   r  rE   r  r  r  r  r)  r  r  r  r  r   r   r   rn     sf   




$rn   decoderc	                 C   s  |d }	|d }
|d }|d }|d }|d }|d }| }t jjj|t jjjd t|	D ]}t jjd	| ~ |d
 dkrE|d nd }|d urYt|t|||}t||}tt||d ||
|
|
||||||dd\}}t	||d| }t||}tt|||||
|
|
|||dd
\}}t	||d| }t||}t
t|||}t	||d| }t||}W d    n1 sw   Y  q/t|||fW  d    S 1 sw   Y  d S )Nr!  r,   r  rE   r  r  r  r3   r  rA   r  r  decoder_self_attention)r   r   r)  r  ro   r   encdec_attentionr  )r#   r$   r%   r&   rJ   r   r  r  r  r  r  )r   re   r   encoder_decoder_attention_biasr   r   r   r   ro   r!  r,   r  rE   r  r  r  r  r)  r  r  r  r   r   r   r   H  s~   	





-$r   r        @c              	   C   s  t | d }t | d }t t |t j}|d }tt|t| t |t jd  }|t t t |t j|   }t 	|dt 	|d }	t j
t |	t |	gdd}
t |
ddgdt jj|dgg}
t |
d||g}
| t |
| j S )Nr:   rC   r   r   )r#   r   rO   r   rI   r   logfloatexprV   rN   sincosrQ   r$   r%   modr  r0   )r  min_timescalemax_timescalelengthchannelspositionnum_timescaleslog_timescale_incrementinv_timescalesscaled_timesignalr   r   r   rT     s"   &$rT       ec                 C   s   |d u rt j}|t jkr|j}|dkr&| }d| | }t t |dd}n+|dkrK| }t jt ||gddd}|d|  }t |dd||g}ntd| t 	||S )NrD   r   r:   r   r;   r   r  )
r#   rI   rO  rV   linalg	band_partr'  r  rm   rO   )r   r  infr0   r   r  r	  lower_triangler   r   r   rY     s"   
rY   c           	      C   s   |}|   j}| jj}|d }|d d |g |r|| nd g }t| tt| d d |dggd}|| d|d gdd td|d D  |g }t	||S )Nr;   r   r:   c                 S   s   g | ]}|qS r   r   )r   r   r   r   r   r     s    zsplit_heads.<locals>.<listcomp>)
r  r  r   r  r#   r  rN   r  r   	transpose)	r  r  n	old_shaper  lastr  r  permr   r   r   split_heads  s   
$*
*r  r-   c              	   C   s  t jjj|d| ||gd t | }|d |d |d |d f\}}	}
}t |d }t |d }|d ur@|d |d }}|d u rMt j| |d	d
}n9t j| |d	d
}t t | g d|
||	 |g} t | t |g d}t t |g d||	|
|g}|| }|d ur||7 }t jj	|dd}|dkrt j
|d| }|d u rt |||fW  d    S t ||}t t |g d|
||	 |g}t ||}t t |g d||	|
|g}|| }t t |g d||	|
|g}||fW  d    S 1 sw   Y  d S )Ndot_product_attentionr  r   r:   rC      rpr_krpr_vTr  )rC   r   r:   r  )r   rC   r:   )r:   r   rC   r   rw   r-   r   )r#   r$   r%   r&   r   r  r  r  rW   softmaxrZ   )qrE  r   r  dropout_ratero   rprq_shapebshdlqdklkdvr  r  r   logits_part1logits_part2r  outputs_part1outputs_part2r   r   r   r   r    sd   
$%
&r  c                 C   s   t | g d} |  j}|dd  \}}|d d |r"|r"|| nd g }t | t t | d d dggd} | | | S )N)r   rC   r:   r  r;   r   )r#   r  r  r  r  rN   r   r  )r  r  abr  r   r   r   combine_heads  s   
"(
r2  
create_rprc           	      C   s   t |G t t |ddg}t t |ddg}|| }|| }t |d}t |d| }|| d d d f }t | |}|W  d    S 1 sOw   Y  d S )Nr;   r:   r   rC   )r#   r   r  r   maximumminimumrR   )	orginal_varlength_q	length_kvr  ro   idxsidysidsr#  r   r   r   r3    s   $c              	   C   sb  || dkrt dt|f || dkrt dt|f tjjj|d| |gd |d u rIt| |d | dddd	}tj||||gdd
\}}}n t| |dddd	}t||| dddd	}tj|||gdd
\}}|d ur|tj	||
 |gdd
 }||
< |	d urtj	|	|
 |gdd
 }|	|
< t
||}t
||}t
||}|| }||d 9 }t|d }t|d }|d u r|d urtjjdd| d || g}tjjdd| d || g}t||||}t||||}||d}t||||||d\}}n
t|||||\}}t|}t|d}t||dddd	}||fW  d    S 1 s*w   Y  d S )Nr   zFKey size (%d) must be divisible by the number of attention heads (%d).zHValue size (%d) must be divisible by the number of attention heads (%d).r  r  rC   Tqkv_transform)rp   r   q_transformkv_transformr:   r.   r  r  )r  r  )r#  output_transform)rm   key_size
value_sizer#   r$   r%   r&   r  splitrN   r  r   rK   r3  r  r2  r  )queriesmemoriesr  	key_depthvalue_depthoutput_depthr  r"  r   r   r)  r  ro   combinedr!  rE  r   key_depth_per_headr7  r8  r  r  r#  r  r  r   r   r   r    s   




&r  c                 C   s   | d dkr| d }t jj| |S | d dkr#t jjd| d S | d dkr5t jjj| d dddS | d d	krGt jjj| d dddS td
| d  )Nr7   uniforminitializer_scaler   r-   normal_unit_scalingfan_avg)r  distributionuniform_unit_scalingzUnrecognized initializer: %s)r#   r$   r%   random_uniform_initializerrH   variance_scaling_initializerrm   )r   max_valr   r   r   r   m  s(   
r   c                 C   s   |d dv r4t j|t jd}t j|d t jd}|d d }|t |d |d  |d d  }| | S |d d	krNt jjjt j|t jd|d
 |d S |d dkrV| S t	d)Nlearning_rate_decay)linear_warmup_rsqrt_decaynoamr/   warmup_stepsr,   r.   r:   g      piecewise_constantlearning_rate_boundarieslearning_rate_valuesr  zUnknown learning_rate_decay)
r#   rO   rI   r5  r$   r%   r   rW  rS   rm   )r   r   r   steprV  
multiplierdecayr   r   r   r     s    

r   c           
      C   sv   g }t |  D ]2}g }|D ]\}}t|d}|| qtjd|d}t|d}|d d }||f}	||	 q|S )Nr   )r   r  r:   )r  r#   rV   r   rN   r  )
tower_gradsaverage_gradsgrad_and_varsr  gr  
expanded_ggradr   grad_and_varr   r   r   r     s   r   c                 C   s   t d u r| S t j| t jjdS )N)compression)_ENGINE	allreduceCompressionfp16)r=   r   r   r   
all_reduce  s   ri  c                       sT   e Zd Z			 d fdd	Zdd Zdejjjj	j
dddfdd	Zdd
dZ  ZS )r   r:   Fc                    s2   t t| || || _|| _tj|dd| _d S )NrZ  rw   )r   r   r   
_optimizer_stepr#   r   _step_t)r   r   rZ  use_lockingro   r   r   r   r     s   zMultiStepOptimizer.__init__c                 C   sl   t | jd $ |d u r|W  d    S t|t jr!t |}t|W  d    S 1 s/w   Y  d S )N
_Allreduce)r#   r   _namer  IndexedSlicesr   ri  )r   r=   r   r   r   _all_reduce  s   
$zMultiStepOptimizer._all_reduceNc                    s&  j ||||||}tt| \}}jdkr(fdd|D }tt||S t|dd d}	jjdkr9dndd|	d	}
g }t||D ]E\}}|d
j t	|t
jret
j |j|jjd n	t
j |jd  fdd} fdd}t
t
|
d||}|| qFtt||S )Nr:   c                    s   g | ]}  |qS r   )rq  )r   rz  )r   r   r   r     s    z8MultiStepOptimizer.compute_gradients.<locals>.<listcomp>c                 S   s   | j S r   rw   )r  r   r   r   r     s    z6MultiStepOptimizer.compute_gradients.<locals>.<lambda>)keyr   iter)initial_valuero   colocate_withgrad_accrm  c                      r   r   r   r   )rv  r   r   	_acc_grad  s   z7MultiStepOptimizer.compute_gradients.<locals>._acc_gradc                      s     j S r   )rq  rk  r   rv  r   r   r   	_avg_grad  s   z7MultiStepOptimizer.compute_gradients.<locals>._avg_grad)rj  r   r  r  rk  rO  _create_non_slot_variable_zeros_slotro  r  r#   rp  scatter_addr  r  _use_locking
assign_addr   rN  r   )r   r"   r   gate_gradientsaggregation_methodr   	grad_lossr  r  	first_variter_var	new_gradsrb  varrx  rz  r   ry  r   r     s@   
z$MultiStepOptimizer.compute_gradientsc           	         s   j dkrjj| dS tt| \fdd} fdd}dt }tt	|d||}t
|g |jt|d jjd	}W d    n1 sYw   Y  tj||g S )
Nr:   rw   c                      s
   t j  S r   )r#   groupr   )r  r   r   _pass_gradients  s   
z;MultiStepOptimizer.apply_gradients.<locals>._pass_gradientsc                     s   j t } t| g( g }D ]}|d}||jt|j	d qtj
| }W d    n1 s<w   Y  tj
| |g S )Nrv  rw  )rj  r  r  r#   control_dependenciesget_slotr   assignrL   r~  r  )opzero_opsr  rv  zero_opr   r  ro   r   r   r   r   _apply_gradients  s    	z<MultiStepOptimizer.apply_gradients.<locals>._apply_gradientsrs  r   rw  )rk  rj  r  r  r  _get_non_slot_variabler#   get_default_graphr   rN  r  r  r  rl  r~  r  )	r   r  r   ro   r  r  r  	update_opiter_opr   r  r   r    s$   
z"MultiStepOptimizer.apply_gradients)r:   Fr   r  )r  r  r  r   rq  r#   r$   r%   r   	OptimizerGATE_OPr   r  r  r   r   r   r   r     s    

2r   c                    s   t | } t | d  g }t d- tD ] }|t t t jj	
 | fdd fdd qW d    n1 sCw   Y  t j| |ddS )Nr   z/cpu:0c                      s     d S r  r   r   r+  num_datashardsr   r   r   #  s    z shard_features.<locals>.<lambda>c                      s     S r   r   r   r  r   r   r   $  s    r   )r#   r   r   r   r   r   r   rv  r$   r%   r  rB  )r  r  size_splitsr   r   r  r   r     s$   
	r   r   )TNN)r  NN)r   r   )r  N)r-   NN)r3  )NNr   NN)5r   collectionsr   typingr   
tensorflowr#   modelscope.metainfor   modelscope.models.baser   r   modelscope.models.builderr   modelscope.utils.constantr   __all__register_moduletranslationr	   rP  r/  r  r:  r3  rG  r  r  r  r  r  r  r[   rn   r   rT   rY   r  r  r2  r3  r  r   r   r   re  ri  r$   r%   r   r  r   r   r   r   r   r   <module>   s         
)	


+
	


.
6

C

=

Ol