o
    ߥi                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZ d dl
mZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) G dd dej*Z+G dd dej*Z,G dd dej*Z-dd Z.G dd dej*Z/G dd dej*Z0G dd dZ1dd  Z2d!d" Z3e
j4j5d#e
j6d$e
j6d%e
j6d&e7d'e
j6f
d(d)Z8e
j4j5d#e
j6d$e
j6d%e
j6d&e7d'e
j6f
d*d+Z9G d,d- d-ej*Z:G d.d/ d/ej*Z;G d0d1 d1ej*Z<d2d3 Z=d4d5 Z>G d6d7 d7eZ?d8d9 Z@d:d; ZAdMd>d?ZBG d@dA dAZCdBdC ZDdDeeEe
j6f dEe?dFeFd'eeEe
j6f fdGdHZGG dIdJ dJee)ZHG dKdL dLZIdS )N    N)OrderedDict)CallableDictListOptionalUnion)get_argsmpu)get_global_memory_buffer)AttnMaskTypeFloat16Module	LayerNormbias_gelu_impl)FusedScaleMaskSoftmax)nn)
functional)PreTrainedModel)
TorchModel)
GPT3Config)TextGenerationModelOutputTokenGeneratorOutput)init_megatron_util)pre_load)StreamingOutputMixinc                       s(   e Zd ZdZ fddZdd Z  ZS )GPT3ParallelMLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    c                    sR   t    tj|j|jd|dd| _|j| _tj	| _
tj|j|jd|dd| _d S )NFT)gather_outputinit_methodskip_bias_addinput_is_parallelr   r   )super__init__r	   ColumnParallelLinearhidden_sizeffn_hidden_sizedense_h_to_4hbias_gelu_fusionFgeluactivation_funcRowParallelLineardense_4h_to_hselfconfigr   output_layer_init_method	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/gpt3/distributed_gpt3.pyr!   /   s"   
zGPT3ParallelMLP.__init__c                 C   sD   |  |\}}| jrt||}n| || }| |\}}||fS N)r%   r&   r   r)   r+   )r-   hidden_statesintermediate_parallelbias_paralleloutputoutput_biasr2   r2   r3   forwardE   s   zGPT3ParallelMLP.forward__name__
__module____qualname____doc__r!   r:   __classcell__r2   r2   r0   r3   r   '   s    r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )GPT3Embeddinga  Language model embeddings.

    Arguments:
        hidden_size: hidden size
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        init_method: weight initialization method
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    c                    sr   t    |j| _|| _tj|j| j| jd| _t	|j
| j| _| | jj |j| _|j| _t|j| _d S )N)r   )r    r!   r#   r   r	   VocabParallelEmbedding
vocab_sizeword_embeddingsr   	Embeddingmax_position_embeddingsposition_embeddingsweightfp32_residual_connectionsequence_parallelDropouthidden_dropoutembedding_dropout)r-   r.   r   r0   r2   r3   r!   e   s   
zGPT3Embedding.__init__c                 C   s8   | j jjd d| j j_| jjjd d| jj_dS )z%Zero out all parameters in embedding.r   TN)rD   rH   datafill_sharedrG   r-   r2   r2   r3   zero_parametersz   s   
zGPT3Embedding.zero_parametersc                 C   s   |  |}| |}|| }|dd }| jr| }| jrCt|}t	 
  | |}W d    |S 1 s<w   Y  |S | |}|S )Nr      )rD   rG   	transpose
contiguousrI   floatrJ   r	   #scatter_to_sequence_parallel_regionget_cuda_rng_trackerforkrM   )r-   	input_idsposition_idswords_embeddingsrG   
embeddingsr2   r2   r3   r:      s    




zGPT3Embedding.forward)r<   r=   r>   r?   r!   rR   r:   r@   r2   r2   r0   r3   rA   W   s
    rA   c                       s,   e Zd Z fddZ			dddZ  ZS )NoopTransformerLayerc                    s   t    || _d S r4   )r    r!   layer_numberr-   r_   r0   r2   r3   r!      s   

zNoopTransformerLayer.__init__Nc                 C   s   |  S r4   )clone)r-   r5   attention_maskencoder_outputenc_dec_attn_maskinference_paramsr2   r2   r3   r:      s   zNoopTransformerLayer.forwardNNN)r<   r=   r>   r!   r:   r@   r2   r2   r0   r3   r^      s    r^   c                 C   s   |  |d | S )Ng     )masked_fill_)attention_scoresrb   r2   r2   r3   attention_mask_func   s   ri   c                       s*   e Zd Zejf fdd	Zdd Z  ZS )GPT3CoreAttentionc                    s   t    |j| _|j| _|j| _|j| _| jrd| _td|| _|| _|j	| _	|j
|j }t }t||| _t||j| _t|j|| _d }t| j| _| jr_| j}|  j|9  _t| j| j| j|jt| j|| _t|j| _d S )NTrS   )r    r!   fp16bf16apply_query_key_layer_scalingattention_softmax_in_fp32maxr_   attn_mask_typerJ   kv_channelsnum_attention_headsr	   $get_tensor_model_parallel_world_sizedividehidden_size_per_partitionhidden_size_per_attention_head!num_attention_heads_per_partitionmathsqrtnorm_factorr   masked_softmax_fusionri   scale_mask_softmaxr   rK   attention_dropout)r-   r.   r_   rp   projection_size
world_sizecoeffr0   r2   r3   r!      s@   
zGPT3CoreAttention.__init__c                 C   s  | d| d| d| df}||d |d |d  d}||d |d |d  d}t |d |d  |d |d f|jd}tj||dd|dddddd| j d	}|j| }| 	||}	| j
st   | |	}	W d    n1 sw   Y  n| |	}	| d| d| d| df}|| d|d |d  d}|	|d |d  |d d}	t|	|dd}
|
j| }
|
dddd }
|
  d d
 | jf }|
j| }
|
S )NrS      r      r	                 ?)betaalpha)sizeviewr
   
get_tensordtypetorchbaddbmmrT   rz   r|   rJ   r	   rX   rY   r}   bmmpermuterU   ru   )r-   query_layer	key_layervalue_layerrb   output_sizematmul_input_buffermatmul_resultrh   attention_probscontext_layernew_context_layer_shaper2   r2   r3   r:      s^   







zGPT3CoreAttention.forward)r<   r=   r>   r   paddingr!   r:   r@   r2   r2   r0   r3   rj      s    +rj   c                       s2   e Zd ZdZ fddZdd Zd	ddZ  ZS )
GPT3ParallelAttentionzParallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    c                    s   t    td|| _|j| _|j|j }t }t	||j| _
t	|j|| _tj|jd| d|d| _t|| j| _tj||jd|dd| _d S )NrS   r   F)r   r   Tr   )r    r!   ro   r_   params_dtyperq   rr   r	   rs   rt   rv   rw   r"   r#   query_key_valuerj   core_attentionr*   dense)r-   r.   r   r/   r_   r~   r   r0   r2   r3   r!   6  s2   
zGPT3ParallelAttention.__init__c                 C   s"   t j||| j| j| jt j dS )Nr   device)r   emptyrw   rv   r   cudacurrent_device)r-   inference_max_sequence_len
batch_sizer2   r2   r3   _allocate_memoryV  s   z&GPT3ParallelAttention._allocate_memoryNc                 C   sn  |r+| j |jvr#|j}|j}| ||}| ||}||f|j| j < n|j| j  \}}| |\}}	| d d | jd| j f }
|j	|
 }t
|d\}}}|r|j}||d }||dkseJ |j}||d }||dksxJ ||||||df< ||||||df< |d |||df }|d |||df }| ||||}| |\}}||fS )Nr   r   rS   r   .)r_   key_value_memory_dictmax_sequence_lenmax_batch_sizer   r   r   rw   rv   r   r	   split_tensor_along_last_dimbatch_size_offsetsequence_len_offsetr   r   )r-   r5   rb   re   inf_max_seq_leninf_max_batch_sizeinference_key_memoryinference_value_memorymixed_x_layer_new_tensor_shaper   r   r   batch_start	batch_endsequence_startsequence_endr   r8   biasr2   r2   r3   r:   _  sd   


zGPT3ParallelAttention.forwardr4   )r<   r=   r>   r?   r!   r   r:   r@   r2   r2   r0   r3   r   /  s
     	r   c                   @   s&   e Zd ZdddZdd Zdd ZdS )	nullcontextNc                 C   s
   || _ d S r4   enter_result)r-   r   r2   r2   r3   r!        
znullcontext.__init__c                 C   s   | j S r4   r   rQ   r2   r2   r3   	__enter__  s   znullcontext.__enter__c                 G   s   d S r4   r2   )r-   excinfor2   r2   r3   __exit__  s   znullcontext.__exit__r4   )r<   r=   r>   r!   r   r   r2   r2   r2   r3   r     s    
r   c                 C   s    t j| | ||d}|| }|S )N)ptraining)r'   dropout)xr   residualprobr   outr2   r2   r3   bias_dropout_add  s   r   c                        fdd}|S )Nc                    s   t | ||| S r4   r   r   r   r   r   r   r2   r3   _bias_dropout_add     z/get_bias_dropout_add.<locals>._bias_dropout_addr2   )r   r   r2   r   r3   get_bias_dropout_add  s   r   r   r   r   r   returnc                 C      t | |||dS )NTr   r   r2   r2   r3   bias_dropout_add_fused_train     r   c                 C   r   )NFr   r   r2   r2   r3    bias_dropout_add_fused_inference  r   r   c                       s*   e Zd ZdZ fddZdddZ  ZS )GPT3ParallelTransformerLayerzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    c                    s   t    || _|j| _|j| _|j| _t|j|j|j	|j
d| _t||||| _|j| _|j| _t|j|j|j	|j
d| _t|||| _ttjdd }ttjdd }|dkpd|dkod|dk}|rlt| _d S tj| _d S )Nepsno_persist_layer_normrJ   .r   rS   
   )r    r!   r_   (apply_residual_connection_post_layernormrl   rI   r   r#   layernorm_epsilonr   rJ   input_layernormr   self_attentionrL   bias_dropout_fusionpost_attention_layernormr   mlpintr   __version__splitr   enable_gradbias_dropout_add_exec_handler)r-   r.   r   r/   r_   TORCH_MAJORTORCH_MINORuse_nvfuserr0   r2   r3   r!     sH   
z%GPT3ParallelTransformerLayer.__init__Nc                 C   s
  |  |}| j|||d\}}| jr|}n|}| jr#| jr t}nt}nt| j}|   |||	||| j
}	W d    n1 sBw   Y  | |	}| |\}
}| jrY|}n|	}|   ||
|	||| j
}W d    n1 suw   Y  tj||jdd}|S )Nre   T)inprequires_grad
keep_graph)r   r   r   r   r   r   r   r   r   	expand_asrL   r   r   r	   make_viewless_tensorr   )r-   r5   rb   re   layernorm_outputattention_outputattention_biasr   bias_dropout_add_funclayernorm_input
mlp_outputmlp_biasr8   r2   r2   r3   r:     sH   




z$GPT3ParallelTransformerLayer.forwardr4   r;   r2   r2   r0   r3   r     s    -r   c                       s:   e Zd ZdZ			d
 fdd	Zdd Zddd	Z  ZS )GPT3ParallelTransformerzTransformer class.Tc                    s   t    j| _j| _|| _|| _|| _d | _j| _j	| _
fdd | j
dkr<d| _
tjtdg| _ntj fddt| j
D | _| jrb| jrdtjjjjd| _d S d S d S )Nc                    s   t  | S r4   )r   )r_   )r.   r   r/   r2   r3   build_layerX  s   z5GPT3ParallelTransformer.__init__.<locals>.build_layerr   rS   c                    s   g | ]} |d  qS rS   r2   .0i)r   r2   r3   
<listcomp>b      z4GPT3ParallelTransformer.__init__.<locals>.<listcomp>r   )r    r!   rl   rI   post_layer_normpre_processpost_processinput_tensorrJ   num_hidden_layers
num_layersr   r   
ModuleListr^   layersranger   r#   r   r   final_layernorm)r-   r.   r   r/   r   r   r  r0   )r   r.   r   r/   r3   r!   B  s0   

z GPT3ParallelTransformer.__init__c                 C   s
   | j | S r4   )r  r`   r2   r2   r3   
_get_layerl  r   z"GPT3ParallelTransformer._get_layerNc                 C   s   | j s| j}tj|ddd}| jrt  }nt }| t| j	D ]}| 
|}||||d}q#W d    n1 s<w   Y  | jrL| jrL| |}|S )NT)r   r   r   )r   r  r	   r   rJ   rX   rY   r   r  r  r	  r  r   r  )r-   r5   rb   re   rng_contextindexlayerr2   r2   r3   r:   o  s.   


zGPT3ParallelTransformer.forward)TTTr4   )r<   r=   r>   r?   r!   r	  r:   r@   r2   r2   r0   r3   r   ?  s    *r   c                       s.   e Zd ZdZ fddZ		dddZ  ZS )GPT3TransformerLanguageModela  Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    c                    s@   t    |j| _|| _d | _t|| j| _t|| j|| _d S r4   )	r    r!   r#   r   encoder_hidden_staterA   	embeddingr   encoderr,   r0   r2   r3   r!     s   

z%GPT3TransformerLanguageModel.__init__Nc                 C   sL   |  ||}|d u r| jd ur| j|||d}|S | j}|S ||j}|S )Nr   )r  r  r  tor   )r-   enc_input_idsenc_position_idsenc_attn_maskre   enc_hidden_statesencoder_inputrc   r2   r2   r3   r:     s   
	z$GPT3TransformerLanguageModel.forward)NNr;   r2   r2   r0   r3   r    s    r  c                    r   )z!Init method based on N(0, sigma).c                       t jj| d dS Nr   )meanstdr   initnormal_tensorsigmar2   r3   init_     z!init_method_normal.<locals>.init_r2   )r!  r"  r2   r   r3   init_method_normal  s   r$  c                    s"   | t d|    fdd}|S )z3Init method based on N(0, sigma/sqrt(2*num_layers).g       @c                    r  r  r  r  r  r2   r3   r"    r#  z(scaled_init_method_normal.<locals>.init_)rx   ry   )r!  r  r"  r2   r%  r3   scaled_init_method_normal  s   r&  c                       sF   e Zd ZeZ fddZdd Zedd Z				d
dd	Z	  Z
S )	GPT3Modelc                    s.   t  | t|t|jt|j|j| _d S r4   )r    r!   r  r$  init_method_stdr&  r  language_model)r-   r.   r0   r2   r3   r!     s   

zGPT3Model.__init__c                 C   s   | j jjjS r4   )r)  r  rD   rH   rQ   r2   r2   r3   word_embeddings_weight  s   z GPT3Model.word_embeddings_weightc                 C   s\   |  d}ttjdd||f| jd}|dk }tj|tj| jd}|d| }||fS )NrS   r   g      ?r   r   )	r   r   trilonesr   arangelong	unsqueezer   )tokens
seq_lengthrb   r[   r2   r2   r3   %build_attention_mask_and_position_ids  s   

z/GPT3Model.build_attention_mask_and_position_idsNc                 K   s   |d u r|d u r|  |\}}| j||||d}tj||  d dd| jj}d }	|d urG|dd	 }t
|  |}	|	dd	 }	t|}
|
dd	 }
|
|	fS )Nr   FTr   rS   )r3  r)  r	   /LinearWithGradAccumulationAndAsyncCommunicationapplyr*  r.   rJ   rT   rU   vocab_parallel_cross_entropyra   rV   (gather_from_tensor_model_parallel_region)r-   rZ   rb   r[   re   labelskwargs	lm_outputlogits_parallellosseslogitsr2   r2   r3   r:     s.   
zGPT3Model.forward)NNNN)r<   r=   r>   r   config_classr!   r*  staticmethodr3  r:   r@   r2   r2   r0   r3   r'    s    
r'  c                 C   s,   | t | |d d k }| |td dS )z-Set the logits for none top-k values to -inf.r   ).r   N-InfN)r   topkrg   rV   )r=  top_kfilter_r2   r2   r3   !modify_logits_for_top_k_filtering,  s   rD  c                 C   s   t j| dd\}}|jddjdd}||k}|ddddf  |ddddf< d|d< |d||}| |td	 dS )
z-Set the logits for none top-p values to -inf.T
descendingr   dimNrS   r   ).r   r@  )r   sortsoftmaxcumsumra   scatterrg   rV   )r=  top_psorted_logitssorted_indicescumulative_probsrC  r2   r2   r3   !modify_logits_for_top_p_filtering3  s   (rQ  r   r   c                 C   s   | j dks	J d|dkr|dksJ dtj| dd}nU|  } |dkr*| | |dkrQ|dks6J d	|| dksAJ d
|rK||k sKJ dt| | n|dkrb|dks]J dt| | | jdd}tj	|dd
d}|r~tj|d|d d}|S )a9   Sample and generate a token.
    Note: logits has the dimension [b, v] where b is the batch size
          and v is the vocabulary size.
    If vocab_size is provided, we will make sure the sample that is
    generated is in [0, vocab-size). This will avoid out of vocabulary
    generations due to padding.
    r   z*expected the logits to be of [b, v] shape.rS   r   z+cannot set both greedy and top-p samplings.r   rG  r   z*cannot set both top-k and top-p samplings.z top-k is larger than logit size.z top-k is larger than vocab size.ztop-p should be in (0, 1].)num_samplesr   )minro   )ndimr   argmaxra   div_r   rD  rQ  rJ  multinomialr   clamp)r=  rB  rM  temperaturerC   samplesprobsr2   r2   r3   sampleI  s*   


r\  c                   @   s    e Zd ZdZdd Zdd ZdS )InferenceParamszInference parameters that are passed to the main model in order
    to efficienly calculate and store the context during inference.c                 C   s"   || _ || _d| _d| _i | _dS )zNote that offsets are set to zero and we always set the
        flag to allocate memory. After the first call, make sure to
        set this flag to False.r   N)r   r   r   r   r   )r-   r   r   r2   r2   r3   r!   }  s
   
zInferenceParams.__init__c                 C   s|   t | jdkrtd| j D ]+}| j| \}}t ||jd ks$J |dd|f }|dd|f }||f| j|< qdS )zswap between batchesr   z"should not swap when dict in emptyrS   N)lenr   
ValueErrorkeysshape)r-   	batch_idxr_   r   r   new_inference_key_memorynew_inference_value_memoryr2   r2   r3   swap_key_value_dict  s   

z#InferenceParams.swap_key_value_dictN)r<   r=   r>   r?   r!   re  r2   r2   r2   r3   r]  y  s    
r]  c           
      C   sj   t j| ||}t j||}tj| ||d}g }t|D ]}tj||d | |d}	||	 q|S )NrG  )	r	   utilsrt   r   r   r   r  catappend)
r  num_partitionspartition_dimstrideper_partition_sizeper_partition_per_stride_sizepartitions_list
partitionsr   	partitionr2   r2   r3   split_into_partitions  s    
rq  
state_dictmodelro  c                 C   sh   |dkr| S t  }| D ]#\}}|j| | jkrqt|jd}|j}t| | |||| | |< q| S )NrS   r   )r	   get_tensor_model_parallel_ranknamed_parametersra  ro   rj  partition_striderq  )rr  rs  ro  rankname
parametersrH  rk  r2   r2   r3   split_state_dict  s   

rz  c                
       s   e Zd Z	d%dd fddZd&def fdd	Z					d'd
dZ				d(ddZd)ddZe	
 d&ddZe	
 dd Zd*ddZ	d&dddefddZ			d+deeejf d eeee f d!ed"ee f fd#d$Z  ZS ),DistributedGPT3rs  N)megatron_cfgc                   s   t  j|g|R i | t|||d t|| _t| j}| D ]}t	| q#|
tj
  | jjs;| jjrAt|| j}|| _t }	t dd }
|
d u rU|	n|
}
t |
 |	 }t|||d}t|||	|
 }| jj||ddd d | _d S )N)rw  %checkpoint_tensor_model_parallel_size)tagstrictT)r  )r    r!   r   r   from_pretrainedr.   r'  ry  r	   8set_defaults_if_not_set_tensor_model_parallel_attributesr   r   r   rk   rl   r   
dist_modelrs   r   getrt  r   rz  load_state_dictre   )r-   	model_dirrw  path_load_tagr|  argsr9  rs  param	tensor_wsckpt_ws	ckpt_rank
load_modelr0   r2   r3   r!     s(   

zDistributedGPT3.__init__Tmodec                    s   |rd | _ t |S r4   )re   r    train)r-   r  r0   r2   r3   r    s   zDistributedGPT3.trainc                 C   s.  | j |||| j|d\}}d }	|d u r| j j|d7  _nrtj| tj|jd}
|d u rAt|D ]\}}d|
||d f< q3n&t|D ]\}}d|
||d d f< qEt|D ]\}}d|
|d |d f< qX| }|
	d }
|

 }|dkrt
|	d }	nt
|	d|
 | }	t||	dS )N)re   r8  rS   r   r   r   )r=  loss)r  re   r   r   r   r-  rV   r   	enumerater   sumzero_r   )r-   r1  rb   r[   r8  prompts_len
inputs_lenr=  r<  r  	loss_maskr   lmask_sumr2   r2   r3   r:     s8   
zDistributedGPT3.forwardFc           #      k   s   | d| jj}| d| jj}| d| jj}	| d|d| jj }
|d}|}|d u r=tj|dg|j	d}|
  }t
|
| jj}||krRtd||d }|dkrqtj|||j	d }tj||fd	d
}t||| _| jj}tj|tjtj d}t|\}}d}t||D ]}|d d ||f }|d d ||f }|d||d |f }| |||j}|d d d	d d f }t||||	| jjd}||k}|| |||f< t|d d d |d f dV  |}|r|dk | @ }|dk |d d |d f dk @ | @ }||B } n&|r7|dk | @ }|dk | @ }!||!B } n
||k | @ } || B }t|}"|rS|"rS d S qd S )NrB  rM  rY  
max_lengthrS   r   r+  -context length + tokens_to_generate too larger   rG  r   .)rB  rM  rY  rC   )	sequencesit     ) popr.   rB  rM  rY  r   tokens_to_generater   r  r   rS  itemrF   r_  zerosr/  rg  r]  re   eod_iduint8r   r   r'  r3  r  r=  r\  rC   r   byteall)#r-   r1  r  #use_eod_token_for_early_terminationstop_on_double_eolstop_on_eolr9  rB  rM  rY  r  r   lengthsmin_prompt_lengthmax_sequence_length
pad_lengthpadstermination_idis_generation_donerb   r[   prev_context_lengthcontext_length
tokens2usepositions2useattention_mask2user=  last_token_logits
new_samplestartedhit_double_eolhit_two_eols
done_tokenhit_eoldoner2   r2   r3   r\  
  s   
	




zDistributedGPT3.sample   rS   c           $         s  | d}|dksJ |dtj| dg|jd }| jj}tjd| jj	|jd
 | }tj||fdd}| d}	t|	| jj}	||	krMtdt||	| _t|}
d}tj|tjtj d	d}||d}t|\}}d}t||	D ]}|d d ||f }|d d ||f }|d
||d |f }| |||j}| d}tj|dd}|d d dd d f | }||krtj|dd d f dd\}}ntj|ddd\}}t |d d|  |! 
 }|d d|  | }|d d|  }g }t"t#|||D ]9\}\}} }!| |kr1||k}"|"r!q
|
$||! % | |d |  n|&|| |!f t'||krB nq
|
(|)  |d | rWd} n8|*dd |D }#||#d d f }|*dd |D |d d |f< |*dd |D d}| j+|# |}q||st|D ]}!|
$||! % ||! |d |  qt,|
j-dd dd t|t' } fddt|D } fddt|D }tj.|dd}tj.|dd}t/||dS )Nr   rS   prompt_lengthr+  r   rG  r  Fr   .r   TrE  c                 S      g | ]}|d  qS )r   r2   r   r  r2   r2   r3   r         z/DistributedGPT3.beam_search.<locals>.<listcomp>c                 S   r  r   r2   r  r2   r2   r3   r     r  c                 S   r  r   r2   r  r2   r2   r3   r     r  c                 S   s   | d S )Nr   r2   )r   r2   r2   r3   <lambda>  s    z-DistributedGPT3.beam_search.<locals>.<lambda>)keyreversec                       g | ]} | d  qS r  r2   r   sorted_hypsr2   r3   r     r   c                    r  r   r2   r   r  r2   r3   r     r   )r  scores)0r   r  r   r  r   r  r.   r  r-  r  r/  rg  rS  rF   r_  r]  re   BeamHypothesesr  float32r   r   r0  repeatr'  r3  r  r=  r'   log_softmaxrI  r   divtruncr  zipaddra   rh  r^  is_donero   newre  sortedbeamsstackr   )$r-   r1  	beam_sizenum_return_genr9  r   r  
stop_tokenr  final_sequence_lengthbeam_hypr  r  rb   r[   r  r  r  r  r  r=  rC   	log_probs
new_scoressorted_scoresindicesbest_beam_ids
best_wordsbest_scores
next_beamsbeam_token_ranktoken_id
beam_scorebeam_id&is_beam_token_worse_than_top_num_beamsbest_batchesr2   r  r3   beam_searcht  s   








zDistributedGPT3.beam_searchc                 O   sF   |rd }| j |g|R i |D ]}|}q|S | j|g|R i |S r4   )r\  r  )r-   r1  	do_sampler  r9  last_outputr8   r2   r2   r3   generate  s   zDistributedGPT3.generatec                 O   s   | j |g|R i |S r4   )r\  )r-   r1  r  r9  r2   r2   r3   stream_generate  s   zDistributedGPT3.stream_generate c                 C   s   | j |||S r4   )r  rr  )r-   destinationprefix	keep_varsr2   r2   r3   rr    r   zDistributedGPT3.state_dictrr  zOrderedDict[str, torch.Tensor]r  c                 C   s   | j ||S r4   )r  r  )r-   rr  r  r2   r2   r3   r    s   zDistributedGPT3.load_state_dicttarget_foldersave_checkpoint_namessave_functionr.   c                    s   d|d d< |d  dd  |d  dd  |d  dd  |d  dd  t j}t j}|| |d d	< t j||||fi |S )
Nzgpt3-generationpipelinetypers  rw  r|  megatronr}  r   )r  r   tensor_model_parallel_sizepipeline_model_parallel_sizer    save_pretrained)r-   r  r  r  r.   r9  tp_sizepp_sizer0   r2   r3   r    s   
zDistributedGPT3.save_pretrained)rs  )T)NNNNN)NTFF)r  rS   )Nr  Frf   )r<   r=   r>   r!   boolr  r:   r\  r  r   no_gradr  r  rr  r  r   strosPathLiker   r   r   dictr  r@   r2   r2   r0   r3   r{    sR    &
*

jq	


r{  c                   @   sh   e Zd Z		ddededefddZdd	 Z	
ddej	dede
ej	 fddZdededefddZd
S )r  r   F	num_beamslength_penaltyearly_stoppingc                 C   s"   || _ || _|| _g | _d| _dS )z7
        Initialize n-best list of hypotheses.
        g    eAN)r  r  r  r  worst_score)r-   r  r  r  r2   r2   r3   r!     s
   
zBeamHypotheses.__init__c                 C   s
   t | jS )z3
        Number of hypotheses in the list.
        )r^  r  rQ   r2   r2   r3   __len__  s   
zBeamHypotheses.__len__Nhypsum_logprobsbeam_indicesc                 C   s   ||j d | j  }t| | jk s|| jkrL| j|||f t| | jkrCtdd t| jD }| j|d d = |d d | _dS t	|| j| _dS dS )z3
        Add a new hypothesis to the list.
        r   c                 S   s   g | ]\}\}}}||fqS r2   r2   )r   idxsr   r2   r2   r3   r   0  s    z&BeamHypotheses.add.<locals>.<listcomp>r   rS   N)
ra  r  r^  r  r  r  rh  r  r  rS  )r-   r  r  r	  scoresorted_next_scoresr2   r2   r3   r  %  s   zBeamHypotheses.addbest_sum_logprobscur_lenr   c                 C   s8   t | | jk r	dS | jrdS ||| j  }| j|k}|S )z
        If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst
        one in the heap, then we are done with this sentence.
        FT)r^  r  r  r  r  )r-   r  r  	cur_scoreretr2   r2   r3   r  8  s   
zBeamHypotheses.is_done)r   Fr4   )r<   r=   r>   r   rV   r  r!   r  r   
LongTensorr   r  r  r2   r2   r2   r3   r    s(    
	
r  )r   r   r   N)Jrx   r  collectionsr   typingr   r   r   r   r   r   megatron_utilr   r	   megatron_util.global_varsr
   megatron_util.modelr   r   r   r   !megatron_util.model.fused_softmaxr   r   torch.nnr   r'   transformers.modeling_utilsr   modelscope.modelsr   modelscope.models.nlp.gpt3r   modelscope.outputsr   r   modelscope.utils.megatron_utilsr   $modelscope.utils.nlp.load_checkpointr   !modelscope.utils.streaming_outputr   Moduler   rA   r^   ri   rj   r   r   r   r   jitscriptTensorrV   r   r   r   r   r  r$  r&  r'  rD  rQ  r\  r]  rq  r  r   rz  r{  r  r2   r2   r2   r3   <module>   s   0A yna7	
B
0
  ]