o
    ߥi'"                     @   s   d Z ddlZddlmZ ddlm  mZ ddlmZm	Z	 ddl
mZ ddlmZ ddd	ZG d
d dejjZG dd dejjZdd ZdS )zGPT-2 model.    N)mpuprint_rank_0)PromptSpell   )GPT2ParallelTransformer{Gz?c                    s    fdd}|S )zInit method based on normal distribution.

    This is only used for embeddings. The transformer has its
    own initializer.
    c                    s   t jjj| d dS )N        )meanstd)torchnninitnormal_)tensorr
    a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/model/modeling_glm.pyinit_!   s   z!init_method_normal.<locals>.init_r   )r
   r   r   r   r   init_method_normal   s   r   c                       sN   e Zd ZdZ								d fdd		Zdd
dZddddddZ  ZS )GLMModelzGLM Language model.

    The output of the forward method are the logits (parallel or
    serial depending on the `parallel_output` flag.
    r   TFNlstm      ?c                    s   t t|   || _|| _|| _tdd}tj|||d| _	t
|||||	||||
||||d| _|d ur>t|| j|| _d S d S )Nr   r   init_method)attention_scalerelative_encodingblock_position_encoding)superr   __init__parallel_outputoutput_predicthidden_sizer   r   VocabParallelEmbeddingword_embeddingsr   transformerr   prompt_spell)self
num_layers
vocab_sizer!   num_attention_headsembedding_dropout_probattention_dropout_proboutput_dropout_probmax_sequence_lengthmax_memory_lengthcheckpoint_activationscheckpoint_num_layersr   r   r   r    spell_length
spell_funcr   r   	__class__r   r   r   .   s8   

zGLMModel.__init__c                 C   s`   d}| j d | jd |d ur*|d| d7 }t|D ]}| jj| d qt| d S )NzFreeze transformerFz tune z prefix layersT)r#   requires_grad_r$   rangelayersr   )r&   tune_prefix_layerslog_strir   r   r   freeze_transformerc   s   zGLMModel.freeze_transformer)return_memorydetach_memory
prompt_posc                G   s   | d}| |}	|	}
|d ur)|
 }
|  }tj||jdd}||
||f< | j|
|||||d}|\}}|}| j	r\t
|}t|| jj}| jrS|g|R S t
|g|R S |g|R S )Nr   )devicer   )r<   r=   )sizer#   cloner%   r   aranger?   	unsqueezer$   r    r   copy_to_model_parallel_regionFlinearweightr   !gather_from_model_parallel_region)r&   	input_idsposition_idsattention_maskr<   r=   r>   mems
batch_sizewords_embeddings
embeddingsprompt_embedsbatch_indextransformer_outputlogitshidden_layersoutputslogits_parallelr   r   r   forwardm   sB   
	


zGLMModel.forward)r   TFFTNr   r   N)__name__
__module____qualname____doc__r   r;   rW   __classcell__r   r   r3   r   r   '   s     
5r   c                       s0   e Zd ZdZ			d fdd	Zdd Z  ZS )	EncoderDecoderzSeq2Seq Transformer Model
    The output of the forward method are the logits (parallel or serial depending on the `parallel_output` flag).
    r   Tc                    sv   t t|   || _|| _tdd}tj|||d| _t	|||||	||||
|
| _
t	|||||	||||
|dd| _d S )Nr   r   r   T)use_decoder_layer)r   r^   r   r   r    r   r   r"   r#   r   encoderdecoder)r&   r'   r(   r!   r)   r*   r+   r,   r-   r.   r/   r0   r   r    r   r3   r   r   r      s2   
zEncoderDecoder.__init__c                 C   sv   |  |}|  |}| |||\}	}
| |||\}}
| jr8t|}t|| j j}| j	r2|fS t
|fS |fS rX   )r#   r`   ra   r    r   rD   rE   rF   rG   r   rH   )r&   
source_ids
target_idssource_position_idstarget_position_idssource_masktarget_masksource_embeddingstarget_embeddingsencoder_output_decoder_outputoutput_parallelrV   r   r   r   rW      s"   


zEncoderDecoder.forward)r   TT)rY   rZ   r[   r\   r   rW   r]   r   r   r3   r   r^      s    +r^   c                 C   s   dg i}g dd}|   D ]@}t|tjtjjfr+|d dd t|j	 D  q|d dd t|j
 D  |d dd t|j
 D  q||fS )Nparamsr   )rn   weight_decayc                 S   s   g | ]}|d ur|j r|qS rX   requires_grad).0pr   r   r   
<listcomp>   s
    z@glm_get_params_for_weight_decay_optimization.<locals>.<listcomp>c                 S   s*   g | ]\}}|d ur|j r|dkr|qS Nbiasrp   rr   nrs   r   r   r   rt      
    c                 S   s*   g | ]\}}|d ur|j r|dkr|qS ru   rp   rw   r   r   r   rt      ry   )modules
isinstancer   	LayerNormr   r   extendlist_parametersvaluesitems)moduleweight_decay_paramsno_weight_decay_paramsmodule_r   r   r   ,glm_get_params_for_weight_decay_optimization   s   


r   )r   )r\   r   torch.nnr   torch.nn.functional
functionalrE   megatron_utilr   r   'modelscope.models.nlp.mglm.model.promptr   r$   r   r   Moduler   r^   r   r   r   r   r   <module>   s   
rI