import math

import torch
import torch.nn.functional as F


def fast_gelu(x):
    """Mindspore's fast gelu implementation."""
    return x / (1 + torch.exp(-1.702 * torch.abs(x))) * torch.exp(
        0.851 * (x - torch.abs(x)))


class MLP(torch.nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension. At the end, dropout is also
    applied.
    """

    def __init__(self, hidden_size):
        super(MLP, self).__init__()
        self.hidden_size = hidden_size
        # Project to 4h.
        self.dense_h_to_4h = torch.nn.Linear(self.hidden_size,
                                             4 * self.hidden_size)
        self.activation_func = fast_gelu
        # Project back to h.
        self.dense_4h_to_h = torch.nn.Linear(4 * self.hidden_size,
                                             self.hidden_size)

    def forward(self, hidden_states):
        # [b, s, 4h]
        intermediate_parallel = self.dense_h_to_4h(hidden_states)
        intermediate_parallel = self.activation_func(intermediate_parallel)
        # [b, s, h]
        output = self.dense_4h_to_h(intermediate_parallel)
        return output
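
# Illustrative sketch (not part of the original module): fast_gelu is a
# smooth approximation of GELU, and MLP leaves the outer tensor shape
# unchanged; only the inner 4*h expansion differs. Hypothetical usage:
#
#   mlp = MLP(hidden_size=16)
#   out = mlp(torch.randn(2, 4, 16))   # [batch, seq, hidden]
#   assert out.shape == (2, 4, 16)
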
class SelfAttention(torch.nn.Module):
    """Self-attention layer abstract class.

    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 layer_number,
                 fp16=True,
                 attention_softmax_in_fp32=True):
        super(SelfAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.fp16 = fp16
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.layer_number = max(1, layer_number)
        assert self.hidden_size % self.num_attention_heads == 0
        self.hidden_size_per_attention_head = int(
            self.hidden_size / self.num_attention_heads)

        self.query = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.key = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.value = torch.nn.Linear(self.hidden_size, self.hidden_size)

        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.dense = torch.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self,
                hidden_states,
                attention_mask,
                layer_past=None,
                get_key_value=False,
                prompt_length=None,
                context_length=None):
        # hidden_states: [s, b, h]
        query_layer = self.query(hidden_states)
        key_layer = self.key(hidden_states)
        value_layer = self.value(hidden_states)

        # [s, b, h] --> [s, b, np, hn]
        new_query_layer_shape = query_layer.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        query_layer = query_layer.view(*new_query_layer_shape)

        new_query_layer_shape = key_layer.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        key_layer = key_layer.view(*new_query_layer_shape)

        new_query_layer_shape = value_layer.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        value_layer = value_layer.view(*new_query_layer_shape)

        # Adjust key and value for inference: prepend the cached keys/values.
        if layer_past is not None:
            past_key, past_value = layer_past
            key_layer = torch.cat(
                (past_key.type_as(key_layer), key_layer), dim=0)
            value_layer = torch.cat(
                (past_value.type_as(value_layer), value_layer), dim=0)
        if get_key_value:
            present = (key_layer, value_layer)

        # Raw attention scores. [b, np, sq, sk]
        output_size = (query_layer.size(1), query_layer.size(2),
                       query_layer.size(0), key_layer.size(0))

        # [s, b, np, hn] --> [s, b * np, hn]
        query_layer = query_layer.contiguous().view(
            output_size[2], output_size[0] * output_size[1], -1)
        key_layer = key_layer.contiguous().view(
            output_size[3], output_size[0] * output_size[1], -1)

        # [b * np, sq, sk]
        matmul_result = torch.matmul(
            query_layer.transpose(0, 1),
            key_layer.transpose(0, 1).transpose(1, 2)) / self.norm_factor

        # Change view to [b, np, sq, sk].
        attention_scores = matmul_result.view(*output_size)

        # Update the attention mask for inference.
        if get_key_value:
            with torch.no_grad():
                if layer_past is not None:
                    attention_mask = attention_mask[
                        ..., attention_scores.size(3) - 1,
                        :attention_scores.size(3)].unsqueeze(2)
                else:
                    attention_mask = attention_mask[
                        ..., :attention_scores.size(3),
                        :attention_scores.size(3)]

        if context_length is not None:
            attention_mask = torch.clone(attention_mask)
            attention_mask[:, :, context_length:, :] = True

        # Positions where the boolean mask is True may not be attended to;
        # push their scores far below everything else before the softmax.
        attention_scores = attention_scores.masked_fill(
            attention_mask, -10000.0)

        if self.attention_softmax_in_fp32:
            attention_probs = self.softmax(attention_scores.float()).half()
        else:
            attention_probs = self.softmax(attention_scores)

        # Context layer. [b, np, sq, hn]
        output_size = (value_layer.size(1), value_layer.size(2),
                       query_layer.size(0), value_layer.size(3))

        # Change view [sk, b * np, hn].
        value_layer = value_layer.view(
            value_layer.size(0), output_size[0] * output_size[1], -1)

        # Change view [b * np, sq, sk].
        attention_probs = attention_probs.view(
            output_size[0] * output_size[1], output_size[2], -1)

        # [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs,
                                  value_layer.transpose(0, 1))

        # Change view [b, np, sq, hn].
        context_layer = context_layer.view(*output_size)

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, h]
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size, )
        context_layer = context_layer.view(*new_context_layer_shape)

        output = self.dense(context_layer)

        if get_key_value:
            output = [output, present]

        return output

class TopQuerySelfAttention(torch.nn.Module):
    """Top query self-attention layer abstract class.

    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 layer_number,
                 fp16=True,
                 attention_softmax_in_fp32=True):
        super(TopQuerySelfAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.fp16 = fp16
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.layer_number = max(1, layer_number)
        assert self.hidden_size % self.num_attention_heads == 0
        self.hidden_size_per_attention_head = int(
            self.hidden_size / self.num_attention_heads)

        self.query = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.key = torch.nn.Linear(self.hidden_size, self.hidden_size)
        self.value = torch.nn.Linear(self.hidden_size, self.hidden_size)

        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.dense = torch.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self,
                hidden_states,
                query_hidden_state,
                attention_mask,
                layer_past=None,
                get_key_value=False,
                prompt_length=None,
                context_length=None):
        # The queries come from the top query embedding, not from the
        # hidden states of the previous layer.
        query_layer = self.query(query_hidden_state)
        key_layer = self.key(hidden_states)
        value_layer = self.value(hidden_states)

        # [s, b, h] --> [s, b, np, hn]
        new_query_layer_shape = query_layer.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        query_layer = query_layer.view(*new_query_layer_shape)

        new_query_layer_shape = key_layer.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        key_layer = key_layer.view(*new_query_layer_shape)

        new_query_layer_shape = value_layer.size()[:-1] + (
            self.num_attention_heads, self.hidden_size_per_attention_head)
        value_layer = value_layer.view(*new_query_layer_shape)

        # Adjust key and value for inference: prepend the cached keys/values.
        if layer_past is not None:
            past_key, past_value = layer_past
            key_layer = torch.cat(
                (past_key.type_as(key_layer), key_layer), dim=0)
            value_layer = torch.cat(
                (past_value.type_as(value_layer), value_layer), dim=0)
        if get_key_value:
            present = (key_layer, value_layer)

        # Raw attention scores. [b, np, sq, sk]
        output_size = (query_layer.size(1), query_layer.size(2),
                       query_layer.size(0), key_layer.size(0))

        # [s, b, np, hn] --> [s, b * np, hn]
        query_layer = query_layer.contiguous().view(
            output_size[2], output_size[0] * output_size[1], -1)
        key_layer = key_layer.contiguous().view(
            output_size[3], output_size[0] * output_size[1], -1)

        # [b * np, sq, sk]
        matmul_result = torch.matmul(
            query_layer.transpose(0, 1),
            key_layer.transpose(0, 1).transpose(1, 2)) / self.norm_factor

        # Change view to [b, np, sq, sk].
        attention_scores = matmul_result.view(*output_size)

        # Update the attention mask for inference.
        if get_key_value:
            with torch.no_grad():
                if layer_past is not None:
                    attention_mask = attention_mask[
                        ..., attention_scores.size(3) - 1,
                        :attention_scores.size(3)].unsqueeze(2)
                else:
                    attention_mask = attention_mask[
                        ..., :attention_scores.size(3),
                        :attention_scores.size(3)]

        if context_length is not None:
            attention_mask = torch.clone(attention_mask)
            attention_mask[:, :, context_length:, :] = True

        # Positions where the boolean mask is True may not be attended to.
        attention_scores = attention_scores.masked_fill(
            attention_mask, -10000.0)

        if self.attention_softmax_in_fp32:
            attention_probs = self.softmax(attention_scores.float()).half()
        else:
            attention_probs = self.softmax(attention_scores)

        # Context layer. [b, np, sq, hn]
        output_size = (value_layer.size(1), value_layer.size(2),
                       query_layer.size(0), value_layer.size(3))

        # Change view [sk, b * np, hn].
        value_layer = value_layer.view(
            value_layer.size(0), output_size[0] * output_size[1], -1)

        # Change view [b * np, sq, sk].
        attention_probs = attention_probs.view(
            output_size[0] * output_size[1], output_size[2], -1)

        # [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs,
                                  value_layer.transpose(0, 1))

        # Change view [b, np, sq, hn].
        context_layer = context_layer.view(*output_size)

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, h]
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size, )
        context_layer = context_layer.view(*new_context_layer_shape)

        output = self.dense(context_layer)

        if get_key_value:
            output = [output, present]

        return output

class TransformerLayer(torch.nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [b, s, h] and returns an
    output of the same size.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 layer_number,
                 layernorm_epsilon=1e-5,
                 fp16=True,
                 attention_softmax_in_fp32=True):
        super(TransformerLayer, self).__init__()
        self.hidden_size = hidden_size
        self.layernorm_epsilon = layernorm_epsilon
        self.layer_number = layer_number

        # Layernorm on the input data.
        self.input_layernorm = torch.nn.LayerNorm(
            hidden_size, eps=self.layernorm_epsilon)
        # Self attention.
        self.attention = SelfAttention(hidden_size, num_attention_heads,
                                       layer_number, fp16,
                                       attention_softmax_in_fp32)
        # Layernorm on the attention output.
        self.post_attention_layernorm = torch.nn.LayerNorm(
            self.hidden_size, eps=self.layernorm_epsilon)
        # MLP.
        self.mlp = MLP(self.hidden_size)

    def forward(self,
                hidden_states,
                attention_mask,
                layer_past=None,
                get_key_value=False,
                prompt_length=None,
                context_length=None):
        layernorm_output = self.input_layernorm(hidden_states)
        # Self attention.
        attention_output = self.attention(
            layernorm_output,
            attention_mask,
            layer_past=layer_past,
            get_key_value=get_key_value,
            prompt_length=prompt_length,
            context_length=context_length)
        if get_key_value:
            attention_output, presents = attention_output

        # Residual connection.
        residual = hidden_states
        layernorm_input = attention_output + residual
        # Layernorm after the attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)
        # MLP.
        mlp_output = self.mlp(layernorm_output)
        # Second residual connection.
        residual = layernorm_input
        output = mlp_output + residual

        if get_key_value:
            output = [output, presents]
        return output

class TopQueryLayer(torch.nn.Module):
    """A single top query layer.

    Top query layer takes input with size [b, s, h] and returns an
    output of the same size.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 layer_number,
                 layernorm_epsilon=1e-5):
        super(TopQueryLayer, self).__init__()
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.layernorm_epsilon = layernorm_epsilon
        self.layer_number = layer_number

        # Layernorm on the input data.
        self.input_layernorm = torch.nn.LayerNorm(
            self.hidden_size, eps=self.layernorm_epsilon)
        # Top query self attention.
        self.attention = TopQuerySelfAttention(self.hidden_size,
                                               self.num_attention_heads,
                                               self.layer_number)
        # Layernorm on the attention output.
        self.post_attention_layernorm = torch.nn.LayerNorm(
            self.hidden_size, eps=self.layernorm_epsilon)
        # MLP.
        self.mlp = MLP(self.hidden_size)

    def forward(self,
                hidden_states,
                query_hidden_state,
                attention_mask,
                layer_past=None,
                get_key_value=False,
                prompt_length=None,
                context_length=None):
        assert query_hidden_state is not None

        layernorm_output = self.input_layernorm(hidden_states)
        # Top query self attention.
        attention_output = self.attention(
            layernorm_output,
            query_hidden_state,
            attention_mask,
            layer_past=layer_past,
            get_key_value=get_key_value,
            prompt_length=prompt_length,
            context_length=context_length)
        if get_key_value:
            attention_output, presents = attention_output

        # Residual connection.
        residual = hidden_states
        layernorm_input = attention_output + residual
        # Layernorm after the attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)
        # MLP.
        mlp_output = self.mlp(layernorm_output)
        # Second residual connection.
        residual = layernorm_input
        output = mlp_output + residual

        if get_key_value:
            output = [output, presents]
        return output


class Transformer(torch.nn.Module):
    """Transformer class."""

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 num_layers,
                 layernorm_epsilon=1e-5):
        super(Transformer, self).__init__()
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.layernorm_epsilon = layernorm_epsilon

        # Number of layers.
        self.num_layers = num_layers
        self.num_unique_layers = None
        assert self.num_unique_layers is None
        if self.num_unique_layers is None:
            self.num_unique_layers = self.num_layers
        assert self.num_layers % self.num_unique_layers == 0, \
            'number of layers should be divisible by number of unique layers'

        def build_layer(layer_number):
            return TransformerLayer(self.hidden_size,
                                    self.num_attention_heads, layer_number)

        self.layers = torch.nn.ModuleList(
            [build_layer(i + 1) for i in range(self.num_unique_layers)])

        self.topQueryLayer = TopQueryLayer(self.hidden_size,
                                           self.num_attention_heads,
                                           self.num_unique_layers)

        self.final_layernorm = torch.nn.LayerNorm(
            self.hidden_size, eps=self.layernorm_epsilon)

    def _get_layer_index(self, layer_number):
        return layer_number % self.num_unique_layers

    def _get_layer(self, layer_number):
        return self.layers[self._get_layer_index(layer_number)]

    def forward(self,
                hidden_states,
                query_hidden_state,
                attention_mask,
                layer_past=None,
                get_key_value=False,
                prompt_length=None,
                context_length=None):
        # Data format change: [b, s, h] --> [s, b, h].
        hidden_states = hidden_states.transpose(0, 1).contiguous()
        query_hidden_state = query_hidden_state.transpose(0, 1).contiguous()

        if get_key_value:
            presents = []
        for index in range(self.num_layers):
            layer = self._get_layer(index)
            past = None
            if layer_past is not None:
                past = layer_past[index]
            hidden_states = layer(
                hidden_states,
                attention_mask,
                layer_past=past,
                get_key_value=get_key_value,
                prompt_length=prompt_length,
                context_length=context_length)
            if get_key_value:
                hidden_states, present = hidden_states
                presents.append(present)

        hidden_states_ = self.final_layernorm(hidden_states)

        # Top query layer.
        past = None
        if layer_past is not None:
            past = layer_past[self.num_layers]
        hidden_states_ = self.topQueryLayer(
            hidden_states_,
            query_hidden_state,
            attention_mask,
            layer_past=past,
            get_key_value=get_key_value,
            prompt_length=prompt_length,
            context_length=context_length)
        if get_key_value:
            hidden_states_, present = hidden_states_
            presents.append(present)

        # Back to the original data format: [s, b, h] --> [b, s, h].
        output = hidden_states_.transpose(0, 1).contiguous()

        if get_key_value:
            output = [output, presents]
        return output

    def state_dict_for_save_checkpoint(self,
                                       destination=None,
                                       prefix='',
                                       keep_vars=False):
        return self.state_dict(destination, prefix, keep_vars)

class Embedding(torch.nn.Module):
    """Language model embeddings.

    Arguments:
        hidden_size: hidden size
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
    """

    def __init__(self, hidden_size, vocab_size, max_sequence_length):
        super(Embedding, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length

        # Word embeddings.
        self.word_embeddings = torch.nn.Embedding(self.vocab_size,
                                                  self.hidden_size)
        self._word_embeddings_key = 'word_embeddings'

        # Position embeddings.
        self.position_embeddings = torch.nn.Embedding(
            self.max_sequence_length, self.hidden_size)
        self.position_embeddings = self.position_embeddings.half()
        self._position_embeddings_key = 'position_embeddings'

    def forward(self, input_ids, position_ids):
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = words_embeddings + position_embeddings
        return embeddings

    def state_dict_for_save_checkpoint(self,
                                       destination=None,
                                       prefix='',
                                       keep_vars=False):
        """For easy load."""
        state_dict_ = {}
        state_dict_[self._word_embeddings_key] = \
            self.word_embeddings.state_dict(destination, prefix, keep_vars)
        state_dict_[self._position_embeddings_key] = \
            self.position_embeddings.state_dict(destination, prefix,
                                                keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
        # Word embedding.
        if self._word_embeddings_key in state_dict:
            state_dict_ = state_dict[self._word_embeddings_key]
        else:
            # For backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if 'word_embeddings' in key:
                    state_dict_[key.split('word_embeddings.')[1]] = \
                        state_dict[key]
        state_dict_['weight'] = state_dict_['weight'][:self.vocab_size]
        self.word_embeddings.load_state_dict(state_dict_, strict=strict)

        # Position embedding.
        if self._position_embeddings_key in state_dict:
            state_dict_ = state_dict[self._position_embeddings_key]
        else:
            # For backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if 'position_embeddings' in key:
                    state_dict_[key.split('position_embeddings.')[1]] = \
                        state_dict[key]
        self.position_embeddings.load_state_dict(state_dict_, strict=strict)


class QueryEmbedding(torch.nn.Module):
    """Language model embeddings.

    Arguments:
        hidden_size: hidden size
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
    """

    def __init__(self, hidden_size, vocab_size, max_sequence_length):
        super(QueryEmbedding, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length

        # Top query embeddings.
        self.top_query_embeddings = torch.nn.Embedding(
            self.max_sequence_length, self.hidden_size)
        self.top_query_embeddings = self.top_query_embeddings.half()
        self._top_query_embeddings_key = 'top_query_embeddings'

    def forward(self, position_ids):
        top_query_embeddings = self.top_query_embeddings(position_ids)
        return top_query_embeddings

    def state_dict_for_save_checkpoint(self,
                                       destination=None,
                                       prefix='',
                                       keep_vars=False):
        """For easy load."""
        state_dict_ = {}
        state_dict_[self._top_query_embeddings_key] = \
            self.top_query_embeddings.state_dict(destination, prefix,
                                                 keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
        # Top query embedding.
        if self._top_query_embeddings_key in state_dict:
            state_dict_ = state_dict[self._top_query_embeddings_key]
        else:
            # For backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if 'top_query_embeddings' in key:
                    state_dict_[key.split('top_query_embeddings.')[1]] = \
                        state_dict[key]
        self.top_query_embeddings.load_state_dict(state_dict_, strict=strict)

class TransformerLanguageModel(torch.nn.Module):
    """Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        attention_mask_func: a function that takes `unmasked-attention-scores`
            with size [b, np, s, s] and an `attention-mask` and will apply
            the masking. The function should return a masked score of the
            same size [b, np, s, s].
          masked-attention-scores = attention_mask_func(
                                     unmasked-attention-scores, attention-mask)
        vocab_size: vocabulary size
        max_sequence_length: maximum size of sequence. This
                             is used for positional embedding
    """

    def __init__(self, hidden_size, num_layers, num_attention_heads,
                 padded_vocab_size, max_position_embeddings):
        super(TransformerLanguageModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_attention_heads = num_attention_heads
        self.padded_vocab_size = padded_vocab_size
        self.max_position_embeddings = max_position_embeddings

        # Embeddings.
        self.embedding = Embedding(self.hidden_size, self.padded_vocab_size,
                                   self.max_position_embeddings)
        self._embedding_key = 'embedding'

        # Query embeddings.
        self.topQueryEmbedding = QueryEmbedding(self.hidden_size,
                                                self.padded_vocab_size,
                                                self.max_position_embeddings)
        self._topQueryEmbedding_key = 'topQueryEmbedding'

        # Transformer.
        self.transformer = Transformer(self.hidden_size,
                                       self.num_attention_heads,
                                       self.num_layers)
        self._transformer_key = 'transformer'

    def forward(self,
                input_ids,
                position_ids,
                attention_mask,
                layer_past=None,
                get_key_value=False,
                prompt_length=None,
                context_length=None):
        # Embeddings.
        embedding_output = self.embedding(input_ids, position_ids)
        query_position_ids = position_ids
        queryEmbedding_out = self.topQueryEmbedding(query_position_ids)

        # Transformer.
        transformer_output = self.transformer(
            embedding_output,
            queryEmbedding_out,
            attention_mask,
            layer_past=layer_past,
            get_key_value=get_key_value,
            prompt_length=prompt_length,
            context_length=context_length)
        return transformer_output

    def state_dict_for_save_checkpoint(self,
                                       destination=None,
                                       prefix='',
                                       keep_vars=False):
        """For easy load."""
        state_dict_ = {}
        state_dict_[self._embedding_key] = \
            self.embedding.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        state_dict_[self._topQueryEmbedding_key] = \
            self.topQueryEmbedding.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        state_dict_[self._transformer_key] = \
            self.transformer.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
        # Embedding.
        if self._embedding_key in state_dict:
            state_dict_ = state_dict[self._embedding_key]
        else:
            # For backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if '_embeddings' in key:
                    state_dict_[key] = state_dict[key]
        self.embedding.load_state_dict(state_dict_, strict=strict)

        # Top query embedding.
        if self._topQueryEmbedding_key in state_dict:
            state_dict_ = state_dict[self._topQueryEmbedding_key]
        else:
            # For backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if '_embeddings' in key:
                    state_dict_[key] = state_dict[key]
        self.topQueryEmbedding.load_state_dict(state_dict_, strict=strict)

        # Transformer.
        if self._transformer_key in state_dict:
            state_dict_ = state_dict[self._transformer_key]
        else:
            # For backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if 'transformer.' in key:
                    state_dict_[key.split('transformer.')[1]] = \
                        state_dict[key]
        self.transformer.load_state_dict(state_dict_, strict=strict)


class CodeGeeXModel(torch.nn.Module):
    """CodeGeeX: A Multilingual Code Generation Model."""

    def __init__(self, hidden_size, num_layers, num_attention_heads,
                 padded_vocab_size, max_position_embeddings):
        super(CodeGeeXModel, self).__init__()
        self.language_model = TransformerLanguageModel(
            hidden_size, num_layers, num_attention_heads, padded_vocab_size,
            max_position_embeddings)
        self._language_model_key = 'language_model'

    def forward(self,
                input_ids,
                position_ids,
                attention_mask,
                layer_past=None,
                get_key_value=False,
                prompt_length=None,
                context_length=None):
        # Language model.
        lm_output = self.language_model(
            input_ids,
            position_ids,
            attention_mask,
            layer_past=layer_past,
            get_key_value=get_key_value,
            prompt_length=prompt_length,
            context_length=context_length)
        if get_key_value:
            lm_output, presents = lm_output

        # Project back onto the (tied) word embedding matrix to get logits.
        output = F.linear(
            lm_output,
            self.language_model.embedding.word_embeddings.weight.half())

        if get_key_value:
            output = [output, presents]
        return output

    def state_dict_for_save_checkpoint(self,
                                       destination=None,
                                       prefix='',
                                       keep_vars=False):
        """For easy load."""
        state_dict_ = {}
        state_dict_[self._language_model_key] = \
            self.language_model.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
        if self._language_model_key in state_dict:
            state_dict = state_dict[self._language_model_key]
        self.language_model.load_state_dict(state_dict, strict=strict)
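
# Illustrative end-to-end sketch (not part of the original module); all
# hyperparameters below are hypothetical, chosen small for clarity. The
# model is intended to run in fp16 (note the .half() calls above), so a
# real invocation would typically move it to GPU and call .half():
#
#   model = CodeGeeXModel(hidden_size=64, num_layers=2,
#                         num_attention_heads=4, padded_vocab_size=128,
#                         max_position_embeddings=32).cuda().half()
#   s = 8
#   input_ids = torch.randint(0, 128, (1, s)).cuda()
#   position_ids = torch.arange(s).unsqueeze(0).cuda()
#   # Boolean causal mask: True marks positions that must not be attended.
#   attention_mask = torch.triu(
#       torch.ones(1, 1, s, s, dtype=torch.bool), diagonal=1).cuda()
#   logits = model(input_ids, position_ids, attention_mask)  # [1, s, 128]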