o
    ߥi                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlmZm	Z	m
Z
mZmZmZmZ ddlZddlm  mZ ddlZddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZm Z  dd	l!m"Z"m#Z#m$Z$ dd
l%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ej<dkrej=>d ej=?d ej=@d ej=Ad e5B Z4dZCdZDdgZEG dd deZFdd ZGG dd dejjHZIejJjKd d! ZLd"d# ZMG d$d% d%ejjHZNd&d' ZOejJjKd(d) ZP			d>d*d+ZQG d,d- d-ejjHZRG d.d/ d/ejjHZSG d0d1 d1ejjHZTG d2d3 d3ejjHZUG d4d5 d5e0e&ZVd6ZWd7ZXe)d8eWG d9d: d:eVZYe.jZe7j[e,j\d;G d<d= d=eVZ]dS )?z PyTorch ChatGLM model.     N)AnyCallableDictListOptionalTupleUnion)nn)CrossEntropyLoss	LayerNorm)	skip_init)LogitsProcessor)GenerationConfigLogitsProcessorListModelOutputStoppingCriteriaList)BaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsCausalLMOutputWithPast)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)Models)MODELSModel
TorchModel)
OutputKeys)logger)Tasks   )ChatGLMConfig)ChatGLMTokenizerdarwinFTzTHUDM/ChatGLM-6BChatGLM6BConfigzTHUDM/chatglm-6bc                   @   s(   e Zd ZdejdejdejfddZdS )InvalidScoreLogitsProcessor	input_idsscoresreturnc                 C   s0   t | st | r|  d|d< |S )Ng     j@).   )torchisnananyisinfzero_)selfr&   r'    r0   a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/chatglm/text_generation.py__call__:   s   z$InvalidScoreLogitsProcessor.__call__N)__name__
__module____qualname__r*   
LongTensorFloatTensorr2   r0   r0   r0   r1   r%   8   s    r%   c                 C   s  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6t||	D ]\}
}|
d}
tdd	 |
D rzt	d
d|
  q\| }|
D ]|}|d|r|d|}n|g}|d dks|d dkrt|d}nH|d dks|d dkrt|d}n6|d dkrt|d}n*|d dkrt|d}nz	t||d }W n ty   t	d
d|
  Y q~w t|dkrt|d }|| }q~|dd dkr
t|d}n
|dkr||}z|j|jks)J d|j d|j dW n tyC } z| j|j|jf7  _ d}~ww t	d|
  t||_q\| S )z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape /c                 s   s    | ]}|d v V  qdS ))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepNr0   ).0nr0   r0   r1   	<genexpr>_   s    z0load_tf_weights_in_chatglm_6b.<locals>.<genexpr>z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r    i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorr   errorospathabspathinfotrainlist_variablesload_variableappendzipsplitr,   join	fullmatchgetattrAttributeErrorlenint	transposeshapeAssertionErrorargsr*   
from_numpydata)modelconfigtf_checkpoint_pathrL   nptftf_path	init_varsnamesarraysnamerb   arraypointerm_namescope_namesnumer0   r0   r1   load_tf_weights_in_chatglm_6bB   s   



rw   c                       s0   e Zd ZdZ fddZdejfddZ  ZS )PrefixEncoderz
    The torch.nn model to encode the prefix
    Input shape: (batch-size, prefix-length)
    Output shape: (batch-size, prefix-length, 2*layers*hidden)
    c              	      s   t    |j| _| jr7tj|j|j| _tj	tj
|j|jtj tj
|j|j|j d | _d S tj|j|j|j d | _d S )NrJ   )super__init__prefix_projectionr*   r	   	Embeddingpre_seq_lenhidden_size	embedding
SequentialLinearTanh
num_layerstransr/   rh   	__class__r0   r1   rz      s    




zPrefixEncoder.__init__prefixc                 C   s,   | j r| |}| |}|S | |}|S N)r{   r   r   )r/   r   prefix_tokenspast_key_valuesr0   r0   r1   forward   s   


zPrefixEncoder.forward)	r3   r4   r5   __doc__rz   r*   Tensorr   __classcell__r0   r0   r   r1   rx      s    rx   c                 C   s*   d|  dt d|  dd|  |      S )zOpenAI's gelu implementation.      ?      ?gQ63E?gHm?)r*   tanhxr0   r0   r1   	gelu_impl   s
   r   c                 C   s   t | S r   )r   r   r0   r0   r1   gelu   s   r   c                       sD   e Zd Zdejdf fdd	Zdd Zdd	d
Z fddZ  Z	S )RotaryEmbedding'  Fc                    sx   t    d|td|d |   }| }|| _|r(tj|| _	d | _
n| d| d | _
d | _d | _|| _d S )Nr   r   rJ   inv_freq)ry   rz   r*   arangefloathalf	learnabler	   	Parameterr   max_seq_len_cachedregister_buffer
cos_cached
sin_cached	precision)r/   dimbaser   r   r   r   r0   r1   rz      s   

zRotaryEmbedding.__init__c                 C   s   d S r   r0   )r/   
state_dictr   local_metadatastrictmissing_keysunexpected_keys
error_msgsr0   r0   r1   _load_from_state_dict      z%RotaryEmbedding._load_from_state_dictr    Nc           	      C   s  |d u r	|j | }| jd u s|| jkr{| jrd n|| _tj||j| jjd}td|| j}tj	||fdd
|j}| jtjkrE| }| d d d d d f }| d d d d d f }| jtjkrm| }| }| jrt||fS ||| _| _| jd |df | jd |df fS )N)devicedtypezi,j->ijr   .)rb   r   r   r*   r   r   r   r   einsumcattor   bfloat16r   cossinr   r   )	r/   r   seq_dimseq_lentfreqsembr   r   r0   r0   r1   r      s*   


$zRotaryEmbedding.forwardc                    s8   | j d ur|| j | _ | jd ur|| j| _t |S r   )r   r   ry   _apply)r/   fnr   r0   r1   r      s
   

zRotaryEmbedding._apply)r    N)
r3   r4   r5   r*   r   rz   r   r   r   r   r0   r0   r   r1   r      s
    
r   c                 C   sP   | dd | j d d f | d| j d d d f }}tj| |f|jd dS )N.r   rJ   r    r   )rb   r*   r   ndim)r   x1x2r0   r0   r1   rotate_half   s
   6r   c                 C   sd   t ||ddt ||dd}}| | t| |  || t||  } }| |fS )Nr    rJ   )Fr   squeeze	unsqueezer   )qkr   r   position_idr0   r0   r1   apply_rotary_pos_emb_index   s   
r   c
                 C   sV  |d ur|d |d }
}t j|
|fdd}t j||fdd}|j\}}}}|	r-||f}nd }t|d }|r@|t||  }|d|d|d|df}||d |d |d  d}||d |d |d  d}t jddd|j	|j
d}t j||dd|dddddd	d
}|j| }| jr|| j_| || }n#|dk s||d |j	}| }|| }tj|dd}||}|d|d|d|df}||d|d |d  d}||d |d  |d d}t ||dd}|j| }|dddd }| d d |f }|j| }|||f}|S )Nr   r    r   rJ   r      r   r   g        r   )rE   alpha     )r*   r   rb   r   mathsqrtsizeviewzerosr   r   baddbmmra   scale_mask_softmaxscale
contiguousallmasked_fill_r   softmaxtypebmmpermute)r/   query_layer	key_layervalue_layerattention_maskhidden_size_per_partitionlayer_id
layer_pastscaling_attention_score	use_cachepast_key
past_valuer   bnhr~   presentquery_key_layer_scaling_coeffoutput_sizematmul_resultattention_scoresattention_probsr   context_layernew_context_layer_shapeoutputsr0   r0   r1   attention_fn   s   



	




r   c                       sz   e Zd Zddejdf fdd	Zedd Z	ddd	Z			dd
ej	dej	de
eej	ej	f  dedef
ddZ  ZS )SelfAttentionNTc                    s   t t|   || _|| _|| _|| _|| _|| _t	|r$| j| jd  n| j| j dt
jdd| _d | _|d u r>|| | _n|| _|| j | _tt
jj|d| j ||d| _tt
jj| j|||d| _d S )NrJ   r   F)r   r   r   r   rF   r   )ry   r   rz   r   r~   r   num_attention_heads!num_attention_heads_per_partitionposition_encoding_2dr   r*   r   
rotary_embr   hidden_size_per_attention_headinner_hidden_sizer   r	   r   query_key_valuedense)r/   r~   r   r   r   rF   params_dtyper   r   r0   r1   rz   w  sH   
	zSelfAttention.__init__c                 C   s   |  |d | S )Nr   )r   )r   r   r0   r0   r1   attention_mask_func  s   z!SelfAttention.attention_mask_funcFc                 C   sF   |  d }| | | }tj|||d}|r!tdd |D S |S )a#  Split a tensor along its last dimension.
        Arguments:
            tensor: input tensor.
            num_partitions: number of partitions to split the tensor
            contiguous_split_chunks: If True, make each chunk contiguous
                                    in memory.
        r    r   c                 s   s    | ]}|  V  qd S r   )r   )r>   chunkr0   r0   r1   r@         z<SelfAttention.split_tensor_along_last_dim.<locals>.<genexpr>)r   r   r*   rZ   tuple)r/   tensornum_partitionscontiguous_split_chunkslast_dimlast_dim_sizetensor_listr0   r0   r1   split_tensor_along_last_dim  s   z)SelfAttention.split_tensor_along_last_dimhidden_statesr   r   r   output_attentionsc                 C   s  |  |}| dd | jd| j f }	|j|	 }| |d\}
}}| jr|
jd|
jd d\}}|jd|jd d\}}| j	||
 d d\}}|dddddf dd |dddddf dd }}t|||||\}}t|||||\}}tj||g|jd d}
tj||g|jd d}n|dd}| j	||
 d d\}}t|
||||\}
}t| |
|||| j|||d		\}}}| |}||f}|r||f7 }|S )
q
        hidden_states: [seq_len, batch, hidden_size]
        attention_mask: [(1, 1), seq_len, seq_len]
        Nr   r   rJ   r    r   )r   r   )	r/   r   r   r   r   r   r   r   r   )r   r   r   r   r   r
  r   r  r   r   maxra   r   r   r*   concatr   r   r   )r/   r  position_idsr   r   r   r   r  mixed_raw_layernew_tensor_shaper   r   r   q1q2k1k2r   r   block_position_idsr   r   r   outputr   r0   r0   r1   r     sZ   


  




zSelfAttention.forwardFNFF)r3   r4   r5   r*   r   rz   staticmethodr   r
  r   r   r   boolr   r   r0   r0   r   r1   r   u  s0    3

r   c                       s$   e Zd Z fddZdd Z  ZS )GEGLUc                    s   t    tj| _d S r   )ry   rz   r   r   activation_fnr/   r   r0   r1   rz     s   
zGEGLU.__init__c                 C   s&   |j d|jd d\}}|| | S )NrJ   r    r   )r  r   r  )r/   r   r   r   r0   r0   r1   r     s   zGEGLU.forward)r3   r4   r5   rz   r   r   r0   r0   r   r1   r    s    r  c                       s2   e Zd Zdddeejf fdd	Zdd Z  ZS )GLUNTc                    sr   t t|   || _|| _|| _|d u rd| }|| _ttj	j
| j| j||d| _ttj	j
| j| j||d| _d S )N   r   )ry   r   rz   r   activation_funcr~   r   r   r*   r	   r   dense_h_to_4hdense_4h_to_h)r/   r~   r   r   rF   r"  r   r   r0   r1   rz     s*   zGLU.__init__c                 C   s"   |  |}| |}| |}|S )z>
        hidden_states: [seq_len, batch, hidden_size]
        )r#  r"  r$  )r/   r  intermediate_parallelr  r0   r0   r1   r   :  s   


zGLU.forward)	r3   r4   r5   r   r*   r   rz   r   r   r0   r0   r   r1   r     s     r   c                       sh   e Zd Zddedejddf fdd	Z			ddejdejd	ee	ejejf  d
e
de
f
ddZ  ZS )GLMBlockNT   c              	      sn   t t|   || _|||d| _|| _t||||||	| jd| _|||d| _|
| _	t
|||||	d| _d S )Neps)r   rF   r   r   )r   rF   r   r   )ry   r&  rz   r   input_layernormr   r   	attentionpost_attention_layernormr   r   mlp)r/   r~   r   layernorm_epsilonr   r   r   	layernormuse_biasr   r   r   r   r0   r1   rz   K  s0   
zGLMBlock.__init__Fr  r   r   r   r  c              	   C   s   |  |}| j|||||||d}	|	d }
|	dd }d| j d }|| |
 }| |}| |}|| | }|rA|f| }|S |f|dd  }|S )r  )r   r   r   r   r  r   r    NrJ   r   )r*  r+  r   r,  r-  )r/   r  r  r   r   r   r   r  attention_inputattention_outputsattention_outputr   r   	mlp_input
mlp_outputr  r0   r0   r1   r   z  s,   
	


zGLMBlock.forwardr  )r3   r4   r5   r   r*   r   rz   r   r   r   r  r   r   r0   r0   r   r1   r&  I  s.    5r&  c                       sr   e Zd ZdZdZdZeZdZdgZ	 fddZ
dejfd	d
Zdd ZdddZdddZe fddZ  ZS )ChatGLMPreTrainedModelz
    An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    FTtransformerr&  c                    s*   t  j|jfi | t t| | d S r   )ry   rz   name_or_pathr   )r/   rh   kwargsr   r0   r1   rz     s   zChatGLMPreTrainedModel.__init__modulec                 C   s   dS )zInitialize the weights.Nr0   )r/   r:  r0   r0   r1   _init_weights  r   z$ChatGLMPreTrainedModel._init_weightsc           	         sz   |j \}} fdd|D }tj|||f|d}|  t|D ]\}}d||d d d |f< q |d |dk  }|S )Nc                       g | ]}|   jjqS r0   tolistindexrh   bos_token_idr>   seqr  r0   r1   
<listcomp>      z4ChatGLMPreTrainedModel.get_masks.<locals>.<listcomp>r   r    r   )rb   r*   onestril_	enumerate
unsqueeze_r  )	r/   r&   r   
batch_size
seq_lengthcontext_lengthsr   icontext_lengthr0   r  r1   	get_masks  s   


z ChatGLMPreTrainedModel.get_masksc                    s   |j \}fdd|D }jrPtjtj dd|d}t|D ]\}}	|| |||	d f< q% fdd|D }
tj|
dd}
tj||
fdd}|S tjtj dd|d}|sst|D ]\}}	|| ||	d < qf|S )Nc                    r<  r0   r=  rA  r  r0   r1   rC    rD  z;ChatGLMPreTrainedModel.get_position_ids.<locals>.<listcomp>r   r   r    c              
      s>   g | ]}t t j|t j d t j| t j d d fqS )r   r    )r*   r   r   longr   )r>   rN  )r   rK  r0   r1   rC    s"    
r   )	rb   r   r*   r   rP  r   repeatrH  stack)r/   r&   mask_positionsr   gmaskrJ  rL  r  rM  rN  r  r0   )r   r/   rK  r1   get_position_ids  s>   




z'ChatGLMPreTrainedModel.get_position_idsc                 C   s   t |tr
||_d S d S r   )
isinstanceChatGLMModelgradient_checkpointing)r/   r:  valuer0   r0   r1   _set_gradient_checkpointing  s   

z2ChatGLMPreTrainedModel._set_gradient_checkpointingc                    s<   | dd}| dd tt| jdd|i|}||_|S )a'  Instantiate the model.

        Args:
            kwargs: Input args.
                    model_dir: The model dir used to load the checkpoint and the label information.

        Returns:
            The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
        	model_dirNcfgpretrained_model_name_or_pathr0   )popry   r   from_pretrainedr[  )clsr9  r[  rg   r   r0   r1   _instantiate  s   z#ChatGLMPreTrainedModel._instantiater  )r3   r4   r5   r   is_parallelizablesupports_gradient_checkpointingr!   config_classbase_model_prefix_no_split_modulesrz   r	   Moduler;  rO  rU  rZ  classmethodra  r   r0   r0   r   r1   r6    s    

$r6  aM  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
    usage and behavior.

    Parameters:
        config ([`~ChatGLM6BConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a:
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`ChatGLM6BTokenizer`].
            See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range `[0, config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zdThe bare ChatGLM-6B Model transformer outputting raw hidden-states without any specific head on top.c                       s   e Zd ZdZdef fddZdd Zdejfdd	Z	ej
fd
dZeedeeeed									ddeej deej deej deeeejejf df  deej dee dee dee dee deeejdf ef fddZ  ZS )rW  a  

    The model can behave as an encoder (with only self-attention) as well
    as a decoder, in which case a layer of cross-attention is added between
    the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani,
    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the
    `is_decoder` argument of the configuration set to `True`.
    To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder`
    argument and `add_cross_attention` set to `True`; an
    `encoder_hidden_states` is then expected as an input to the forward pass.
    rh   c                    s$  t  | |j_|j_tj_|j_|j_|j	_	|j
_
|j_jj _|j_|j_|j_ttjjjjjd_d_fdd tj fddtj	D _tjj
d_jd ur D ]}d|_qstj _t|_ tj!d_"d S d S )	N)num_embeddingsembedding_dimr   Fc                    s*   t  j j j|  j jtd j jd
S )NT)r   r   r/  r0  r   r   )	r&  r~   r   r.  r   r   r   r   r   )r   r  r0   r1   	get_layerw  s   z(ChatGLMModel.__init__.<locals>.get_layerc                    s   g | ]} |qS r0   r0   )r>   r   )rk  r0   r1   rC    s    z)ChatGLMModel.__init__.<locals>.<listcomp>r(  g?)#ry   rz   max_sequence_lengthr~   r*   r   r   r   
vocab_sizer   r.  r   r   r   r}   r{   r   r	   r|   word_embeddingsrX  
ModuleListrangelayersr   final_layernorm
parametersrequires_gradr   rP  r   rx   prefix_encoderDropoutdropout)r/   rh   paramr   )rk  r/   r1   rz   _  sD   

zChatGLMModel.__init__c                 C      | j S r   rn  r  r0   r0   r1   get_input_embeddings     z!ChatGLMModel.get_input_embeddingsnew_embeddingsc                 C   
   || _ d S r   rz  r/   r}  r0   r0   r1   set_input_embeddings     
z!ChatGLMModel.set_input_embeddingsc                 C   sp   | j d|d|}| ||}||| j| jd | j	| j
| j	 }| |}|g dd}|S )Nr   r   rJ   )rJ   r    r   r   r!  )r   r   expandr   ru  r   r   r}   r   r   r~   rw  r   rZ   )r/   rJ  r   r   r   r   r0   r0   r1   
get_prompt  s   

zChatGLMModel.get_promptzbatch_size, sequence_length)
checkpointoutput_typerd  Nr&   r  r   r   .inputs_embedsr   r  output_hidden_statesreturn_dictr(   c
                    s<  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	| jr2| jr2|r2d}|d ur>|d ur>td|d urL|jd d \}
}n|d ur[|jd d \}
}}ntd|d u rh| 	|}|d u r| j
d ur| j|jd |j|jd}n
td gt| j }|d u r| j||jd}|d u r| j j| j j}}||v r|n| ||v rdnd} fd	d
|D }| j|||j|d}| j
d ur|d urt|
d|d| j
|j}|dk  }tj||fdd}|dd}|rdnd }|rdnd }|rdnd }|d u rtjdd|jd }n||j}t| jD ]V\}}|r.||f }|| }| jrL| jrLtjj||||t ||||}n||||t ||||d}|d }|rh||d f }|rw|||rrdnd f }q"| !|}|r||f }|	stdd ||||fD S t"||||dS )NFzDYou cannot specify both input_ids and inputs_embeds at the same timerJ   z5You have to specify either input_ids or inputs_embedsr   )rJ  r   r   rE  Tc                    s   g | ]	}|   qS r0   )r>  r?  rA  
mask_tokenr0   r1   rC        z(ChatGLMModel.forward.<locals>.<listcomp>)rS  r   rT  r    r   r   r   r   r0   )r  r   r   r   r   r  c                 s   s    | ]	}|d ur|V  qd S r   r0   )r>   vr0   r0   r1   r@   2  s    z'ChatGLMModel.forward.<locals>.<genexpr>)last_hidden_stater   r  
attentions)#rh   r  r  r   use_return_dictrX  training
ValueErrorrb   rn  r}   r  r   r   r  r_   rq  rO  mask_token_idgmask_token_idrU  r*   rF  r   r   r  r   ra   r   rH  utilsr  r  rr  r   )r/   r&   r  r   r   r  r   r  r  r  rJ  rK  _MASKgMASK	use_gmaskrS  prefix_attention_maskr  presentsall_self_attentionsall_hidden_statesrM  layerr   	layer_retr0   r  r1   r     s   





	


zChatGLMModel.forward)	NNNNNNNNN)r3   r4   r5   r   r!   rz   r{  r*   r   r  r   r  r   CHATGLM_6B_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r6   r   r  r   r   r   r   r0   r0   r   r1   rW  K  s\    9	
rW  )module_namec                       sZ  e Zd Zdef fddZdd Zdd Z		d>d	ed
ee	e
f dededee	e
f f
ddZ				d?dejdeej deej deej deej defddZ										d@deej deej deej deeej  deej deej dee dee dee dee fddZedeeejejf d f d!ejdeeejejf d f fd"d#Zd$d% Ze 		&	'	(	)	*	dAd+e	d,eee	e	f  d-efd.d/Ze 		&	(	)	*	dBd+e	d,eee	e	f  d-efd0d1Ze 				d?d2ee d3ee d4ee  d5ee!eejgee f  fd6d7Z"dCd8efd9d:Z#d;edefd<d=Z$  Z%S )DChatGLMForConditionalGenerationrh   c                    sz   t  | |j| _|j| _t|| _ttj|j	|j
dtjd| _|| _d| _| jjr4| j| jjdd t|j| _d S )NFr   T)
empty_init)ry   rz   rl  r   rW  r7  r   r	   r   r~   rm  r*   r   lm_headrh   	quantizedquantization_bitquantizer"   r_  r8  	tokenizerr   r   r0   r1   rz   A  s    
z(ChatGLMForConditionalGeneration.__init__c                 C   ry  r   r  r  r0   r0   r1   get_output_embeddings^  r|  z5ChatGLMForConditionalGeneration.get_output_embeddingsc                 C   r~  r   r  r  r0   r0   r1   set_output_embeddingsa  r  z5ChatGLMForConditionalGeneration.set_output_embeddingsFr   model_kwargsis_encoder_decoderstandardize_cache_formatr(   c           	      C   s   | j ||d|d< d|v rO|d }|d urO|jtjkrOtj||g |jd d dR gdd}|d d d d dd f  }d|d	< tj||gd
d|d< d|v r{|d }|ddd f  }|d d dd d f  d7  < tj||gdd|d< |S )N)r  r   r   r   r    r   r   F).r   rJ   r  .)_extract_past_from_model_outputr   r*   r  r   new_onesrb   clone)	r/   r   r  r  r  r   new_attention_maskr  new_position_idr0   r0   r1   #_update_model_kwargs_for_generationd  s2   


zCChatGLMForConditionalGeneration._update_model_kwargs_for_generationNr&   pastr   r   r  c                    s  |j \}jjjj}}	|	|v r|	n| |	|v rdnd}
| } fdd|D }|d us3|d ur|d d df d}|d urV|jtjkrV|d d d d dd f }nd }|d ure|ddd f }n6fdd|D }j	rtj
fddt||D tj|jd	d}ntj
d
d |D tj|jd	d}|d u r|}||||dS |d ur|jtjkrd }|d u rj||jd}|d u rΈj||j||
d}||||dS )NTFc                    s   g | ]}|  qS r0   )r?  rA  r  r0   r1   rC    s    zQChatGLMForConditionalGeneration.prepare_inputs_for_generation.<locals>.<listcomp>r   .c                    s   g | ]	}|  jjqS r0   )r?  rh   r@  rA  r  r0   r1   rC    r  c                    s   g | ]
\}}| | gqS r0   r0   )r>   mask_positionrN  )rK  r0   r1   rC    s    r   c                 S   s   g | ]}|qS r0   r0   )r>   r  r0   r0   r1   rC    s    )r&   r   r  r   rE  )r   rS  rT  )rb   rh   r  r  r>  r   r   r*   r  r   r  rY   rP  r   rO  rU  )r/   r&   r  r   r   r  r9  rJ  r  r  r  seqsrS  
last_tokenrL  r0   )r  r/   rK  r1   prepare_inputs_for_generation  sx   


z=ChatGLMForConditionalGeneration.prepare_inputs_for_generationr  labelsr   r  r  r  c                 C   s:  |d ur|n| j j}|
d ur|
n| j j}
| j||||||||	|
d	}|d }| |ddd }d }|d ur{|tj	}|dd dd d f  }|ddd f  }t
dd}||j}||d|d|d}||j}||j}|
s|f|dd   }|d ur|f| S |S t|||j|j|jd	S )
N)	r&   r  r   r   r  r   r  r  r  r   r    rJ   .r   i)ignore_index)losslogitsr   r  r  )rh   r   r  r7  r  r   r   r   r*   float32r
   r   r   r   r   r   r   r  r  )r/   r&   r  r   r   r  r  r   r  r  r  transformer_outputsr  	lm_logitsr  shift_logitsshift_labelsloss_fctr  r0   r0   r1   r     sL   
z'ChatGLMForConditionalGeneration.forward.beam_idxc                    s   t  fdd| D S )aL  
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        c              	   3   sH    | ]}|d   d |d  j|d  d |d jfV  qdS )r   r    N)index_selectr   r   )r>   r   r  r0   r1   r@     s    zAChatGLMForConditionalGeneration._reorder_cache.<locals>.<genexpr>)r  )r  r  r0   r  r1   _reorder_cache
  s   z.ChatGLMForConditionalGeneration._reorder_cachec                 C   s   |  }|dd}ddgddgddgd	d
gddgg}|D ] }td|d  d|d  |}td|d  d|d  |}q|S )Nu   [[训练时间]]u   2023年,u   ，!u   ！:u   ：;u   ；z\?u   ？z([\u4e00-\u9fff])%sr   z\1%sr    z%s([\u4e00-\u9fff])z%s\1)stripreplacerL   sub)r/   responsepunktsitemr0   r0   r1   process_response  s    z0ChatGLMForConditionalGeneration.process_response   r    Tffffff?ffffff?queryhistory
max_lengthc
                 K   s   |d u rg }|	d u rt  }	|	t  ||||||	d|
}|s#|}n d}t|D ]\}\}}|d|||7 }q)|dt||7 }||gdd}|| j}| jd	i ||}|	 d t|d d d  }|
|}| |}|||fg }||fS )
N)r  	num_beams	do_sampletop_ptemperaturelogits_processor    [Round {}]
问：{}
答：{}
   [Round {}]
问：{}
答：ptreturn_tensorsr   r&   r0   )r   rX   r%   rH  r  r_   r   r   generater>  decoder  )r/   r  r  r  r  r  r  r  r  r  r9  
gen_kwargspromptrM  	old_queryr  inputsr   r0   r0   r1   _chat,  s<   	 

z%ChatGLMForConditionalGeneration._chatc	                 k   s
   |d u rg }|d u rt  }|t  |||||d|	}
|s#|}n d}t|D ]\}\}}|d|||7 }q)|dt||7 }||gdd}|| j}| jd	i ||
D ](}|	 d t|d d d  }|
|}| |}|||fg }||fV  qZd S )
N)r  r  r  r  r  r  r  r  r  r  r   r&   r0   )r   rX   r%   rH  r  r_   r   r   stream_generater>  r  r  )r/   r  r  r  r  r  r  r  r  r9  r  r  rM  r  r  r  r   new_historyr0   r0   r1   stream_chatW  s>    

z+ChatGLMForConditionalGeneration.stream_chatgeneration_configr  stopping_criteriaprefix_allowed_tokens_fnc              	   +   s   |j d |j d }}|d u r| j}t|}|jdi |}	|j|j}}
t|
tr/|
g}
|	dd u o:|j
d u}|rN|jd u rNtd|j
 dt n|jd urj|j| |_
|sjtd|j d|j
 dt ||j
kr| jjrud	nd
}td| d| d|j
 d |d ur|nt }|d ur|nt }| j|||||d}| j||d}| |}||j d d}d }	 | j|fi |	}| di |dddd}|jd d dd d f }|||}|||}tjj|dd}|jrt j!|dd"d nt j#|dd t j$| d d d f gdd}| j%||	| jjd}	|&t' fdd|
D ( }|) dks<|||r>d S |V  q)Nr   r   r  zUsing `max_length`'s default (z) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.zBoth `max_new_tokens` (=z) and `max_length`(=z) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)decoder_input_idsr&   zInput length of z is z, but `max_length` is set to zX. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.)r  input_ids_seq_lengthencoder_input_idsr  r  )r  r  r    TF)r  r  r  r   )num_samples)r  c                 3   s    | ]} |kV  qd S r   r0   )r>   rM  next_tokensr0   r1   r@     r  zBChatGLMForConditionalGeneration.stream_generate.<locals>.<genexpr>r0   )*rb   r  copydeepcopyupdater@  eos_token_idrV  r`   getr  max_new_tokenswarningswarnUserWarningr   rh   r  warningr   r   _get_logits_processor_get_stopping_criteria_get_logits_warpernewfill_r  r  r	   
functionalr   r  r*   multinomialr   argmaxr   r  mulsumrP  r  )r/   r&   r  r  r  r  r9  r  r  r  r  has_default_max_lengthinput_ids_stringlogits_warperunfinished_sequencesr'   model_inputsr   next_token_logitsnext_token_scoresprobsr0   r  r1   r    s   








z/ChatGLMForConditionalGeneration.stream_generatebitsc                 K   sX   |dkrd S ddl m} | jrtd | S d| _|| j_|| j|fd|i|| _| S )Nr   r    )r  zAlready quantized.Tr  )quantizationr  r  r   rT   rh   r  r7  )r/   r  r  r9  r  r0   r0   r1   r    s    
z(ChatGLMForConditionalGeneration.quantizeinputc           	   	   C   s   |d }|d }d|v r|d }nd}d|v r|d }nd}d|v r'|d }nd}d	|v r2|d	 }nd
}t |tjkr?| }| j| j||||||d\}}td tj	|tj
|iS )Ntextr  r  r  r  r  r  r    r  T)r  r  r  r  zGeneration finished.)r   r*   r   r>  r  r  r   rT   r   RESPONSEHISTORY)	r/   r  r  r  r  r  r  r  r  r0   r0   r1   chat  s6   





z$ChatGLMForConditionalGeneration.chat)FF)NNNN)
NNNNNNNNNN)Nr  r    Tr  r  N)Nr  Tr  r  Nr  )&r3   r4   r5   r!   rz   r  r  r   r   strr   r  r  r*   r6   r   r   dictr  r   r7   r   r  r  r  no_gradr   r`   r  r  r   r   r   r   r  r  r  r   r0   r0   r   r1   r  >  s    


&
H	

=*(nr  )NTF)^r   r  r   rQ   rL   sysr  typingr   r   r   r   r   r   r   r*   torch.nn.functionalr	   r  r   torch.utils.checkpointtorch.nnr
   r   torch.nn.utilsr   &transformers.generation.logits_processr   transformers.generation.utilsr   r   r   r   transformers.modeling_outputsr   r   r   transformers.modeling_utilsr   transformers.utilsr   r   r   modelscope.metainfor   modelscope.modelsr   r   r   modelscope.outputsr   modelscope.utilsr   loggingmodelscope.utils.constantr   configurationr!   tokenizationr"   platform_C_jit_set_profiling_mode_jit_set_profiling_executor_jit_override_can_fuse_on_cpu_jit_override_can_fuse_on_gpu
get_loggerr  r  (CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LISTr%   rw   rg  rx   jitscriptr   r   r   r   r   r   r   r  r   r&  r6  CHATGLM_6B_START_DOCSTRINGr  rW  register_moduler  
chatglm_6br  r0   r0   r0   r1   <module>   s    $

J 
6

v 1f_2 p