o
    ߥi                  
   @   s  d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
mZ ddlZddlm  mZ ddlZddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZm Z  dd
l!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ej3dkrej45d ej46d ej47d ej48d e,9 Z+dZ:dZ;dgZ<dd Z=G dd deZ>G dd dejj?Z@	dCd ejAd!eBd"eCd#e	ejA fd$d%ZDG d&d' d'ej?ZEejFjGd(ejAd)ejAd#ejAfd*d+ZHG d,d- d-ejj?ZIG d.d/ d/ejj?ZJG d0d1 d1ejj?ZKd2d3 ZLG d4d5 d5ejj?ZMG d6d7 d7ejj?ZNG d8d9 d9ejj?ZOG d:d; d;e%e"ZPG d<d= d=ejj?ZQG d>d? d?ePZRe0jSe.jTe'jUd@G dAdB dBePZVdS )Dz PyTorch ChatGLM model.     N)AnyCallableDictListOptionalTuple)nn)CrossEntropyLoss	LayerNorm)	skip_init)LogitsProcessor)GenerationConfigLogitsProcessorListModelOutputStoppingCriteriaList)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Model
TorchModel)Models)
OutputKeys)logger)Tasks   )MODELS   )ChatGLM2ConfigdarwinFTzTHUDM/ChatGLM2-6BChatGLM6BConfigzTHUDM/chatglm2-6bc                 O   s   | |i |S N )clsargskwargsr!   r!   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/chatglm2/text_generation.pydefault_init2      r&   c                   @   s(   e Zd ZdejdejdejfddZdS )InvalidScoreLogitsProcessor	input_idsscoresreturnc                 C   s0   t | st | r|  d|d< |S )Ng     j@).   )torchisnananyisinfzero_)selfr)   r*   r!   r!   r%   __call__8   s   z$InvalidScoreLogitsProcessor.__call__N)__name__
__module____qualname__r-   
LongTensorFloatTensorr3   r!   r!   r!   r%   r(   6   s    r(   c                       s6   e Zd ZdZdef fddZdejfddZ  Z	S )PrefixEncoderz
    The torch.nn model to encode the prefix
    Input shape: (batch-size, prefix-length)
    Output shape: (batch-size, prefix-length, 2*layers*hidden)
    configc                    s   t    |j| _| jr:|j|j |j d }tj|j	|| _
tjtj||jtj tj|j|| _d S tj|j	|j|j |j d | _
d S )N   )super__init__prefix_projection
num_layerskv_channelsmulti_query_group_numr-   r   	Embeddingpre_seq_len	embedding
SequentialLinearhidden_sizeTanhtrans)r2   r:   kv_size	__class__r!   r%   r=   G   s    


zPrefixEncoder.__init__prefixc                 C   s,   | j r| |}| |}|S | |}|S r    )r>   rD   rI   )r2   rM   prefix_tokenspast_key_valuesr!   r!   r%   forwardV   s   


zPrefixEncoder.forward)
r4   r5   r6   __doc__r   r=   r-   TensorrP   __classcell__r!   r!   rK   r%   r9   @   s    r9   tensornum_partitionscontiguous_split_chunksr+   c                 C   sF   |   d }|  | | }tj| ||d}|r!tdd |D S |S )a5  Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A list of Tensors
    r   dimc                 s   s    | ]}|  V  qd S r    )
contiguous).0chunkr!   r!   r%   	<genexpr>v       z.split_tensor_along_last_dim.<locals>.<genexpr>)rX   sizer-   splittuple)rT   rU   rV   last_dimlast_dim_sizetensor_listr!   r!   r%   split_tensor_along_last_dim_   s   rd   c                       sV   e Zd Z				d fdd	Z	ddeded	ejd
ejdef
ddZdddZ	  Z
S )RotaryEmbeddingr   FNc                    sR   t    ddtjd|d|dj|d|   }| d| || _|| _|| _d S )N      ?'  r   r;   device)dtypeinv_freq)	r<   r=   r-   arangetoregister_bufferrX   original_impl
rope_ratio)r2   rX   rp   ro   ri   rj   rk   rK   r!   r%   r=   }   s   

zRotaryEmbedding.__init__rg   seq_lenn_elemrj   ri   basec           
   	   C   s   d|t jd|d||d|   }t j|||d| j }t || }t jt |t |gdd}	|t jt j	t j
fv rI|t j	krE|		 n|	 }	|	S )aM  Enhanced Transformer with Rotary Position Embedding.

        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
        transformers/rope/__init__.py. MIT License:
        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
        rf   r   r;   rj   ri   rW   )r-   rl   rp   outerfloatstackcossinfloat16bfloat16int8half)
r2   rq   rr   rj   ri   rs   thetaseq_idx	idx_thetacacher!   r!   r%   forward_impl   s(   
zRotaryEmbedding.forward_implr   c                 C   s   | j || j| jj| jjdS Nrt   )r   rX   rk   rj   ri   )r2   max_seq_lenoffsetr!   r!   r%   rP      s   zRotaryEmbedding.forward)r   FNN)rg   )r   )r4   r5   r6   r=   intr-   rj   ri   r   rP   rS   r!   r!   rK   r%   re   {   s&    
!re   x
rope_cachec           	      C   s   |  d|  d|  d|  df\}}}}|jd d }| dd |f | d|d f } }|d | }| |d||d d}||dd| dd}t|d |d  |d	 |d	   |d	 |d  |d |d	   gd}|d}tj||fdd
S )Nr   r   r;   r   .ru   ).r   ).r   rW   )r^   shapereshapeviewr-   rx   flattencat)	r   r   sq_nprot_dimx_passxshapedx_out2r!   r!   r%   apply_rotary_pos_emb   s$   ,"
	r   c                       s4   e Zd Z			d fdd	ZdejfddZ  ZS )	RMSNormh㈵>Nc                    s.   t    tjtj|||d| _|| _d S )Nri   rj   )r<   r=   r-   r   	Parameteremptyweighteps)r2   normalized_shaper   ri   rj   r$   rK   r!   r%   r=      s
   

zRMSNorm.__init__hidden_statesc                 C   sF   |j }|tjdjddd}|t|| j  }| j| |S )Nr;   ru   T)keepdim)	rj   rm   r-   float32powmeanrsqrtr   r   )r2   r   input_dtypevariancer!   r!   r%   rP      s   zRMSNorm.forward)r   NN)r4   r5   r6   r=   r-   rR   rP   rS   r!   r!   rK   r%   r      s    r   c                       s*   e Zd Zdef fddZdd Z  ZS )CoreAttentionr:   c                    s   t t|   |j| _|j| _| jrd| _td|| _|j|j }|| _	||j | _
|j| _d }t| j
| _| jrD| j}|  j|9  _|| _tj|j| _d S )NTr   )r<   r   r=   apply_query_key_layer_scalingattention_softmax_in_fp32maxlayer_numberr@   num_attention_headshidden_size_per_partitionhidden_size_per_attention_head!num_attention_heads_per_partitionmathsqrtnorm_factorcoeffr-   r   Dropoutattention_dropout)r2   r:   r   projection_sizer   rK   r!   r%   r=      s"   zCoreAttention.__init__c                 C   s  t tjdd }|dkradd |||fD \}}}|d u r5|jd |jd kr5tjjj|||dd}n|d ur<| }tjj||||}|dddd	}|	 d d
 | j
f }|j| }|S |	d|	d|	d|	df}||d |d |d  d}||d	 |d |d  d}tj|d |d  |d |d	 |j|jd}	tj|	|dd|dddddd| j d}
|
j| }| jr| }| jd ur|| j }|d u r|jd |jd	 krtj|d d|d |d	 |jtjd}|  | }|d ur||td}tj|dd}||}| |}|	d|	d|	d|	d	f}||	d|d |d  d}||d |d  |d d}t||dd}|j| }|dddd	 }|	 d d
 | j
f }|j| }|S )N.r   r;   c                 S   s   g | ]
}| d dddqS )r   r;   r   r   )permute)rZ   kr!   r!   r%   
<listcomp>   s    z)CoreAttention.forward.<locals>.<listcomp>T)	is_causalr   r   r   ru   rt   g        rf   )betaalphar   z-infrW   ) r   r-   __version__r_   r   r   
functionalscaled_dot_product_attentionr   r^   r   r   r   r   rj   ri   baddbmm	transposer   r   rw   r   onesbooltril_masked_fillFsoftmaxtype_asr   bmmrY   )r2   query_layer	key_layervalue_layerattention_maskpytorch_major_versioncontext_layernew_context_layer_shapeoutput_sizematmul_input_buffermatmul_resultattention_scoresattention_probsr!   r!   r%   rP      s   
\



	




	


zCoreAttention.forward)r4   r5   r6   r   r=   rP   rS   r!   r!   rK   r%   r      s    r   c                       sD   e Zd ZdZddef fddZ		dddZ		dd	d
Z  ZS )SelfAttentionzParallel self-attention layer abstract class.

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    Nr:   c                    s   t t|   td|| _|j|j | _| j|j | _|j| _	|j
| _
d| j | _| j
r<|j| _| jd| j |j  | _tj|j| jf|jpH|j|dt|| _t|| j| _tj| j|jf|j|dt|| _d S )Nr   r   r;   biasri   )r<   r   r=   r   r   r@   r   r   r   r   multi_query_attentionqkv_hidden_sizerA   $num_multi_query_groups_per_partitionr   rF   rG   add_bias_linearadd_qkv_bias_config_to_kwargsquery_key_valuer   core_attentiondense)r2   r:   r   ri   rK   r!   r%   r=   v  s@   
zSelfAttention.__init__c                 C   s,   | j r| j}n| j}tj|||| j||dS r   )r   r   r   r-   r   r   )r2   inference_max_sequence_len
batch_sizeri   rj   r   r!   r!   r%   _allocate_memory  s   zSelfAttention._allocate_memoryTc                 C   s  |  |}| jrU|j| j| j | j| j | j| j gdd\}}}	|| d d | j| jf }|| d d | j| jf }|	|	 d d | j| jf }	n| d d | jd| j f }
|j|
 }t|d\}}}	|d urt	||}t	||}|d ur|\}}t
j||fdd}t
j||	fdd}	|r||	f}nd }| jr|d}|ddd| j| j d}| | d d | j| jf }|	d}	|	ddd| j| j d}	|	 |	 d d | j| jf }	| |||	|}| |}||fS )Nru   rW   r   r   r   r;   )r   r   r_   r   r   r   r   r^   rd   r   r-   r   	unsqueezeexpandrY   r   r   )r2   r   r   rotary_pos_embkv_cache	use_cachemixed_x_layerr   r   r   new_tensor_shapecache_kcache_vr   outputr!   r!   r%   rP     s   









	
zSelfAttention.forwardr    )NNNT)	r4   r5   r6   rQ   r   r=   r   rP   rS   r!   r!   rK   r%   r   o  s    %
r   c                 C   s   d| j i}|S )Nrj   )torch_dtype)r#   common_kwargsr!   r!   r%   r     s   r   c                       0   e Zd ZdZddef fddZdd Z  ZS )	MLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    Nr:   c                    sx   t t|   |j| _tj|j|jd f| j|dt	|| _
dd }|| _tj|j|jf| j|dt	|| _d S )Nr;   r   c                 S   s&   t j| ddd} t| d | d  S )Nr;   ru   rW   r   r   )r-   r[   r   silu)r   r!   r!   r%   swiglu'  s   zMLP.__init__.<locals>.swiglu)r<   r   r=   r   add_biasr   rF   rG   ffn_hidden_sizer   dense_h_to_4hactivation_funcdense_4h_to_h)r2   r:   ri   r   rK   r!   r%   r=     s,   zMLP.__init__c                 C   s"   |  |}| |}| |}|S r    )r   r   r   )r2   r   intermediate_parallelr   r!   r!   r%   rP   5  s   


zMLP.forwardr    r4   r5   r6   rQ   r   r=   rP   rS   r!   r!   rK   r%   r     s    r   c                       s6   e Zd ZdZd	def fddZ		d
ddZ  ZS )GLMBlockzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    Nr:   c                    s   t t|   || _|j| _|j| _|jrtnt}||j	|j
||jd| _t|||d| _|j| _||j	|j
||jd| _t||d| _d S )Nr   ri   rj   rh   )r<   r   r=   r   (apply_residual_connection_post_layernormfp32_residual_connectionrmsnormr   r
   rG   layernorm_epsilonr   input_layernormr   self_attentionhidden_dropoutpost_attention_layernormr   mlp)r2   r:   r   ri   LayerNormFuncrK   r!   r%   r=   E  s,   zGLMBlock.__init__Tc                 C   s   |  |}| j|||||d\}}| jr|}n|}tjjj|| j| jd}	||	 }	| 	|	}| 
|}
| jr9|}n|	}tjjj|
| j| jd}|| }||fS )Nr   r   )ptraining)r  r  r  r-   r   r   dropoutr  r  r  r	  )r2   r   r   r   r   r   layernorm_outputattention_outputresiduallayernorm_input
mlp_outputr   r!   r!   r%   rP   d  s2   





zGLMBlock.forwardr    r   r   r!   r!   rK   r%   r   >  s    $r   c                       sR   e Zd ZdZddef fddZdd Z				dd
ee dee fddZ	  Z
S )GLMTransformerzTransformer class.Nr:   c                    s   t t|   j| _j| _j| _fdd tj fddt	| jD | _
| jrAjr3tnt}|jjjd| _d| _d S )Nc                    s   t  | dS )Nrh   )r   )r   )r:   ri   r!   r%   build_layer  r'   z,GLMTransformer.__init__.<locals>.build_layerc                    s   g | ]} |d  qS )r   r!   rZ   i)r  r!   r%   r     s    z+GLMTransformer.__init__.<locals>.<listcomp>r   F)r<   r  r=   r  post_layer_normr?   r-   r   
ModuleListrangelayersr  r   r
   rG   r  r   final_layernormgradient_checkpointing)r2   r:   ri   r
  rK   )r  r:   ri   r%   r=     s"   
zGLMTransformer.__init__c                 C   s
   | j | S r    )r  )r2   r   r!   r!   r%   
_get_layer  s   
zGLMTransformer._get_layerTFr   output_hidden_statesc              	   C   s   |sdd t | jD }|rdnd }| jr!| jr!|r!td d}d }|r'dnd }	t | jD ]9}
|r7|	|f }	| |
}| jrQ| jrQtjj		||||||
 |}n||||||
 |d}|\}}|rg||f }q.|ro|	|f }	| j
rw| |}|||	|fS )Nc                 S   s   g | ]}d qS r    r!   )rZ   r   r!   r!   r%   r     s    z*GLMTransformer.forward.<locals>.<listcomp>r!   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr  )r  r?   r  r  r   warning_oncer  r-   utils
checkpointr  r  )r2   r   r   r   	kv_cachesr   r  presentsall_self_attentionsall_hidden_statesindexlayer	layer_retr   r!   r!   r%   rP     sH   	




zGLMTransformer.forwardr    )NTF)r4   r5   r6   rQ   r   r=   r  r   r   rP   rS   r!   r!   rK   r%   r    s    r  c                       sr   e Zd ZdZdZdZeZdZdgZ	 fddZ
dejfd	d
ZdddZdd ZdddZe fddZ  ZS )ChatGLMPreTrainedModelz
    An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    FTtransformerr   c                    s*   t  j|jfi | t t| | d S r    )r<   r=   name_or_pathr   )r2   r:   r$   rK   r!   r%   r=     s   zChatGLMPreTrainedModel.__init__modulec                 C   s   dS )zInitialize the weights.Nr!   )r2   r-  r!   r!   r%   _init_weights  s   z$ChatGLMPreTrainedModel._init_weightsNc                 C   s   |j \}}tj||||jd}|  d}|r |d d j d }|r3tjtj||||jd|fdd}|d ur>||d }|sM|d urM||dd 8 }|dk  }|d |S )Nrh   r   ru   rW   r   g      ?)	r   r-   r   ri   r   r   r   r   
unsqueeze_)r2   r)   rO   padding_maskr   
seq_lengthfull_attention_maskpast_lengthr!   r!   r%   	get_masks  s:   

	
z ChatGLMPreTrainedModel.get_masksc                 C   s.   |j \}}tj|tj|dd|d}|S )Nrt   r   r   )r   r-   rl   longr   repeat)r2   r)   ri   r   r1  position_idsr!   r!   r%   get_position_ids  s   
z'ChatGLMPreTrainedModel.get_position_idsc                 C   s   t |tr
||_d S d S r    )
isinstancer  r  )r2   r-  valuer!   r!   r%   _set_gradient_checkpointing  s   

z2ChatGLMPreTrainedModel._set_gradient_checkpointingc                    s<   | dd}| dd tt| jdd|i|}||_|S )a'  Instantiate the model.

        Args:
            kwargs: Input args.
                    model_dir: The model dir used to load the checkpoint and the label information.

        Returns:
            The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
        	model_dirNcfgpretrained_model_name_or_pathr!   )popr<   r   from_pretrainedr<  )r"   r$   r<  modelrK   r!   r%   _instantiate#  s   z#ChatGLMPreTrainedModel._instantiater    F)r4   r5   r6   rQ   is_parallelizablesupports_gradient_checkpointingr   config_classbase_model_prefix_no_split_modulesr=   r   Moduler.  r4  r8  r;  classmethodrB  rS   r!   r!   rK   r%   r*    s    

r*  c                       r   )	rB   zLanguage model embeddings.Nr:   c                    s<   t t|   |j| _tj|j| j|j|d| _|j| _d S r   )	r<   rB   r=   rG   r   padded_vocab_sizer   word_embeddingsr  )r2   r:   ri   rK   r!   r%   r=   :  s   zEmbedding.__init__c                 C   s0   |  |}|}|dd }| jr| }|S )Nr   r   )rL  r   rY   r  rw   )r2   r)   words_embeddings
embeddingsr!   r!   r%   rP   F  s   
zEmbedding.forwardr    r   r!   r!   rK   r%   rB   7  s    rB   c                       s   e Zd Zddef fddZdd Zejfdd	Z								dd
e	ej
 de	ej de	ej de	eeej
ej
f df  de	ej
 de	e de	e de	e fddZdefddZ  ZS )ChatGLMModelNTr:   c                    s4  t  | |rt}nt}i }|d ur||d< |t|fi || _|j| _|j| _|j| _|j	| _	|jd u r<|j
|j n|j}t|d |j|j||jd| _|t|fi || _|tj|j
|jfd|jd|| _|j| _|j| _| jd ur|  D ]}d|_q{t| j | _t|| _ tj!d| _"d S d S )Nri   r;   )rp   ro   ri   rj   F)r   rj   g?)#r<   r=   r   r&   rB   rD   r?   rA   r@   r1  rG   r   re   rp   original_roper   r   r  encoderr   rF   rK  output_layerrC   r>   
parametersrequires_gradr-   rl   r5  rN   r9   prefix_encoderr   r  )r2   r:   ri   
empty_initinit_methodinit_kwargs
rotary_dimparamrK   r!   r%   r=   T  sV   


zChatGLMModel.__init__c                 C   s   | j jS r    )rD   rL  )r2   r!   r!   r%   get_input_embeddings  s   z!ChatGLMModel.get_input_embeddingsc                 C   sj   | j d|d|}| ||}||| j| jd | j	| j
}| |}|g dd}|S )Nr   ru   r;   )r;   r   r   r      )rN   r   r   rm   rU  typer   rC   r?   rA   r@   r  r   r_   )r2   r   ri   rj   rN   rO   r!   r!   r%   
get_prompt  s   

zChatGLMModel.get_promptr7  r   r2  rO   .inputs_embedsr   r  return_dictc
                 C   sb  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|j\}
}|d u r,| |}| jd urR|d u r?| j|
|j|j	d}|d urRt
j||
| jf|gdd}|d u rl|d ur^| rd|rl|dkrl| j|||d}| | j}|d ur{|| }n|d d |f }|dd }| j||||||d\}}}}|	stdd	 ||||fD S t||||d
S )N)r   ri   rj   ru   rW   r   )r0  r   )r   r#  r   r  c                 s   s    | ]	}|d ur|V  qd S r    r!   )rZ   vr!   r!   r%   r\     s    z'ChatGLMModel.forward.<locals>.<genexpr>)last_hidden_staterO   r   
attentions)r:   r  r   use_return_dictr   rD   rC   r^  ri   rj   r-   r   new_onesallr4  r   r1  r   rY   rQ  r`   r   )r2   r)   r7  r   r2  rO   r_  r   r  r`  r   r1  r   r   r$  r&  r%  r!   r!   r%   rP     sr   




zChatGLMModel.forwardweight_bit_widthc                 C   s   ddl m} || j| | S )Nr   quantize)quantizationri  rQ  )r2   rg  ri  r!   r!   r%   ri    s   zChatGLMModel.quantizer   )NNNNNNNN)r4   r5   r6   r   r=   r[  r-   r~   r^  r   rR   
BoolTensorr   r   rP   r   ri  rS   r!   r!   rK   r%   rO  R  s@    +	

KrO  )module_namec                       s  e Zd Zd>def fddZ		d?dedeeef d	e	d
e	deeef f
ddZ
				d@dejdeej deej deej de	defddZ											dAdeej deej deej deeej  deej deej dee	 dee	 dee	 dee	 dee	 fddZedeeejejf df d ejdeeejejf df fd!d"Zd#d$ Z	dBd%ed&eeeef  fd'd(Z	dBd%ed&eeeef  fd)d*Ze 			+		,	,	dCd%ed&eeeef  d-efd.d/Ze 					,	,		dDd%ed&eeeef  d-efd0d1Ze 					dEd2ee d3ee d4ee  d5ee!eejgee f  fd6d7Z"dFd8efd9d:Z#d;edefd<d=Z$  Z%S )G ChatGLM2ForConditionalGenerationTNr:   c                    sR   t  | |j| _t|||d| _|| _d| _| jjr'| j	| jjdd d S d S )NrV  ri   FT)rV  )
r<   r=   
max_lengthmax_sequence_lengthrO  r+  r:   	quantizedquantization_bitri  )r2   r:   rV  ri   rK   r!   r%   r=     s   z)ChatGLM2ForConditionalGeneration.__init__Foutputsmodel_kwargsis_encoder_decoderstandardize_cache_formatr+   c                 C   s   | j ||d|d< d|v r$|d }tj|||jd dfgdd|d< d|v rE|d }|d	dd f  }|d7 }tj||gdd|d< d
|d< |S )N)rv  rO   r   r   r   ru   rW   r7  .Fis_first_forward)_extract_past_from_model_outputr-   r   re  r   clone)r2   rs  rt  ru  rv  r   r7  new_position_idr!   r!   r%   #_update_model_kwargs_for_generation  s,   

	
zDChatGLM2ForConditionalGeneration._update_model_kwargs_for_generationr)   rO   r   r7  rw  c                 K   sP   |d u r| j ||jd}|s |ddd f }|d d dd f }||||ddS )Nrh   .ru   T)r)   rO   r7  r   return_last_logit)r8  ri   )r2   r)   rO   r   r7  rw  r$   r!   r!   r%   prepare_inputs_for_generation  s   	z>ChatGLM2ForConditionalGeneration.prepare_inputs_for_generationr_  labelsr   output_attentionsr  r`  r|  c              
   C   sL  |d ur|n| j j}|
d ur|
n| j j}
| j|||||||	|
d}|d }|r-|dd  }| j|}|dd }d }|d ur|tj	}|dd dd d f  }|ddd f  }t
dd}||j}||d|d|d}||j}||j}|
s|f|dd   }|d ur|f| S |S t|||j|j|jdS )	N)r)   r7  r   rO   r_  r   r  r`  r   ru   r   .i)ignore_index)losslogitsrO   r   rc  )r:   r   rd  r+  rR  r   rY   rm   r-   r   r	   ri   r   r^   rj   r   rO   r   rc  )r2   r)   r7  r   rO   r_  r~  r   r  r  r`  r|  transformer_outputsr   	lm_logitsr  shift_logitsshift_labelsloss_fctr   r!   r!   r%   rP   '  sP   
z(ChatGLM2ForConditionalGeneration.forwardpast.beam_idxc                    s   t  fdd| D S )aL  
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        c              	   3   sH    | ]}|d   d |d  j|d  d |d jfV  qdS )r   r   N)index_selectrm   ri   )rZ   
layer_pastr  r!   r%   r\   r  s    zBChatGLM2ForConditionalGeneration._reorder_cache.<locals>.<genexpr>)r`   )r  r  r!   r  r%   _reorder_cachef  s   z/ChatGLM2ForConditionalGeneration._reorder_cachec                 C   s   |  }|dd}|S )Nu   [[训练时间]]u   2023年)stripreplace)r2   responser!   r!   r%   process_responsew  s   z1ChatGLM2ForConditionalGeneration.process_responsequeryhistoryc                 C   s,   |j ||d}||gdd}|| j}|S )Nr  ptreturn_tensors)build_promptrm   ri   )r2   	tokenizerr  r  promptinputsr!   r!   r%   build_inputs|  s   z-ChatGLM2ForConditionalGeneration.build_inputsc                 C   s|   |r%d t|d |}|j|dd}|dd  }|j|d fgddd}nd t|d |}||gdd}|| j}|S )	Nu   

[Round {}]

问：{}

答：r   F)add_special_tokensr  )r  r  u   [Round {}]

问：{}

答：r  )formatlenencodebatch_encode_plusrm   ri   )r2   r  r  r  r  r)   r  r!   r!   r%   build_stream_inputs  s   z4ChatGLM2ForConditionalGeneration.build_stream_inputsr   皙?ro  c
                 K   s   |d u rg }|	d u rt  }	|	t  |d u r| j}||||||	d|
}| j|||d}| jdi ||}| d t|d d d  }||}| 	|}|||fg }||fS )N)ro  	num_beams	do_sampletop_ptemperaturelogits_processorr  r   r)   r!   )
r   appendr(   r1  r  generatetolistr  decoder  )r2   r  r  r  ro  r  r  r  r  r  r$   
gen_kwargsr  rs  r  r!   r!   r%   _chat  s.   	 

z&ChatGLM2ForConditionalGeneration._chatc                 k   s   |d u rg }|	d u rt  }	|	t  |d u r| j}|||||	d|}|d u r4|
s4| j|||d}n| j|||d}|d urp|d d jd }| jjd urU|| jj8 }| j	|7  _	|j
}tj|d||fdd}||d< | jdi |||
d|D ]?}|
r|\}}| d t|d d d  }||}|r|d	 d
kr| |}|||fg }|
r|||fV  q||fV  qd S )N)ro  r  r  r  r  r  r   r   rW   r   )rO   return_past_key_valuesr)   ru   u   �r!   )r   r  r(   r1  r  r  r   r+  rC   r7  r   r-   r   re  stream_generater  r  r  r  )r2   r  r  r  rO   ro  r  r  r  r  r  r$   r  r  r3  r   rs  r  new_historyr!   r!   r%   stream_chat  sh   
 


z,ChatGLM2ForConditionalGeneration.stream_chatgeneration_configr  stopping_criteriaprefix_allowed_tokens_fnc              	   +   s   |j d |j d }}	|d u r| j}t|}|jdi |}
|j|j}}t|tr/|g}|	dd u o:|j
d u}|rN|jd u rNtd|j
 dt n|jd urj|j|	 |_
|sjtd|j d|j
 dt |	|j
kr| jjrud	nd
}td| d|	 d|j
 d |d ur|nt }|d ur|nt }| j||	|||d}| j||d}| |}||j d d}d }	 | j|fi |
}| di |dddd}|jd d dd d f }|||}|||}tjj|dd}|jrt j!|dd"d nt j#|dd t j$| d d d f gdd}| j%||
| jjd}
|&t' fdd|D ( }|r9||j)fV  n|V  |* dksI|||rKd S q)Nr   ru   ro  zUsing `max_length`'s default (z) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.zBoth `max_new_tokens` (=z) and `max_length`(=z) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)decoder_input_idsr)   zInput length of z is z, but `max_length` is set to zX. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.)r  input_ids_seq_lengthencoder_input_idsr  r  )r  r  r   TF)r`  r  r  rW   )num_samples)ru  c                 3   s    | ]} |kV  qd S r    r!   r  next_tokensr!   r%   r\   ^  r]   zCChatGLM2ForConditionalGeneration.stream_generate.<locals>.<genexpr>r!   )+r   r  copydeepcopyupdatebos_token_ideos_token_idr9  r   getro  max_new_tokenswarningswarnUserWarningr   r:   ru  warningr   r   _get_logits_processor_get_stopping_criteria_get_logits_warpernewfill_r}  r  r   r   r   r  r-   multinomialsqueezeargmaxr   r{  mulsumr5  rO   r   )r2   r)   r  r  r  r  r  r$   r   r  rt  r  has_default_max_lengthinput_ids_stringlogits_warperunfinished_sequencesr*   model_inputsrs  next_token_logitsnext_token_scoresprobsr!   r  r%   r    s   








z0ChatGLM2ForConditionalGeneration.stream_generatebitsc                 K   s^   |dkrd S ddl m} | jrtd | S d| _|| j_|| jj|f||d|| j_| S )Nr   r   rh  zAlready quantized.Trn  )	rj  ri  rq  r   infor:   rr  r+  rQ  )r2   r  rV  ri   r$   ri  r!   r!   r%   ri  h  s$   

z)ChatGLM2ForConditionalGeneration.quantizeinputc           
   	   C   s   |d }|d }d|v r|d }nd}d|v r|d }nd}d|v r'|d }nd}d	|v r2|d	 }nd
}t |tjkr?| }| j|||||||d\}	}tj|	tj|iS )Ntextr  ro  i   r  gffffff?r  r   r  T)ro  r  r  r  )r]  r-   rR   r  r  r   RESPONSEHISTORY)
r2   r  r  r  r  ro  r  r  r  r  r!   r!   r%   chat~  s4   




z%ChatGLM2ForConditionalGeneration.chat)TN)FF)NNNT)NNNNNNNNNNFr    )NNr   Tr  r  N)NNNTr  r  NF)NNNNF)FN)&r4   r5   r6   r   r=   r   r   strr   r   r{  r-   r7   r   rR   dictr}  r   r8   rP   staticmethodr  r  r   r  r  no_gradr   r  r  r   r   r   r   r  ri  r  rS   r!   r!   rK   r%   rm    s   


$
	

?

#;prm  rC  )WrQ   r  r   sysr  typingr   r   r   r   r   r   r-   torch.nn.functionalr   r   r   torch.utils.checkpointtorch.nnr	   r
   torch.nn.utilsr   &transformers.generation.logits_processr   transformers.generation.utilsr   r   r   r   transformers.modeling_outputsr   r   transformers.modeling_utilsr   
modelscoper   r   modelscope.metainfor   modelscope.outputsr   modelscope.utilsr   loggingmodelscope.utils.constantr    r   configurationr   platform_C_jit_set_profiling_mode_jit_set_profiling_executor_jit_override_can_fuse_on_cpu_jit_override_can_fuse_on_gpu
get_logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOC(CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LISTr&   r(   rI  r9   rR   r   r   rd   re   jitscriptr   r   r   r   r   r   r   r  r*  rB   rO  register_moduler  chatglm2_6brm  r!   r!   r!   r%   <module>   s     

"
9  ,WTN 