o
    ߥi8                     @   s  d Z ddlmZmZmZmZ ddlZddlZddl	m
  mZ ddlmZ ddlmZ ddlmZ e Zeeeef  Zee Zeee  Zdeded	ed
efddZdd ZdejdefddZdd Z				d8dededeeeef  dededefddZd d d!d"ee d#ee d$ee ded%ed&ed'efd(d)Z d d d!d"ee d#ee d*ee ded%ed+ed&ed'efd,d-Z!	 	 d9d"eejef ded%ed+eded&ed'ed
efd.d/Z"G d0d1 d1eZ#dd2e$d3 fd4d5Z%d6d7 Z&dS ):zGeneration support.    )IterableListTupleUnionN)PreTrainedTokenizer)LogitsProcessor)
get_loggerbatchpad_id
seq_lengthreturnc                 C   s2   | D ]}t |}||k r||g||   q| S N)lenextend)r	   r
   r   tokenscontext_length r   d/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/qwen/qwen_generation_utils.py	pad_batch   s   r   c              
   C   sX  |   \}}|r|}nd}ttj|||f| jd|d||}tj|   tj| jd}	|r5d|	| |k< tj|tj| jd}
|
	d
| }
|rM|
 }
|sQ|rt|D ]M}|
|| | |kf }|rg| }d}t|  d D ]0}|| }|rd||d|d dd|d f< |r|
||d df  |d | 8  < |d }qqqU|dk }||	|
fS )z4Build masks and position id for left to right model.   )device)dtyper           r   Ng      ?)sizetorchtrilonesr   viewfloatarangelong	unsqueeze	expand_asclonerange)data	eod_tokenreset_position_idsreset_attention_maskeod_mask_lossmicro_batch_sizer   att_mask_batchattention_mask	loss_maskposition_idsb	eod_index
prev_indexjir   r   r   get_ltor_masks_and_position_ids!   sN   

 $
r4   context_tokenseod_idc                 C   s2   |   | j}t||dddd\}}}|||fS )z#Generate batch from context tokens.F)r'   r(   r)   )
contiguoustor   r4   )r5   r6   r   r,   _r.   r   r   r   	get_batch`   s   
r:   c                 C   sH   | dkr| d|jgg}|S | dkr|jg|jgg}|S td| )NrawzHuman:chatmlUnknown chat format )encoder6   	im_end_idim_start_idNotImplementedError)chat_format	tokenizerstop_words_idsr   r   r   get_stop_words_idso   s   rE       r<   rC   queryhistorysystemmax_window_sizerB   c              	      s  |d u rg }|dkrd\}}j g}jg}	d  fdd}
|
d|\}}|| |	 }d}g }t|D ]O\}}|
d|\}}|| |	 }|
d	|\}}|| |	 } |   | }d| | | d| | | }t|t| t| }||k r|| }|| }q7 || }| | | | }| | |
d|d
  |	   | d	   7 }|d| d| | d| d7 }||fS |dkr|}|}||fS td|)Nr<   )z<|im_start|>z
<|im_end|>
c                    s&   |  d|  |    | fS )NrL   )r>   )rolecontent	nl_tokensrC   r   r   _tokenize_str   s   z#make_context.<locals>._tokenize_strrJ   rF   user	assistantr   zuser
z
assistant
r;   r=   )r@   r?   r>   reversedr   rA   )rC   rH   rI   rJ   rK   rB   im_startim_endim_start_tokensim_end_tokensrQ   system_textsystem_tokens_partsystem_tokensraw_textr5   
turn_queryturn_response
query_textquery_tokens_partquery_tokensresponse_textresponse_tokens_partresponse_tokensnext_context_tokens	prev_chatcurrent_context_sizer   rO   r   make_contexty   sh   

 
rh   F)verbosereturn_end_reasonr   
stop_words	eod_wordsraw_text_lenri   rj   c                C   s   | | |d  }|rtd| dt|  }|D ]
}	||	d }q|D ]}
|
|v r1d|
}||
d }q&| }|rItd| td| |rO||fS |S )Nz
Raw Generate: Gen length rF   Gen r   
End Reason:z
Generate: )decodeprintr   replacestripsplit)r   rk   rl   rC   rm   ri   rj   trim_decode_tokens
end_reason	stop_wordeod_wordr   r   r   _decode_default   s"   




rz   eod_token_idsr   c                C   s   dt |  }|}	t|t | D ]}	| |	 |v r%d|| |	 g} nq|| d |	 |d  }
|rKtd|| |d   td|
 td| |D ]
}|
|d }
qM|
 }
|rctd|
 |ri|
|fS |
S )Nrn   ro   z
Raw Generate w/o EOD:z
Raw Generate:rp   rF   z

Generate:)r   r$   rq   rr   rs   rt   )r   rk   r{   rC   rm   r   ri   rj   rw   eod_token_idxrv   rx   r   r   r   _decode_chatml   s4   



r}   c              
   C   sr   t | r|    } |dkr!t| g |j|jg|||||dS |dkr2t| dgdg||||dS t	d|)Nr<   )rk   r{   rC   rm   r   ri   rj   r;   z<|endoftext|>)rk   rl   rC   rm   ri   rj   r=   )
r   	is_tensorcpunumpytolistr}   r@   r?   rz   rA   )r   rC   rm   r   rB   ri   rj   r   r   r   decode_tokens   s0   
	


r   c                   @   s|   e Zd ZdZdeee  defddZdejdej	dej	fd	d
Z
dejdee defddZdee dee fddZdS )StopWordsLogitsProcessora  
    :class:`transformers.LogitsProcessor` that enforces that when specified sequences appear, stop geration.

    Args:
        stop_words_ids (:obj:`List[List[int]]`):
            List of list of token ids of stop ids. In order to get the tokens of the words
            that should not appear in the generated text, use :obj:`tokenizer(bad_word,
            add_prefix_space=True).input_ids`.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token.
    rD   eos_token_idc                    s   t |trt|dkrtd| dtdd |D r$td| dtdd |D r5td| dtt fd	d
|| _ | _| jD ]}t|dksVJ d	|qGd S )Nr   z4`stop_words_ids` has to be a non-emtpy list, but is .c                 s   s    | ]	}t |t V  qd S r   )
isinstancelist).0bad_word_idsr   r   r   	<genexpr>8  s    z4StopWordsLogitsProcessor.__init__.<locals>.<genexpr>z3`stop_words_ids` has to be a list of lists, but is c                 s   s"    | ]}t d d |D V  qdS )c                 s   s*    | ]}t |ttjf p|d k V  qdS )r   N)r   intnpinteger)r   token_idr   r   r   r   >  s    
z>StopWordsLogitsProcessor.__init__.<locals>.<genexpr>.<genexpr>N)any)r   stop_word_idsr   r   r   r   =  s    
zLEach list in `stop_words_ids` has to be a list of positive integers, but is c                    s
   |  gkS r   r   )bad_token_seqr   r   r   <lambda>G  s   
 z3StopWordsLogitsProcessor.__init__.<locals>.<lambda>z7Stop words token sequences {} cannot have an empty list)
r   r   r   
ValueErrorr   r   filterrD   r   format)selfrD   r   stop_token_seqr   r   r   __init__1  s:   



z!StopWordsLogitsProcessor.__init__	input_idsscoresr   c                 C   s6   |  |}t|D ]\}}|rtd||| jf< q	|S )Ni   )_calc_stopped_samples	enumerater   r   )r   r   r   stopped_samplesr3   should_stopr   r   r   __call__P  s   
z!StopWordsLogitsProcessor.__call__prev_tokensr   c                 C   sF   t |dkrdS t |t |krdS |t | d   |kr!dS dS )Nr   TF)r   r   )r   r   r   r   r   r   _tokens_matchX  s   z&StopWordsLogitsProcessor._tokens_matchprev_input_idsc                 C   s@   g }|D ]}d}| j D ]}| ||rd} nq|| q|S )NFT)rD   r   append)r   r   r   prev_input_ids_slicematchr   r   r   r   r   f  s   
z.StopWordsLogitsProcessor._calc_stopped_samplesN)__name__
__module____qualname____doc__r   r   r   r   
LongTensorFloatTensorr   r   boolr   r   r   r   r   r   r   $  s(    


r   r   Infc           
      C   s   |dkr| t | |d d k }|| |< |dkr^t j| ddd\}}t jtj|dddd}||k}|dd	df  |dd
d	f< d|d< t|dD ]}	||	 ||	  }|| |	 |< qM| S )zThis function has been mostly taken from huggingface conversational
    ai code at
        https://medium.com/huggingface/how-to-build-a-state-of-the-art-
             conversational-ai-with-transfer-learning-2d818ac26313r   ).Nr   Tr   )
descendingdim)r   .Nr   ).r   )	r   topksortcumsumFsoftmaxr#   r$   r   )
logitstop_ktop_pfilter_valueindices_to_removesorted_logitssorted_indicescumulative_probssorted_indices_to_remover3   r   r   r   top_k_logitsu  s*   

r   c                 C   s   | | }d| |  ||  S )Nr   )type_as)val1val2booleanr   r   r   switch  s   
r   )NrF   rG   r<   )FF)'r   typingr   r   r   r   r   r   r   torch.nn.functionalnn
functionalr   transformersr   transformers.generationr   modelscope.utils.loggerr   loggerstrHistoryTyper   
TokensTypeBatchTokensTyper   r4   r   r:   rE   rh   r   rz   r}   r   r   r   r   r   r   r   r   r   <module>   s   

	?
H
(	
+
%Q"