o
     iw                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZmZ d dlZd dlm  mZ d dlmZmZ d dlmZ d dlmZmZmZ z
d d	lmZmZ W n eyn   ed
ddgZedddgZY nw eG dd dZdd Z dd Z!d.ddZ"e# 									d/ddZ$d.ddZ%e# 										d0dd Z&G d!d" d"Z'ej(fd#ee)ef fd$d%Z*eG d&d' d'Z+e# 	(			)d1d*d+Z,	)d2d,d-Z-dS )3    N)
namedtuple)	dataclassfield)partial)CallableOptionalSequenceUnion)	rearrangerepeat)Tensor)ProfilerActivityprofilerecord_function)GreedySearchDecoderOnlyOutputSampleDecoderOnlyOutputr   	sequencesscoresr   c                   @   sd   e Zd ZU dZeed< eed< dZeed< dZeed< ee	dZ
e	ed< d	Zee ed
< dd Zd	S )InferenceParamszInference parameters that are passed to the main model in order
    to efficienly calculate and store the context during inference.
max_seqlenmax_batch_sizer   seqlen_offsetbatch_size_offsetdefault_factorykey_value_memory_dictNlengths_per_samplec                 C   s.   || _ || _d| _| jd ur| j  d S d S )Nr   )r   r   r   r   zero_)selfr   r    r   Y/home/ubuntu/.local/lib/python3.10/site-packages/xformers/_flash_attn/utils/generation.pyreset#   s   
zInferenceParams.reset)__name__
__module____qualname____doc__int__annotations__r   r   r   dictr   r   r   r   r!   r   r   r   r    r      s   
 r   c                 C   s,   | t | |d d k }| |td dS )z<Set the logits for none top-k values to -inf. Done in-place.r   ).Nz-InfN)torchtopkmasked_fill_float)logitstop_kindices_to_remover   r   r    !modify_logits_for_top_k_filtering-   s   r1   c                 C   sh   |dks|dkr
dS t j| dd\}}|jddjdd}|d| k}|d||}| |td	 dS )
z<Set the logits for none top-p values to -inf. Done in-place.              ?NF)
descendingr)   dim   z-inf)r*   sortsoftmaxcumsumscatterr,   r-   )r.   top_psorted_logitssorted_indicescumulative_probssorted_indices_to_remover0   r   r   r    !modify_logits_for_top_p_filtering5   s   rA   r7   r2   r3   c                 C   s   |dkr
| j ddS |dkr|dksJ d|dkrVt|| d}tj| |dd\}}|dkr4|| }t|| |tj|jd |jdtj	tj
|dddd	jddf S |dkr^| | n|  }t|| tj	tj
|dddd	jddS )
zfSample from top-k logits.
    Arguments:
        logits: Tensor of shape (batch_size, vocab_size)
    r7   r)   r5   r2   r3   top-p should be in (0, 1].r   devicenum_samples)argmaxminsizer*   r+   rA   arangeshaperD   multinomialr9   squeezeclone)r.   r/   r<   temperature
logits_topindicesr   r   r    sampleE   s(   

rR   Fc                    s  | j \ }durj d ndr1tdsd_tj ||	d_jj}|  nt d} 
fdd}	fd	d
}fdd}tjj	|d}tjj	|d}|rn|	dkrjtj
  |  g | g}}||d |s|||d | | j|d j d 7  _|||d | ||d |r{|r|  |	dkrtj
  tj  td||dd dkrtnt}|tj|ddt|dS )a  Decoding, either greedy or with top-k or top-p sampling.
    If top-k = 0, don't limit the number of candidates (pure sampling).
    Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
    then top-p.
    We assume that all sequences in the same batch have the same length.

    Arguments:
        input_ids: (batch, seq_len)
        max_length: int
        teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the
            logits, the next token is taken from the teacher_outputs. Useful for testing.
    Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
        sequences: (batch, max_length)
        scores: tuples of (batch, vocab_size)
    Nr7   r   _decoding_cache)tensor_parallelr   r   c                    s   |j dk}|rtj df|j tj| jd}nd }r|s*| ||ddjjdd}nj| ||j jdd}d urC|dd f S |S )Nr   r7   dtyperD   position_idsinference_paramsnum_last_tokensr5   .)	r   r*   fulllongrD   r.   rM   rS   run)	input_idsrZ   decodingrY   r.   )
batch_sizecgmodel
vocab_sizer   r    
get_logits   s0   
zdecode.<locals>.get_logitsc                    s@   d u s	 |j krt| d}n	d d |j f }|dS )Nr/   r<   rO   r7   )r   rR   	unsqueeze)r.   rZ   token)teacher_output_lenteacher_outputsrO   r/   r<   r   r    sample_tokens   s   
zdecode.<locals>.sample_tokensc                    s<   |j dkrdS  d ur|  k rdS |j d krdS dS )Nr   FTr7   )r   all)current_tokenrZ   )eos_token_id
max_lengthr   r    should_stop   s   
zdecode.<locals>.should_stop)enable_timingr)   #Prompt processing + decoding time: .0fmsr5   r   r   )rK   hasattrrS   update_graph_cacherZ   r!   r   r*   cudaEventdistributedbarrierrecordappendr   synchronizeprintelapsed_timer   r   cattuple)r_   rc   ro   r/   r<   rO   rn   rj   rd   rT   rb   rq   	seqlen_ogrZ   re   rk   rp   startendr   r   
output_clsr   )ra   rb   rn   ro   rc   ri   rj   rO   r/   r<   rd   r    decodeb   sN   

	


r   c                 C   s  | j \}}}|d }	|j ||	|fksJ |j ||	fksJ |jtjtjfv s(J |dkr4|dks4J d|dkr<| | n|  } |dkrH|| n| }|dkrbt|| d}t| | t|| t	| | t	|| tj
| dd}
tj
|dd}dd	 }tj||	|
jd
||| ||
ddddf |k}|jdd}t||	| jdd}tj|
ddddf | dd}tj||
ddddf gdd}t|jdt|d|ddd}tj|ddjdd}t|d}||dd|f< ||d fS )a  Algorithm 1 from [1]
    [1] Fast Inference from Transformers via Speculative Decoding
    Yaniv Leviathan, Matan Kalman, Yossi Matias
    https://arxiv.org/abs/2211.17192

    Arguments:
        logits: Tensor of shape (batch_size, seqlen + 1, vocab_size)
        logits_draft: Tensor of shape (batch_size, seqlen, vocab_size)
        tokens_draft: Tensor of shape (batch_size, seqlen)
    Return:
        tokens: Tensor of shape (batch_size, seqlen + 1)
        num_generated_tokens: Tensor of shape (batch_size), with value in [1, seqlen + 1].
            For each sequence in the batch, the number of valid tokens that were sampled by
            speculative sampling.
    r7   r2   r3   rB   r   r)   r5   c                 S   s   t | jdt |dddS )Nr)   z... -> ... 1r6   indexz... 1 -> ...)r
   gather)probstokensr   r   r    <lambda>   s    z$sample_speculative.<locals>.<lambda>rC   N)rH   z
b -> b 1 d)dr   zb 1 d -> b drE   )r   r7   )rK   rW   r*   int64int32rN   rH   rI   r1   rA   r9   randrD   rl   wherer&   argminclampr   r
   r   r   rL   rM   Fpad)r.   logits_drafttokens_draftr/   r<   rO   batch
seqlen_p_1rd   seqlenr   probs_draftr   acceptedaccepted_allfirst_rejected_idx
probs_diffresample_probsresampler   r   r   r    sample_speculative   sF   



""r      c           /   
      s
  | j \}}|dksJ d|du sJ d|r[t|dsd|_t||j|||d|
d|_|jj}||| t|ds>d|_t||j|||td|d |
d|_|jj}||| nt||d	}t||d	}d% fdd	}d&dd}t|||d}t	t
fi |}t	|||d}t	|||d}t	||||d}t	||||d}|rddlm} |d}|r|
dkrtj  tj  t }| gg }}d}d} g }!||d kr|| dd\}"}#||" ||# nt||| d }$|| |$d\}%}&| |$7 } |r |tj| |%gdd|$d dj}'t|&|'ddddf     |tj| |%gdd||$d d}(|d7 }|rQ|tj| |%gdd|$d dj})t|(|)    t|(|&|%fi |\}"}*|!|*d  |rot|" t|* ||"ddd|*d f  ||(ddd|*d f  |*d  }+||+ d |_|+dkr|jd n|j|_|rtj| |d gdd},||,|*d  d dj}-t|d |-ddddf     	 |j|d krn.|j|d kr||d ddddf dd\}"}#||" ||# nt|||j d }$||d ddddf |$d\}%}&| |$7 } |rM|tj|,|%gdd|$d dj}'t|&|'ddddf     |tj|d ddddf |%gdd||$d d}(|d7 }|r|tj|,|%gdd|$d dj})t|(|)    t|(|&|%fi |\}"}*|!|*d  |rt|" t|* ||"ddd|*d f  ||(ddd|*d f  |*d  }+| j|+7  _|+dkr|jd n|j|_|rtj|,|d gdd},||,|*d  d dj}-t|d |-ddddf     q|rL|
dkrtj  tj  tdt | d dd td|  td t|!  |  d! d"d# tj|dd}tj|dd}|rv||j}-t||-dd|d df     |dkr}tnt }.|.||d$S )'a  
    TD: WIP, for my own understanding, lightly tested. Only support batch_size == 1 for now.

    Speculative decoding, either greedy or with top-k or top-p sampling.
    If top-k = 0, don't limit the number of candidates (pure sampling).
    Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
    then top-p.
    We assume that all sequences in the same batch have the same length.

    Arguments:
        input_ids: (batch, seq_len)
        max_length: int
    Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
        sequences: (batch, max_length)
        scores: tuples of (batch, vocab_size)
    r7   z>Speculative decoding implementation only supports batch_size=1Nz@Speculative decoding implementation doesn't support eos_token_idrS   )r7      )decoding_seqlensrT   r   rU   Fc           
         s   |j dk}|r0| jd }	 tj| jd f|j tj| jd}|d d d f tj|tj| jd }nd }|r6|s@|| |||dj	}	n|| jd ksIJ |j
| ||j d d | d f }	 d urg|	dd  f S |	S )Nr   r7   TrV   rX   .)r   rK   r*   r\   r   rD   r   rJ   r]   r.   rS   r^   )
r_   rZ   rc   r[   rb   r`   r   cache_seqlensrY   r.   rd   r   r    re   P  s<   




	z&decode_speculative.<locals>.get_logitsc                 S   s   |dksJ | gg }}t |D ]*}|||d |dddf  | j|d jd 7  _|||d d qtj|dd ddtj|ddfS )a  Sample `num_tokens` tokens from the model, given the previous logits.
        Also return the logits of the sampled tokens.
        Arguments:
            input_ids: (batch, seqlen)
        Return:
            tokens: (batch, num_tokens)
            scores: (batch, num_tokens), which contains @previous_logits and the logits of the next
                (num_tokens - 1) tokens. The logits of the last token isn't computed.
        r7   r)   Nr5   )ranger}   r   rK   rg   r*   r   stack)r_   get_logits_fnrZ   	sample_fn
num_tokensr   r   ir   r   r    rk   u  s   
 $z)decode_speculative.<locals>.sample_tokensrf   )rc   rb   )r   r   rZ   r   )AutoTokenizergpt2)r   r5   )r[   r)   Trr   i  rs   rt   zNumber of calls to main model: zAcceptance rate: d   z.2f%ru   )r7   Fr7   )!rK   rv   rS   rw   rZ   r!   r   r   r(   r   rR   transformersr   from_pretrainedr*   rz   r{   rx   r~   timer}   rH   r   r.   r   absmaxr   itemr   sumr   r   )/r_   rc   model_draftro   speculative_lookaheadr/   r<   rO   rn   rd   rT   rb   rq   debugra   r   inference_params_draftrZ   re   rk   sampling_kwargsr   get_logits_mainget_logits_draftsample_tokens_mainsample_tokens_draftr   	tokenizerr   r   r   num_main_model_callsnum_draft_tokensnum_accepted_tokens_historyr   
scores_newn_spec_tokensr   scores_draftscores_draft_refr.   
logits_refnum_generated_tokensnum_generatedcur_ids
scores_refr   r   r   r    decode_speculative  sn  
!


	
%



$


($

	
$$


(D


"
(r   c                   @   s*   e Zd Zd
ddZ					ddd	ZdS )GenerationMixinNc                 K   s   t N)NotImplementedError)r   ra   r   rW   kwargsr   r   r    allocate_inference_cache7  s   z(GenerationMixin.allocate_inference_cacher7   r2   r3   Fc           
      K   s4   t || |f|||d|}	|sd |	_|r|	S |	jS )Nrf   )r   r   r   )
r   r_   ro   r/   r<   rO   return_dict_in_generateoutput_scoresr   outputr   r   r    generate:  s   zGenerationMixin.generater   )r7   r2   r3   FF)r"   r#   r$   r   r   r   r   r   r    r   6  s    
r   layersc                    sN   t jt jt jfv sJ | |d||ft|trt|} fdd|D S )Nr   c                    s   i | ]}|t j d qS ))rD   rW   )r*   empty).0r   rD   rW   kv_cache_shaper   r    
<dictcomp>Z  s    z,allocate_inference_cache.<locals>.<dictcomp>)r*   float16bfloat16float32
isinstancer&   r   )r   r   nheadsheaddimr   rD   rW   r   r   r    r   M  s
   	
r   c                   @   sd   e Zd ZU dZeed< dZeed< dZdZe	e
dZe
ed< dZdZee ed< dZee ed< dS )	DecodingCGCacher   r   r   Nr   	callablesrZ   r^   )r"   r#   r$   r   r&   r'   r   rD   rW   r   r(   r   mempoolrZ   r   r   r^   r   r   r   r   r    r   ]  s   
 r   r   r   c	              
      sv   d u rt   tt|  }	|	j}
|d u r|	j}|
|f j jfks-| jks-| jkri  _d  _	d  _
t  |
| _ _|| _ _t| drU| |||}nt| jd| jj| jj }t||| jj| || jj|
|}tj|f|tj|
d}t|||||d _
tjj  _	|D ]}||f jvrt|  j
||| j	|d j||f< q fdd}| _d j
_ S )	Nr   head_dimrV   )r   r   r   r   r   )decoding_seqlenr   	n_warmupsc                    s(   | j d d \}} j||f | ||S )Nr   )rK   r   )r_   rY   r   ra   r   cacher   r    dispatch  s   z$update_graph_cache.<locals>.dispatchr   )r   nextiter
parametersrD   rW   r   r   r   r   rZ   gccollectrv   r   getattrconfighidden_sizenum_attention_headsnum_hidden_layersr*   r\   r   r   rx   graphsgraph_pool_handlecapture_graphr^   r   )rc   r   ra   r   r   r   rT   rW   r   param_examplerD   	inf_cacher   r   r   r   r   r   r    rw   i  sp   



	
rw   c                    s^  t t|  j}tj||fdtj|dtj||fdtj|dj}|| _jjd d < tj	
 }	|	tj	  tj	|	& t|D ]}
| |djqI|	  tj rctj  W d    n1 smw   Y  tj	 |	 tj	  tj	j |d | |djW d    n1 sw   Y   fdd}|_|S )Nr   rV   rX   )poolc                    s2   |j d d < |  |     S r   )r   copy_replayrN   )new_input_idsnew_position_idsr   graphrZ   r_   r.   rY   r   r    r^     s
   

zcapture_graph.<locals>.run)r   r   r   rD   r*   r\   r]   r   r   rx   Streamwait_streamcurrent_streamstreamr   r.   r~   rz   is_initializedr{   	CUDAGraphr  )rc   rZ   ra   r   r   r   r   rD   seqlen_offset_ogs_r^   r   r  r    r     sL   




r   )r7   r2   r3   )	r7   r2   r3   NNNr7   FF)
r   r7   r2   r3   NNr7   FFF)r   r7   Nr   )r7   Nr   ).r   r   collectionsr   dataclassesr   r   	functoolsr   typingr   r   r   r	   r*   torch.nn.functionalnn
functionalr   einopsr
   r   r   torch.profilerr   r   r   transformers.generationr   r   ImportErrorr   r1   rA   rR   inference_moder   r   r   r   r   r&   r   r   rw   r   r   r   r   r    <module>   s~   

n;  +

L