o
    ̳iW                     @   s   d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlm	Z	 d dlm
Z
 d dlmZmZmZmZ d dlmZmZ d dlmZ edZG d	d
 d
Zejde	ddfddZedkrdee  dS dS )    N)AnyDictList)
DictConfig)nn)config
generationtrainingutils)MessageRole)FullModelTorchTuneCheckpointerDEBUGc                   @   s   e Zd ZdZdeddfddZdeddfddZd	ed
eee	f de
jfddZdeeef dee fddZe defddZdS )InferenceRecipea,  
    Recipe for generating tokens from a dense Transformer-based LLM.

    Currently this recipe supports single-GPU generation only. Speculative
    decoding is not supported.

    For more details on how to use this recipe for generation, please see our
    tutorial: https://pytorch.org/torchtune/main/tutorials/e2e_flow.html#generation

    For using this recipe with a quantized model, please the following section of
    the above tutorial:
    https://pytorch.org/torchtune/main/tutorials/e2e_flow.html#speeding-up-generation-using-quantization
    cfgreturnNc                 C   s\   t j|jd| _tj|j| jd| _t	|j
| _t| j| _tj|j|dd d d S )N)devicedtyper   cudnn_deterministic_mode)seed
debug_mode)r
   
get_devicer   _devicer	   	get_dtyper   _dtyper   instantiate	quantizer
_quantizerget_quantizer_mode_quantization_modeset_seedr   get)selfr    r$   D/home/ubuntu/.local/lib/python3.10/site-packages/recipes/generate.py__init__%   s   
zInferenceRecipe.__init__c                 C   s   t |j}| jd urt|tstdd| jv rtd| jd u r'| }n|jdd}| j|j	|t
j d| _t |j| _d S )NzQuantization is only supported for models quantized and saved with the FullModelTorchTuneCheckpointer - please ensure you have quantized your model and are using the quantized weights!qata  You have specified a quantizer with 'QAT' - QAT quantizers should only be used during quantization aware training and when quantizing models. Please use the corresponding post-training quantizer e.g. Int8DynActInt4WeightQuantizer for Int8DynActInt4WeightQATQuantizer.F)weights_only)	model_cfgmodel_state_dict)r   r   checkpointerr    
isinstancer   
ValueErrorload_checkpoint_setup_modelmodelr	   	MODEL_KEY_model	tokenizer
_tokenizer)r#   r   r+   	ckpt_dictr$   r$   r%   setup/   s$   




zInferenceRecipe.setupr)   r*   c              	   C   s   t | j  | j t|}W d    n1 sw   Y  W d    n1 s)w   Y  | jd ur[| j|}|j	| j| jd}|
 D ]\}}|	| j||< qF|j|dd n|| t j| | jd td| j d |S )N)r   r   T)assign)r   z$Model is initialized with precision .)r	   set_default_dtyper   r   r   r   r    r   quantizetoitemsload_state_dictvalidate_expected_param_dtypenamed_parametersloggerinfo)r#   r)   r*   r0   kvr$   r$   r%   r/   M   s"    


zInferenceRecipe._setup_modelpromptc                 C   sf   g }d|v r|d dur| td|d d |td|d dtdddg | jd|idd	d
 S )z
        Convert the prompt string to a user message with optional system messages
        and tokenize using the prompt template defined on the tokenizer.
        systemN)rolecontentuser	assistant messagesT)	inferencetokens)appendr   extendr4   )r#   rD   rK   r$   r$   r%   convert_prompt_to_tokensf   s   
z(InferenceRecipe.convert_prompt_to_tokensc              
   C   s  |  |j}tj|tj| jd}d }|jr7| j | jjd| j	|
 |j d W d    n1 s2w   Y  | jd urttd tjtjddd}t }tj| j|d|j|j| jj|d	}t | }td
|dd | j  t }tj| j||j| jj|j|j| jj|d\}}| }t | }t| j|d  tdd t| j  | j! D }	t"|d |#d }
|
| }td|dd|dd td|	| d dd | jj$dkrt%& }td|' d dd d S d S )Nr      )
batch_sizer   decoder_max_seq_lenz:Starting compilation to improve generation performance ...zmax-autotuneT)mode	fullgraph   )r0   rD   max_generated_tokenstemperaturetop_kstop_tokenscustom_generate_next_tokenz&Warmup run for quantized model takes: z.02fz sec)r0   rD   rW   pad_idrX   rY   rZ   r[   r   c                 S   s   g | ]
}|  |jj qS r$   )numelr   itemsize).0pr$   r$   r%   
<listcomp>   s    z,InferenceRecipe.generate.<locals>.<listcomp>zTime for inference: z sec total, z tokens/seczBandwidth achieved: g    eAz GB/scpuzMemory used: z GB)(rP   rD   torchtensorintr   enable_kv_cacher2   setup_cachesr   r]   max_new_tokensr    r@   rA   compiler   generate_next_tokentimeperf_countergeneraterX   rY   r4   rZ   reset_cachesr\   tolistdecodesum	itertoolschain
parametersbufferslensizetyper
   get_torch_device_namespacemax_memory_allocated)r#   r   rM   rD   r[   t0_tgenerated_tokens
model_sizetokens_generated
tokens_sectorch_devicer$   r$   r%   rm   z   s   
	
	


	zInferenceRecipe.generate)__name__
__module____qualname____doc__r   r&   r6   r   strr   r   Moduler/   r   r   re   rP   rc   inference_moderm   r$   r$   r$   r%   r      s$    




r   r   r   c                 C   s4   t jd| d t| d}|j| d |j| d d S )Nr   )recipe_namer   )r   )r   
log_configr   r6   rm   )r   reciper$   r$   r%   main   s   
r   __main__)rr   sysrk   typingr   r   r   rc   	omegaconfr   r   	torchtuner   r   r	   r
   torchtune.datar   r   torchtune.trainingr   
get_loggerr@   r   parser   r   exitr$   r$   r$   r%   <module>   s$   
 2