o
    ̳i('                     @   s   d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlm	Z
 d dlmZmZ d dlmZ d dlmZmZmZ d dlmZmZmZ d dlmZ d dlmZ G d	d
 d
eZG dd dZejdeddfddZe dkrue!e  dS dS )    N)AnyDictList)
DictConfig	OmegaConf)parallelize_module)configtrainingutils)
load_imageMessage$padded_collate_tiled_images_and_mask)sample)	Transformc                   @   s.   e Zd ZdZdeeef dee fddZ	dS )SingleTurnYAMLToMessagesa  
    Converts a single turn conversation in YAML format to a list of messages.

    Expects the YAML to look like:
        system: You are a helpful AI assistant.
        user: What is the capital of France?

    or if it includes an image:
        system: You are a helpful AI assistant.
        user:
            image: url or path_to_image
            text: Describe the image in detail.
    promptreturnc                 C   s   g }|  D ]K\}}|d u rqt|trd|dg}n-d| v r6|d }t|}d|dd|d dg}nd| v s@J dd|d dg}|t||d q|tddd |S )Ntext)typecontentimagez4Multiple entries per role expect at least a text key)roler   	assistant )items
isinstancestrkeysr   appendr   )selfr   messagesr   r   new_content	image_locr    r#   W/home/ubuntu/.local/lib/python3.10/site-packages/recipes/dev/generate_v2_distributed.py__call__'   s&   
z!SingleTurnYAMLToMessages.__call__N)
__name__
__module____qualname____doc__r   r   r   r   r   r%   r#   r#   r#   r$   r      s    "r   c                   @   s`   e Zd ZdZdeddfddZdeddfddZd	ed
eddfddZ	e
 defddZdS )InferenceRecipea  
    Recipe for generating tokens from a dense Transformer-based LLM.
    This works for text-only generation and image-text generation.

    Supports distributed inference using Tensor Paralellism(TP) for
    large models that don't fit on a single GPU. For more information
    on TP, see: https://pytorch.org/docs/stable/distributed.tensor.parallel.html.

    This *does not* currently support the following features:
        - torch.compile
        - quantization through torchao
        - batch generation
    cfgr   Nc                 C   sp   t j|jd| _tj|j| jd| _t |j	| _
tjdd t  \}}|dk| _tj|j|dd d d S )Ndevice)dtyper-   nccl)backendr   cudnn_deterministic_mode)seed
debug_mode)r
   
get_devicer-   _devicer	   	get_dtyper.   _dtype
get_logger	log_level_loggerdistinit_process_groupget_world_size_and_rank_is_rank_zeroset_seedr2   get)r   r+   _rankr#   r#   r$   __init__R   s   

zInferenceRecipe.__init__c           	   	   C   s  t |j}| }t| j# td t |j	}W d   n1 s'w   Y  W d   n1 s6w   Y  t
 }|f}t
d|}t||}t||t |jd t| j+ | j | D ]}t|drs|  qhW d   n1 s~w   Y  W d   n1 sw   Y  tj||tj | jddd || _	| jr| jd	| j d
| d t |j| _t | _dS )zSetup the model and transforms.metaNcuda)parallelize_plan	rope_initTF)modelfull_sdr-   strictcpu_offloadz%Model was initialized with precision z and TP degree .)r   instantiatecheckpointerload_checkpointr	   set_default_dtyper7   torchr-   rH   r;   get_world_sizeinit_device_meshprepare_mha_for_tpr   tensor_parallel_planr5   moduleshasattrrG   load_from_full_model_state_dict	MODEL_KEYr>   r:   info	tokenizermodel_transformr   to_messages)	r   r+   _checkpointer
_ckpt_dictrH   	tp_degreetp_mesh_shapetp_device_meshmr#   r#   r$   setup^   sN    

 zInferenceRecipe.setup
total_timetokens_per_secondc                 C   s   t dd t| j | j D }| jd|dd|dd | jd|| d dd	 | jj	d
krLt
 }| jd| d dd dS dS )zLogs the following metrics: total time for inference, tokens/sec,
        bandwidth achieved, and max memory allocated.

        Feel free to modify this function to log additional metrics.
        c                 S   s   g | ]
}|  |jj qS r#   )numelr.   itemsize).0pr#   r#   r$   
<listcomp>   s    z/InferenceRecipe.log_metrics.<locals>.<listcomp>zTime for inference: z.02fz sec total, z tokens/seczBandwidth achieved: i   @z GiB/scpuzMax memory allocated: z GiBN)sum	itertoolschainrH   
parametersbuffersr:   rZ   r5   r   r
   get_torch_device_namespacemax_memory_allocated)r   re   rf   
model_sizetorch_devicer#   r#   r$   log_metrics   s"   zInferenceRecipe.log_metricsc                 C   s  |  t|j}tdd |D }| jd|idd}t|d }||j }| j | j	j
d| j|r6| jjnd|d	 W d   n1 sEw   Y  ttj||ftj| jd
}t|}i }	|rt|gdd| jjd}	|	d ddd|f |	d< |	d| j}
ntj|d | jdd}
|dd|f |	d< |dd|f |	d< t|	| j g }t }| j	|
fi |	dddf }t||j|jd}||   |r|	d |	d ddddf |	d< t!|jD ]D}|d|f |	d< |d|dddf |	d< |  | jj"v r n$| j	|fi |	dddf }t||j|jd}||   |d7 }qt | }| j#|}| j$rK| j%&d| d t|| }| j$r^| j'||d dS dS )z9The main entry point for generating tokens from a prompt.c                 S   s   g | ]}|j qS r#   )contains_media)ri   rc   r#   r#   r$   rk      s    z,InferenceRecipe.generate.<locals>.<listcomp>r    T)	inferencetokens   N)
batch_sizer.   encoder_max_seq_lendecoder_max_seq_len)sizer.   r-   left)pad_directionpad_max_imagespad_max_tilesencoder_maskr,   r   mask	input_pos)temperaturetop_kencoder_inputz


)re   rf   )(r]   r   to_containerr   anyr\   lenmax_new_tokensr5   rH   setup_cachesr7   image_seq_lenrQ   trilonesboolaranger   max_num_tilespoptotensor	unsqueezer
   batch_to_devicetimeperf_counterr   r   r   r   itemrangestop_tokensdecoder>   r:   rZ   rv   )r   r+   r    is_multimodal_inputmodel_inputsseq_lentotal_response_lengthcausal_maskr   batchr   generated_tokenst0logitstokenitdecodedrf   r#   r#   r$   generate   s   




zInferenceRecipe.generate)r&   r'   r(   r)   r   rC   rd   intfloatrv   rQ   inference_moder   r#   r#   r#   r$   r*   C   s    1r*   r+   r   c                 C   s4   t jd| d t| d}|j| d |j| d d S )Nr*   )recipe_namer+   )r+   )r   
log_configr*   rd   r   )r+   reciper#   r#   r$   main  s   
r   __main__)"rn   sysr   typingr   r   r   rQ   torch.distributeddistributedr;   	omegaconfr   r   !torch.distributed.tensor.parallelr   	torchtuner   r	   r
   torchtune.datar   r   r   torchtune.generationr   torchtune.modules.transformsr   r   r*   parser   r&   exitr#   r#   r#   r$   <module>   s(   + B