o
    }oiF                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	  m
Z d dlmZ d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ eG d
d deZdedefddZG dd deeZG dd dZdddZ dd Z!e Z"dedefddZ#G dd deeZ$dS )    N)	dataclass)AnyListOptional)	rearrange)DefaultTaskEncoderSample
SkipSample)	stateless)Cookerbasic_sample_keys)IOMixin)first_fit_decreasingc                   @   s  e Zd ZU dZejed< ejed< ejed< ejed< dZeej ed< dZ	eej ed< dZ
eej ed	< dZeej ed
< dZeej ed< dZeej ed< dZeej ed< dZeej ed< defddZdedefddZdedefddZdedefddZdS )DiffusionSamplea  
    Data class representing a sample for diffusion tasks.

    Attributes:
        video (torch.Tensor): Video latents (C T H W).
        t5_text_embeddings (torch.Tensor): Text embeddings (S D).
        t5_text_mask (torch.Tensor): Mask for text embeddings.
        loss_mask (torch.Tensor): Mask indicating valid positions for loss computation.
        image_size (Optional[torch.Tensor]): Tensor containing image dimensions.
        fps (Optional[torch.Tensor]): Frame rate of the video.
        num_frames (Optional[torch.Tensor]): Number of frames in the video.
        padding_mask (Optional[torch.Tensor]): Mask indicating padding positions.
        seq_len_q (Optional[torch.Tensor]): Sequence length for query embeddings.
        seq_len_kv (Optional[torch.Tensor]): Sequence length for key/value embeddings.
        pos_ids (Optional[torch.Tensor]): Positional IDs.
        latent_shape (Optional[torch.Tensor]): Shape of the latent tensor.
    videot5_text_embeddingst5_text_mask	loss_maskN
image_sizefps
num_framespadding_mask	seq_len_q
seq_len_kvpos_idslatent_shapereturnc                 C   s8   t | j| j| j| j| j| j| j| j| j	| j
| j| jdS )z$Converts the sample to a dictionary.)r   r   r   r   r   r   r   r   r   r   r   r   )dictr   r   r   r   r   r   r   r   r   r   r   r   self r    i/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/data/diffusion_taskencoder.pyto_dict?   s   zDiffusionSample.to_dictotherc                 C   s:   t |tr| j |j  S t |tr| j | S t)zGAdds the sequence length of this sample with another sample or integer.
isinstancer   r   itemintNotImplementedErrorr   r#   r    r    r!   __add__P   s
   

zDiffusionSample.__add__c                 C   s   t |tr| j | S t)z3Handles reverse addition for summing with integers.)r%   r'   r   r&   r(   r)   r    r    r!   __radd__Z   s   
zDiffusionSample.__radd__c                 C   s:   t |tr| j |j k S t |tr| j |k S t)zFCompares this sample's sequence length with another sample or integer.r$   r)   r    r    r!   __lt__b   s
   

zDiffusionSample.__lt__)__name__
__module____qualname____doc__torchTensor__annotations__r   r   r   r   r   r   r   r   r   r   r"   r   r'   r*   r+   boolr,   r    r    r    r!   r      s$   
 




r   sampler   c                 C   *   t di t| | d | d | d dS )a2  
    Processes a raw sample dictionary from energon dataset and returns a new dictionary with specific keys.

    Args:
        sample (dict): The input dictionary containing the raw sample data.

    Returns:
        dict: A new dictionary containing the processed sample data with the following keys:
            - All keys from the result of `basic_sample_keys(sample)`
            - 'json': The contains meta data like resolution, aspect ratio, fps, etc.
            - 'pth': contains video latent tensor
            - 'pickle': contains text embeddings
    z.jsonz.pthz.pickle)jsonpthpickleNr    r   r   r5   r    r    r!   cookk      
r<   c                       s   e Zd ZdZeegZddddddddded	ed
ededededef fddZ	e
dddedefddZdee deee  fddZe
dee defddZe
dee def fddZ  ZS )BasicDiffusionTaskEncoderac  
    BasicDiffusionTaskEncoder is a class that encodes image/video samples for diffusion tasks.
    Attributes:
        cookers (list): A list of Cooker objects used for processing.
        max_frames (int, optional): The maximum number of frames to consider from the video. Defaults to None.
        text_embedding_padding_size (int): The padding size for text embeddings. Defaults to 512.
    Methods:
        __init__(*args, max_frames=None, text_embedding_padding_size=512, **kwargs):
            Initializes the BasicDiffusionTaskEncoder with optional maximum frames and text embedding padding size.
        encode_sample(sample: dict) -> dict:
            Encodes a given sample dictionary containing video and text data.
            Args:
                sample (dict): A dictionary containing 'pth' for video latent and 'json' for additional info.
            Returns:
                dict: A dictionary containing encoded video, text embeddings, text mask, and loss mask.
            Raises:
                SkipSample: If the video latent contains NaNs, Infs, or is not divisible by the tensor parallel size.
    Ni         g        )
max_framestext_embedding_padding_size
seq_lengthmax_seq_lengthpatch_spatialpatch_temporalaesthetic_scorerA   rB   rC   rD   rE   rF   rG   c          
         s@   t  j|i |	 || _|| _|| _|| _|| _|| _|| _d S N)	super__init__rA   rB   rC   rD   rE   rF   rG   )
r   rA   rB   rC   rD   rE   rF   rG   argskwargs	__class__r    r!   rJ      s   
z"BasicDiffusionTaskEncoder.__init__T)restore_seedsr5   r   c                 C   sL  |d }t | st | rt t t |dkr"t |d }|d | jk r0t |j\}}}}|jd |jd  |jd  | j	d  | j
 }|d	k}	| jd
ur_|| jkr_t | jd
url|| jkrlt | jd
ur|d
d
d
| jd
d
d
d
f }t|d| j	| j	| j
d}|	rt |d t j}
nt |d d t j}
|
jd }|| jkr|
d
| j }
nt|
ddd| j| f}
t j|t jd}|	r|d |d }}t jdgd	 t jd}t jd	gd	 t jd}n#|d |d }}t j|d gd	 t jd}t j|d gd	 t jd}t j||||ggd	 t jd}ttj|| j
 || j	 || j	 dd}| jd
urn| jd
u rnt|ddd| j| f}t j| jt jd}d	|d
|< t|ddd| j| f}nt j|t jd}t|d |d d
|d ||
|||||t j|t jdt j| jt jd|t j||||gt jddS )z.
        Encodes video / text sample.
        r8   g     @@r7   rG   r?   r@   Nz.C (T pt) (H ph) (W pw) -> (T H W) (ph pw pt C))phpwptr9   r   )dtypeimage_heightimage_width   heightwidth	framerater   )thwzT H W d -> (T H W) d__key____restore_key____subflavors__)r`   ra   __subflavor__rb   r   r   r   r   r   r   r   r   r   r   r   )r1   isnananyisinfr	   maxabsrG   shaperE   rF   rC   rD   rA   r   
from_numpytobfloat16rB   Fpadonestensor	pos_id_3dget_pos_id_3dzerosr   int32)r   r5   video_latentinfoCTHWseq_lenis_imager   t5_text_embeddings_seq_lengthr   r^   r_   r   r   r   r   r   r    r    r!   encode_sample   s   
"

	 z'BasicDiffusionTaskEncoder.encode_samplesamplesc                 C   s   t || j}t| |S )zK
        Selects sequences to pack for mixed image-video training.
        )r   rD   randomshuffle)r   r   resultsr    r    r!   select_samples_to_pack  s   
z0BasicDiffusionTaskEncoder.select_samples_to_packc                    s    fdd} fdd}t dd  D | j}t dd  D | j}t dd  D | j}td	d
d  D dd d j||d|d||d|d||ddS )z@Construct a new Diffusion sample by concatenating the sequences.c                       t j fddD ddS )Nc                       g | ]}t | qS r    getattr.0r5   attrr    r!   
<listcomp>"      zRBasicDiffusionTaskEncoder.pack_selected_samples.<locals>.stack.<locals>.<listcomp>r   dim)r1   stackr   r   r   r!   r   !     z>BasicDiffusionTaskEncoder.pack_selected_samples.<locals>.stackc                    r   )Nc                    r   r    r   r   r   r    r!   r   %  r   zPBasicDiffusionTaskEncoder.pack_selected_samples.<locals>.cat.<locals>.<listcomp>r   r   )r1   catr   r   r   r!   r   $  r   z<BasicDiffusionTaskEncoder.pack_selected_samples.<locals>.catc                 S      g | ]}|j qS r    )r   r   ir    r    r!   r   '      zCBasicDiffusionTaskEncoder.pack_selected_samples.<locals>.<listcomp>c                 S   r   r    )r   r   r    r    r!   r   (  r   c                 S   r   r    )r   r   r    r    r!   r   )  r   ,c                 S   r   r    )r`   )r   sr    r    r!   r   ,  r   r    Nr   r   r   r   r   r   )r`   ra   rc   rb   r   r   r   r   r   r   r   r   )
concat_padrD   r   joinrb   )r   r   r   r   r   r   r   r    r   r!   pack_selected_samples  s&   z/BasicDiffusionTaskEncoder.pack_selected_samplesc              
      sh   | j du rt | S |d }t|jd|jd|jd|j	d|j
|j|jd|jdS )z&Return dictionary with data for batch.Nr   )r   r   r   r   r   r   r   r   )rD   rI   batchr"   r   r   
unsqueeze_r   r   r   r   r   r   r   )r   r   r5   rM   r    r!   r   =  s   





zBasicDiffusionTaskEncoder.batch)r-   r.   r/   r0   r   r<   cookersr'   floatrJ   r
   r   r~   r   r   r   r   r   __classcell__r    r    rM   r!   r>      sB    	f$r>   c                   @   s2   e Zd ZdZddddddZdd Zd	d
 ZdS )PosID3Dz
    Generates 3D positional IDs for video data.

    Attributes:
        max_t (int): Maximum number of time frames.
        max_h (int): Maximum height dimension.
        max_w (int): Maximum width dimension.
           )max_tmax_hmax_wc                C   s   || _ || _|| _|   d S rH   )r   r   r   generate_pos_id)r   r   r   r   r    r    r!   rJ   `  s   zPosID3D.__init__c              	   C   sB   t jt t j| jddt j| jddt j| jdddd| _dS )zDGenerates a grid of positional IDs based on max_t, max_h, and max_w.cpu)devicerP   r   N)r1   r   meshgridaranger   r   r   gridr   r    r    r!   r   f  s   zPosID3D.generate_pos_idc                C   sl   || j ks|| jks|| jkr(t| j || _ t| j|| _t| j|| _|   | jd|d|d|f S )z2Retrieves positional IDs for specified dimensions.N)r   r   r   rg   r   r   )r   r]   r^   r_   r    r    r!   rr   q  s   zPosID3D.get_pos_id_3dN)r-   r.   r/   r0   rJ   r   rr   r    r    r    r!   r   V  s
    	r   c                 C   sp   |dkr| S |  d}|||  | }|dkr| S t| j}|d  |7  < tj|| j| jd}| |d|< |S )a  
    Pads the input tensor to make its size divisible by a specified value.

    Args:
        x (torch.Tensor): Input tensor.
        padding_value (int): The value to make the tensor size divisible by.

    Returns:
        torch.Tensor: Padded tensor.
    r   rV   r   N)sizelistri   r1   rs   rV   r   )xpadding_valuenpadding_needed	new_shapex_paddedr    r    r!   pad_divisible{  s   

r   c           
      C   sz   ddl }| d jdd }| d j}| d j}|j|g|R ||d}d}| D ]}|jd }	|||||	 < ||	7 }q'|S )a  
    Efficiently concatenates a list of tensors along the first dimension and pads with zeros
    to reach max_seq_length.

    Args:
        tensor_list (list of torch.Tensor): List of tensors to concatenate and pad.
        max_seq_length (int): The desired size of the first dimension of the output tensor.

    Returns:
        torch.Tensor: A tensor of shape [max_seq_length, ...], where ... represents the remaining dimensions.
    r   Nr@   r   )r1   ri   rV   r   rs   )
tensor_listrD   r1   other_shaperV   r   resultcurrent_indexrp   lengthr    r    r!   r     s   



r   c                 C   r6   )a  
    Processes a raw sample dictionary from energon dataset and returns a new dictionary with specific keys.

    Args:
        sample (dict): The input dictionary containing the raw sample data.

    Returns:
        dict: A new dictionary containing the processed sample data with the following keys:
            - All keys from the result of `basic_sample_keys(sample)`
            - 'jpg': original images
            - 'png': contains control images
            - 'txt': contains raw text
    jpgpngtxt)imageshintr   Nr    r:   r;   r    r    r!   cook_raw_images  r=   r   c                   @   s   e Zd ZdZeegZdS )RawImageDiffusionTaskEncoderzC
    Dummy task encoder takes raw image input on CrudeDataset.
    N)r-   r.   r/   r0   r   r   r   r    r    r    r!   r     s    r   )r   )%r   dataclassesr   typingr   r   r   r1   torch.nn.functionalnn
functionalrm   einopsr   megatron.energonr   r   r	   "megatron.energon.task_encoder.baser
   %megatron.energon.task_encoder.cookingr   r   nemo.lightning.io.mixinr   !nemo.utils.sequence_packing_utilsr   r   r   r<   r>   r   r   r   rq   r   r   r    r    r    r!   <module>   s,   L V
%" 