from functools import partial
from typing import Any, List, Union

import torch
from torch.utils.data import Dataset, default_collate

from nemo.collections.multimodal.data.clip.augmentations.augmentations import image_transform
from nemo.collections.multimodal.data.clip.imagenet_zeroshot_data import imagenet_classnames, openai_imagenet_template
from nemo.collections.multimodal.data.common.webdataset import WebDatasetCommon
from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import MegatronPretrainingSampler
from nemo.collections.vision.data.megatron.image_folder import ImageFolder

try:
    from megatron.core import parallel_state

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    HAVE_MEGATRON_CORE = False


def tokenize(texts: Union[str, List[str]], tokenizer: Any, context_length: int = 77) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize
    tokenizer:
        Tokenizer loaded in NeMo
    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
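
    Examples
    --------
    Illustrative sketch only; the exact token ids depend on the NeMo tokenizer in use::

        tokens = tokenize(["a photo of a cat", "a photo of a dog"], tokenizer)
        tokens.shape  # torch.Size([2, 77]) with the default context_length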
    """
    texts_is_str = False
    if isinstance(texts, str):
        texts = [texts]
        texts_is_str = True

    bos_id = tokenizer.bos_id
    eos_id = tokenizer.eos_id
    pad_id = tokenizer.pad_id
    all_tokens = [([bos_id] if bos_id is not None else []) + tokenizer.text_to_ids(text) + [eos_id] for text in texts]

    result = torch.ones(len(all_tokens), context_length, dtype=torch.long) * pad_id
    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            tokens = tokens[:context_length]  # Truncate
            tokens[-1] = eos_id
        result[i, : len(tokens)] = torch.tensor(tokens)

    if texts_is_str:
        result = result[0]
    return result


def get_preprocess_fns_params(
    img_h, img_w, img_mean=None, img_std=None, is_train=True, max_position_embedding=None, tokenizer=None,
):
    # Build image/text preprocessing functions from explicit parameters.
    img_size = (img_h, img_w)
    img_transform = image_transform(img_size, is_train=is_train, mean=img_mean, std=img_std,)
    text_transform = lambda x: x
    if tokenizer is not None:
        text_transform = partial(tokenize, tokenizer=tokenizer, context_length=max_position_embedding,)

    return img_transform, text_transform


def get_preprocess_fns(model_cfg, tokenizer=None, is_train=True):
    # Build image/text preprocessing functions from the model config.
    img_size = (model_cfg.vision.get("img_h"), model_cfg.vision.get("img_w"))
    img_mean = model_cfg.vision.get("img_mean")
    img_std = model_cfg.vision.get("img_std")
    img_transform = image_transform(img_size, is_train=is_train, mean=img_mean, std=img_std,)
    text_transform = lambda x: x
    if tokenizer is not None:
        text_transform = partial(
            tokenize, tokenizer=tokenizer, context_length=model_cfg.text.get("max_position_embeddings"),
        )

    return img_transform, text_transform


def tuple_to_dict(inp):
    # Map (image, caption) tuples from the webdataset pipeline to dicts.
    for input in inp:
        out_dict = dict()
        out_dict['images'] = input[0]
        out_dict['captions'] = input[1]
        yield out_dict


def transform_fn(sample, img_transform, text_transform):
    image, text = sample["jpg"], sample["txt"]
    img_transformed, text_transformed = img_transform(image), text_transform(text)
    return img_transformed, text_transformed


def build_train_valid_datasets(
    model_cfg, consumed_samples, tokenizer=None,
):
    data_cfg = model_cfg.data

    train_img_transform, text_transform = get_preprocess_fns(model_cfg, tokenizer, is_train=True)
    train_data = WebDatasetCommon(
        dataset_cfg=data_cfg,
        consumed_samples=consumed_samples,
        map_fn=partial(transform_fn, img_transform=train_img_transform, text_transform=text_transform),
        compose_fn=tuple_to_dict,
        is_train=True,
    )

    val_data = None
    if data_cfg.get("validation") is not None and data_cfg.validation.get("dataset_path"):
        val_img_transform, text_transform = get_preprocess_fns(model_cfg, tokenizer, is_train=False)
        val_data = WebDatasetCommon(
            dataset_cfg=data_cfg,
            consumed_samples=0,
            map_fn=partial(transform_fn, img_transform=val_img_transform, text_transform=text_transform),
            compose_fn=tuple_to_dict,
            is_train=False,
        )

    return train_data, val_data


def custom_collate(batch):
    if len(batch) == 0:
        return None, None
    else:
        return default_collate(batch)


def build_imagenet_validation_dataloader_params(
    imagenet_val,
    img_h,
    img_w,
    mbs,
    gbs,
    num_workers=2,
    pin_memory=True,
    img_mean=None,
    img_std=None,
    is_train=False,
    max_position_embedding=None,
    tokenizer=None,
):
    # Parameterized variant of build_imagenet_validation_dataloader (no model config needed).
    val_image_transform, text_transform = get_preprocess_fns_params(
        img_h,
        img_w,
        img_mean,
        img_std,
        is_train=is_train,
        max_position_embedding=max_position_embedding,
        tokenizer=tokenizer,
    )

    imagenet_val_data = {}
    imagenet_path = imagenet_val
    if imagenet_path is None:
        return None

    image_dataset = ImageFolder(root=imagenet_path, transform=val_image_transform,)
    image_batch_sampler = MegatronPretrainingSampler(
        total_samples=len(image_dataset),
        consumed_samples=0,
        micro_batch_size=mbs,
        global_batch_size=gbs,
        data_parallel_rank=parallel_state.get_data_parallel_rank(),
        data_parallel_size=parallel_state.get_data_parallel_world_size(),
        drop_last=False,
    )
    imagenet_val_data['images'] = torch.utils.data.DataLoader(
        image_dataset,
        batch_sampler=image_batch_sampler,
        num_workers=num_workers,
        collate_fn=custom_collate,
        pin_memory=pin_memory,
        persistent_workers=True,
    )

    text_dataset = ImagenetClassnameDataset(imagenet_classnames, openai_imagenet_template, text_transform)
    imagenet_val_data['texts'] = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=text_dataset.num_templates,
        num_workers=0,
        pin_memory=True,
        persistent_workers=False,
        drop_last=False,
    )

    return imagenet_val_data


def build_imagenet_validation_dataloader(model_cfg, tokenizer=None):
    """Build dataloaders"""
    val_image_transform, text_transform = get_preprocess_fns(model_cfg, tokenizer, is_train=False)
    data_cfg = model_cfg.data

    imagenet_val = {}
    imagenet_path = data_cfg.get("imagenet_val")
    if imagenet_path is None:
        return None

    image_dataset = ImageFolder(root=imagenet_path, transform=val_image_transform,)
    image_batch_sampler = MegatronPretrainingSampler(
        total_samples=len(image_dataset),
        consumed_samples=0,
        micro_batch_size=model_cfg.micro_batch_size,
        global_batch_size=model_cfg.global_batch_size,
        data_parallel_rank=parallel_state.get_data_parallel_rank(),
        data_parallel_size=parallel_state.get_data_parallel_world_size(),
        drop_last=False,
    )
    imagenet_val['images'] = torch.utils.data.DataLoader(
        image_dataset,
        batch_sampler=image_batch_sampler,
        num_workers=min(data_cfg.num_workers, 2),  # cap workers for the zero-shot eval loader
        collate_fn=custom_collate,
        pin_memory=True,
        persistent_workers=True,
    )

    text_dataset = ImagenetClassnameDataset(imagenet_classnames, openai_imagenet_template, text_transform)
    imagenet_val['texts'] = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=text_dataset.num_templates,
        num_workers=0,
        pin_memory=True,
        persistent_workers=False,
        drop_last=False,
    )

    return imagenet_val


class ImagenetClassnameDataset(Dataset):
    """Imagenet class dataset"""

    def __init__(self, classnames, templates, text_transform):
        self.num_templates = len(templates)
        self.samples = []
        for classname in classnames:
            texts = [template(classname) for template in templates]
            self.samples.extend(text_transform(texts))

    def __getitem__(self, index):
        return self.samples[index]

    def __len__(self):
        return len(self.samples)