o
    }oi                     @   s`   d dl Z d dlZd dlZd dlmZmZ d dlmZ dd Zdd Z	dd	e
fd
dZdd ZdS )    N)llmvlm)!has_dist_env_init_or_rank_env_varc              
      s@   t jfdd tj| d|| fddddt  dS )	z
    FineWeb-Edu dataset
    c              	      s   dd | D }||ddd}|d   d d dd f }tj|dt|d d d df  gdd	}d|t| < ||d
< |S )Nc                 S   s   g | ]}|d  qS )text ).0exampler   r   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/hf/data/automodel_datasets.py
<listcomp>    s    zEmk_hf_vlm_dataset_fineweb_edu.<locals>.collate_fn.<locals>.<listcomp>Tpt)r   paddingreturn_tensors	input_ids   dimlabels)clonetorchcat	ones_likeisin)examples	processortextsbatchr   skipped_tokensr   r	   
collate_fn   s   ,z1mk_hf_vlm_dataset_fineweb_edu.<locals>.collate_fntrainc                        | dS N)r   r   xr   r   r   r	   <lambda>6       z/mk_hf_vlm_dataset_fineweb_edu.<locals>.<lambda>   T)splitmicro_batch_sizeglobal_batch_sizer   num_workerspersistent_workers	streaming)r   HFAutoModelForImageTextToTextextract_skipped_token_idsr   HFDatasetDataModuler   	data_pathr   mbsgbsr   r   r   r   r	   mk_hf_vlm_dataset_fineweb_edu   s   r7   c              	      s:   t jfdd t j| d|| fdddddS )	z
    RDR dataset
    c              	      s   dd }g }g }t || D ]}||j|d ddd ||d 7 }q|||ddd	}|d
 tj|d
< |d  d d dd f }tj|dt|d d d df  gdd}d|t	| < ||d< |S )Nc                 S   sN   d}dd|dd| d dgddd| d dgdg}|| d  d	gd
S )Nz$Describe accurately the given image.userr   typer   image)r:   r;   rolecontent	assistantRGB)conversationimages)convert)sampleinstructionrA   r   r   r	   fmtD   s   z6mk_hf_vlm_dataset_rdr.<locals>.collate_fn.<locals>.fmtrA   F)tokenizeadd_generation_promptrB   Tr   )r   rB   r   r   pixel_valuesr   r   r   r   r   )
mapappendapply_chat_templatetor   bfloat16r   r   r   r   )r   r   rF   r   rB   r   r   r   r   r   r	   r   C   s0   ,z)mk_hf_vlm_dataset_rdr.<locals>.collate_fnr    c                    r!   r"   r   r#   r%   r   r	   r&   q   r'   z'mk_hf_vlm_dataset_rdr.<locals>.<lambda>r(   T)r)   r*   r+   r   r,   r-   r   r/   r0   r1   r2   r   r6   r	   mk_hf_vlm_dataset_rdr=   s   )rP   Tsort_json_keyc                    s   t | tkrAt| dkrd| v r| d S d} r!t|  dd}n|  }|D ]}|d| dt| |   d| d 7 }q'|S t | tkrSd	 fd
d| D S t| } | S )z>
    Convert an ordered JSON object into a token sequence
    r   text_sequence T)reversez<s_>z</s_z<sep/>c                    s   g | ]}t | qS r   
json2token)r   itemrQ   r   r	   r
      s    zjson2token.<locals>.<listcomp>)	r:   dictlensortedkeysrW   listjoinstr)objrQ   outputr]   kr   rY   r	   rW   w   s   ,rW   c              
      s:   t j fddt j| d||dd fdddS )	z
    CORD-V2 dataset
    c              	      s|  g }| D ]@}t |d }d|v rt|d tsJ |d }nd|v r*t|d ts,J |d g}tdd |D }||d |f q|} g }g }| D ]+}|\}	}||	g dddid	d
dgddd	|dgdg}
||
}|| qM|||dddd}|d 	t
j|d< |d  d d dd f }t
j|dt
|d d d df  gdd}d|t
| < ||d< |S )Nground_truth	gt_parsesgt_parsec                 S   s   g | ]}t |d dqS )TrY   rV   )r   gt_jsonr   r   r	   r
      s    zGmk_hf_vlm_dataset_cord_v2.<locals>.train_collate_fn.<locals>.<listcomp>r;   r8   r:   r   zExtract JSONr9   r<   r?   Tr   )r   rB   r   
truncationr   rI   r   r   r   r   r   )jsonloads
isinstancer^   rZ   randomchoicerK   rL   rM   r   rN   r   r   r   r   )r   r   processed_examplesr   rd   gt_jsonsr   rB   r   r;   rA   text_promptr   r   r   r   r	   train_collate_fn   sF   


,z3mk_hf_vlm_dataset_cord_v2.<locals>.train_collate_fnr    r(   Tc                    s   |  dS r"   r   r#   )r   rq   r   r	   r&      r'   z+mk_hf_vlm_dataset_cord_v2.<locals>.<lambda>)r)   r*   r+   r,   r-   r   rO   r2   r   )r   r   rq   r	   mk_hf_vlm_dataset_cord_v2   s   3rr   )T)ri   rl   r   nemo.collectionsr   r   (nemo.collections.llm.gpt.data.hf_datasetr   r7   rP   boolrW   rr   r   r   r   r	   <module>   s   %: