o
    wi<                     @   s|   d Z ddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 G dd dejjjZG dd dejjjZd	d
 ZdS )zBlendable dataset.    N)logging)AppStatec                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )BlendableDataset c           	   	   C   s  || _ t|}|t|ksJ || _tj|tjd}t|}|dks%J || }t }|dk s3J tj| jtj	d| _
tj| jtjd| _t }z|jdkrSt  tj  ddlm} W n tyj   tdw || j
| j||| jtj dk tdt |  d S )Ndtype           r   )helperszhCould not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.zC> elapsed time for building blendable dataset indices: {:.2f} (sec))datasetslensizenparrayfloat64sumtimezerosuint8dataset_indexint64dataset_sample_indexr   
local_rankcompile_helpertorchdistributedbarriernemo.collections.common.datar
   ImportErrorbuild_blending_indicesget_rankr   infoformat)	selfr   weightsr   num_datasetssum_weights
start_time	app_stater
    r)   k/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/data/blendable_dataset.py__init__   sD   


zBlendableDataset.__init__c                 C      | j S Nr   r#   r)   r)   r*   __len__J      zBlendableDataset.__len__c                 C   sp   | j | }| j| }t| j| }||kr1td| d| d || }td| d| d | j| | S )NzIndex z out of bounds for dataset z. Reusing existing examples.zReusing index z for dataset .)r   r   r   r   r   warning)r#   idxdataset_idx
sample_idxdataset_sizer)   r)   r*   __getitem__M   s   

zBlendableDataset.__getitem__c                 C   s   | j D ]}|  qdS )r   N)r   create_data_mmap)r#   datasetr)   r)   r*   r9   Y   s   

z!BlendableDataset.create_data_mmapN)__name__
__module____qualname____doc__r+   r0   r8   r9   r)   r)   r)   r*   r      s    +r   c                   @   s>   e Zd ZdZdddZdd Zdd Zd	d
 Zedd Z	dS )MemoryEfficientBlendableDatasetz
    A BlendableDataset implementation that uses less memory than the original implementation.
    Indices are computed algorithmically instead of storing them in memory.

    To test call: MemoryEfficientBlendableDataset.test_index_blending()
    d   c                    sx  | _ t|}|t|ksJ t||}| _| _tj|tjd}|dk s*J t	|}|dks5J ||  _
g }g }t j
D ]\}	}
t|
| }||	g|  |t| qC|t| }||	g|  |t|d |d |  tj|tjd _tj fddt|D tjd _ jdk sJ d j tj|tjd _tjdd |D tjd _d S )	Nr   r   c                    s   g | ]	} j |k qS r)   )ds_indexr   .0ir/   r)   r*   
<listcomp>   s    z<MemoryEfficientBlendableDataset.__init__.<locals>.<listcomp>r   zvSome datasets have no samples in the blendable dataset, increase weight_bins or the offending weight. ds_index_size = c                 S   s   g | ]}t |qS r)   )r   )rD   dsr)   r)   r*   rF      s    )r   r   minr   weight_binsr   r   r   allr   r$   	enumerateintextendrangeuint32rB   ds_index_sizeds_biasds_size)r#   r   r$   r   rI   r%   r&   rB   rQ   rE   wnr)   r/   r*   r+   g   s:   


$ z(MemoryEfficientBlendableDataset.__init__c                 C   sD   || j  }| j| }| j| || j  | j|   | j|  }||fS )z_Returns ds index and sample index (within the ds) for the given index in the blendable dataset.)rI   rB   rQ   rP   rR   )r#   r4   binds_idxr6   r)   r)   r*   get_ds_sample_idx   s   

 z1MemoryEfficientBlendableDataset.get_ds_sample_idxc                 C   r,   r-   r.   r/   r)   r)   r*   r0      r1   z'MemoryEfficientBlendableDataset.__len__c                 C   s   |  |\}}| j| | S r-   )rW   r   )r#   r4   rV   r6   r)   r)   r*   r8      s   z+MemoryEfficientBlendableDataset.__getitem__c                    s   ddl m} |  G dd dtjjj}dD ]U}t|dd|dd|dd	gg d
d|d  fddtdD }t	t
| d }t	t
| d }|  |j|dd |j|dd |  |  |d|  qdS )z$Visualize indices of blended datasetr   Nc                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	z?MemoryEfficientBlendableDataset.test_index_blending.<locals>.DSr   c                 S   s   || _ || _d S r-   )r   data)r#   r   rX   r)   r)   r*   r+      s   
zHMemoryEfficientBlendableDataset.test_index_blending.<locals>.DS.__init__c                 S   r,   r-   r.   r/   r)   r)   r*   r0      r1   zGMemoryEfficientBlendableDataset.test_index_blending.<locals>.DS.__len__c                 S   s
   | j | S r-   )rX   )r#   r4   r)   r)   r*   r8      s   
zKMemoryEfficientBlendableDataset.test_index_blending.<locals>.DS.__getitem__N)r;   r<   r=   r>   r+   r0   r8   r)   r)   r)   r*   DS   s
    rY   )
   r@   rZ   abc)g      ?g333333?g?2   )rI   c                    s   g | ]}  |qS r)   )rW   rC   blend_dsr)   r*   rF      s    zGMemoryEfficientBlendableDataset.test_index_blending.<locals>.<listcomp>   zds idx)labelsamplezweight_bins=)matplotlib.pyplotpyplotionr   utilsrX   Datasetr?   rN   listzipfigureplotlegendgridtitle)clspltrY   rI   ds_sample_idx_listds_listsample_listr)   r_   r*   test_index_blending   s"   $z3MemoryEfficientBlendableDataset.test_index_blendingN)r@   )
r;   r<   r=   r>   r+   rW   r0   r8   classmethodru   r)   r)   r)   r*   r?   _   s    
(r?   c                  C   sR   t jt jt} tdd| g}|jdkr't	d ddl
}|d dS dS )zVCompile helper function ar runtime. Make sure this
    is invoked on a single process.makez-Cr   z2Making C++ dataset helpers module failed, exiting.Nra   )ospathabspathdirname__file__
subprocessrun
returncoder   errorsysexit)ry   retr   r)   r)   r*   r      s   

r   )r>   rx   r}   r   numpyr   r   
nemo.utilsr   nemo.utils.app_stater   rg   rX   rh   r   r?   r   r)   r)   r)   r*   <module>   s   Ci