o
    }oi                  	   @   s   d dl Z d dlZ d dlmZ d dl mZ d dlmZ d dlm	Z	 z
d dl
mZ dZW n eefy7   dZY nw 		ddd	ZG d
d dejZdS )    N)distributed)
functional))average_losses_across_data_parallel_group)parallel_stateTFc                    s   t  }t  }t  }|r*tjtjj dd}tjtjjdd}||fS  fddt	|D }	fddt	|D }
t
j|	 |d t
j|
|d |sZ |	|< |
|< tj|	dd}tj|
dd}||fS )a  
    Gathers image and text features across multiple data parallel processes.

    This function is designed to work in a distributed environment where multiple
    processes are handling different portions of data. It gathers the image and text
    features from all processes to form a complete set of features across the entire dataset.
    This is crucial for calculating loss in models like CLIP, especially when the model is
    trained in a data parallel fashion.

    Parameters:
    image_features (Tensor): A tensor containing the image features.
    text_features (Tensor): A tensor containing the text features.
    local_loss (bool, optional): A flag to determine whether to use local loss calculation.
                                 Defaults to False.
    gather_with_grad (bool, optional): A flag to enable gathering with gradient computation.
                                       This is not currently working in the latest PyTorch version.
                                       Defaults to False.

    Returns:
    Tuple[Tensor, Tensor]: A tuple containing the gathered image features and text features
                           across all processes.
    r   )dimc                       g | ]}t  qS  torch
zeros_like.0_)image_featuresr   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/multimodal/losses/clip_loss.py
<listcomp>G       z#gather_features.<locals>.<listcomp>c                    r   r   r	   r   )text_featuresr   r   r   H   r   )group)r   get_data_parallel_world_sizeget_data_parallel_rankget_data_parallel_groupr
   catr   nn
all_gatherrangedist)r   r   
local_lossgather_with_graddata_parallel_world_sizedata_parallel_rankdata_parallel_groupall_image_featuresall_text_featuresgathered_image_featuresgathered_text_featuresr   )r   r   r   gather_features    s"   r&   c                       s0   e Zd ZdZ			d fdd	Zdd Z  ZS )ClipLossuE  
    A custom loss module for CLIP (Contrastive Language–Image Pretraining) training.

    This module is specifically designed for calculating the loss in CLIP model training,
    supporting features like local loss calculation, gradient gathering, and label caching
    for efficiency in a distributed training setup.

    Parameters:
    local_loss (bool, optional): If True, calculates loss locally on each data parallel process.
                                 Defaults to False.
    gather_with_grad (bool, optional): If True, gathers gradients during loss calculation.
                                       Currently not functional in the latest PyTorch version.
                                       Defaults to False.
    cache_labels (bool, optional): If True, caches labels for reuse in subsequent iterations,
                                   improving performance. Defaults to False.

    Attributes:
    world_size (int): The size of the data parallel group (number of processes).
    rank (int): The rank of the current process within the data parallel group.

    Methods:
    forward(output_tensor): Computes the loss given the model's output tensor. This involves
                            gathering features across processes, computing logits, and
                            calculating the final cross-entropy loss.
    Fc                    s@   t    || _|| _|| _d| _i | _t | _	t
 | _dS )Initr   N)super__init__r   r   cache_labelsprev_num_logitslabelsr   r   
world_sizer   rank)selfr   r   r+   	__class__r   r   r*   r   s   

zClipLoss.__init__c                 C   s(  |\}}}|j }| jdkr5t||| j| j\}}| jr*|| |j }|| |j }	n|| |j }|j}	n|| |j }|| |j }	|jd }
| j|
ksR|| jvrvt	j
|
|t	jd}| jdkrj| jrj||
| j  }| jru|| j|< |
| _n| j| }t||t|	| d }t|g}|d|ifS )zForward for loss   r   )devicedtype   loss)r4   r.   r&   r   r   Tshaper,   r-   r
   arangelongr/   r+   Fcross_entropyr   )r0   output_tensorr   r   logit_scaler4   r"   r#   logits_per_imagelogits_per_text
num_logitsr-   
total_lossreduced_lossr   r   r   forward   s4   





zClipLoss.forward)FFF)__name__
__module____qualname____doc__r*   rE   __classcell__r   r   r1   r   r'   W   s    r'   )FF)r
   torch.distributed.nntorch.nnr   r   r   r   r<   2nemo.collections.nlp.modules.common.megatron.utilsr   megatron.corer   HAVE_MEGATRON_COREImportErrorModuleNotFoundErrorr&   Moduler'   r   r   r   r   <module>   s    
7