o
    }oih4                     @   sx   d dl mZ d dlZd dlm  mZ d dlmZ d dlmZm	Z	 d dl
mZmZmZmZmZ dgZG dd deZdS )    )ceilN)nn)Loss	typecheck)AcousticEncodedRepresentationLengthsTypeLossType
NeuralTypeSpectrogramTypeContrastiveLossc                +       s   e Zd Zedd Zedd Zedd Z					
						
	
	
							
	d4dedededededede	de	de
dededed ed!ed"e	d#e	d$e	d%e	d&ed'ed(e	f* fd)d*Zd+d, Ze d5d.d/Zd0d1 Zd2d3 Z  ZS )6r   c                 C   s6   t dt t dt t dt t tdt dddS )z(Input types definitions for Contrastive.)BDT)r   r   r   r   T)optional)spectrograms
spec_masksdecoder_outputsdecoder_lengths)r	   r
   r   tupler   self r   f/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/losses/ssl_losses/contrastive.pyinput_types   s
   


zContrastiveLoss.input_typesc                 C   s   dt t diS )z]Output types definitions for Contrastive.
        loss:
            NeuralType(None)
        loss)elements_type)r	   r   r   r   r   r   output_types&   s   zContrastiveLoss.output_typesc                 C   s   dS )NFr   r   r   r   r   needs_labels.   s   zContrastiveLoss.needs_labels      d   F@  皙?sumT         ?;?皙?      0@in_dimproj_dimcombine_time_stepsnum_negativesquantized_targetscodebook_sizeprob_ppl_weight
logit_tempreducesample_from_same_utterance_onlysample_from_non_maskedsample_from_codebook
group_loss
num_groupsquantizer_temp_startquantizer_temp_minquantizer_temp_decaymask_threshold	store_ids
reduce_ids
multiplierc              	      s   t    |||f}|| _|| _|| _| jr)d|| ||||ddd}t|| _|| _|| _|	| _	|| _
|
| _|| _|| _|| _|| _|| _|| _|| _| js[t|| || _dS dS )aQ  
        Loss function representing the contrastive task of identifying the true latent speech representation of
        the masked spectrogram steps from a set of sampled distractors.

        Args:
            in_dim: Number of spectrogram channels.
            proj_dim: Number of channels in the model outputs.
            combine_time_steps: How many time steps should be combined into a single representation.
            num_negatives: Number of sampled negatives for each target.
            quantized_targets: Bool that determines if the targets should be quantized.
            codebook_size: Number of vectors in the codebook per group.
            prob_ppl_weight: Float multiplier on the perplexity loss for target quantization.
            logit_temp: Float temperature for normalizing logits.
            reduce: String representing the type of reduction used for cross entropy.
            sample_from_same_utterance_only: Bool that determines if negatives should be sampled only from same utterance.
            sample_from_non_masked: Bool that determines if negatives should be sampled from non-masked steps of the spectrogram.
            sample_from_codebook: Bool that determines if negatives should be sampled from entire codebook.
            group_loss: Bool that determines if loss should be computed separately for each group in the quantizer codebook.
            num_groups: Number of groups in the quantizer codebook.
            quantizer_temp_start: Starting temperature in quantizer.
            quantizer_temp_min: Minimum temperature in quantizer.
            quantizer_temp_decay: Decay rate of quantizer temperature per global step.
            mask_threshold: Float threshold for determining if a time step of the spectrogram is masked based on percent of masked channels.
            store_ids: Bool that determines if the quantizer ids will be stored to be potentially used by other losses.
            reduce_ids: Bool that determines if we convert any sequence of consecutive equivalent ids to a single occurence of that id.
            multiplier: Float multipler on final loss
        zJnemo.collections.asr.parts.submodules.ssl_quantizers.GumbelVectorQuantizerT)_target_dimvq_dimnum_varsgroupstempcombine_groups
time_firstN)super__init__r-   r,   r/   r   from_config_dict	quantizerr0   r1   r+   r2   r3   r4   r5   r:   r=   r;   r<   r   Lineartarget_proj)r   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   quantizer_tempquantizer_cfg	__class__r   r   rG   2   s>   
4

zContrastiveLoss.__init__c                 C   sh   |j d }ttj||f|jd| j}||d }||| jf|j dd   }|dd}||fS )Nr   devicer   )shapetorchmultinomialonesrQ   r,   view	transpose)r   ynumhighneg_idxsnegsr   r   r   sample_negatives   s   
z ContrastiveLoss.sample_negativesNc                 C   s4  | dd}| dd}tt|jd |jd  |jd  |jd  }|dkr<t|ddd|f}t|ddd|f}||jd |jd d}||jd |jd d}| jr| jr| j	|dd\}}}	| _
| jrtj| j
dd\}
}||jdddd 8 }t| j
}|d|| j
}|jdd	d d }|dd| | _
|| _nd | _n| 	|\}}}	n| |}| jr.|jd }|d| jk}|| }|| }||d|jd }||d|jd }| dd}| dd}| jr| | dd|d\}}
n| ||d\}}
|d|jd }|d|jd }|| jd|jd }n|d| jk}|| }|| }| jr| j	j}| j	j|| j	jd}| dd}| d!d|jd dd}||jd d|jd }|d|jd | }|d|jd | }n7| j"r| j	"| j|d}n'| jr| ||jd |jd  d|d\}}
n| ||d\}}
| #|||}|j$|dtj%d
}| dd}tj&||| j'd}|( }| j)dkr| jr| j)| | }||7 }t*|tj+st+dgj,|j-d}|jd }|| j.| 9 }|S )NrR   r   r   T)
return_ids)return_inverse)r?   keepdimsr?   )dtype)	reductionrP   )/rX   intr   rS   Fpadreshaper-   r;   rI   
target_idsr<   rT   unique_consecutivemin
zeros_likescatter_maxnarrowtarget_lengthsrK   r2   meanr:   r3   r^   sizer,   r5   rB   varsrA   	unsqueezeexpandr4   _calculate_similarity	new_zeroslongcross_entropyr1   numelr/   
isinstanceTensortorQ   r=   )r   r   r   r   r   targetsmasksdiffprob_ppl_losscur_codebook_temp_indicesreduced_idsreduced_lensbsout_masked_onlytargets_masked_only	negativesr6   similarity_scoressimilarity_targetsr   sample_size
batch_sizer   r   r   forward   s   0





zContrastiveLoss.forwardc                 C   s   ||k d}|d}tj||gdd}tj| d|jd dd| dd|}|| j	 }|
 rBtd|dd  |< |S )NrR   r   rc   z-infr   )allru   rT   catcosine_similarityfloatrv   rS   type_asr0   any)r   logitsr   r   
neg_is_posr   r   r   rw     s   
$
z%ContrastiveLoss._calculate_similarityc                 C   s   | j r| j| d S d S N)r-   rI   set_num_updates)r   num_updatesr   r   r   r   .  s   zContrastiveLoss.set_num_updates)r   r   r    Fr!   r"   r"   r#   TFFFr$   r$   r%   r&   r'   TFr(   r   )__name__
__module____qualname__propertyr   r   r   rf   boolr   strrG   r^   r   r   rw   r   __classcell__r   r   rN   r   r      s    
	

	
V 	)mathr   rT   torch.nn.functionalr   
functionalrg   	nemo.corer   r   nemo.core.neural_typesr   r   r   r	   r
   __all__r   r   r   r   r   <module>   s   