o
    }oi7                     @   sp   d dl Z d dlm  mZ d dl mZ d dlmZ d dlmZmZm	Z	 d dl
mZmZmZ G dd deeZdS )    N)nn)NeuralModule)
Exportabler   	typecheck)
LabelsType
NeuralTypeSpectrogramTypec                       s|   e Zd ZddgZ					ddededed	ed
ededededef fddZedd Z	edd Z
e dd Z  ZS )RandomProjectionVectorQuantizerl2cosineFT   feat_incode_dimnum_classes	num_booksdist_fn
time_aheadfreezesqueeze_singlecombine_time_stepsc
                    s   t    || jvrtd| d| j || _|| _|| _|| _|| _|| _	|| _
|	| _tj| j|	 | j| j dd| _tjj| jj t| j| j| j }
tjjj|
ddd tj|
dd	}
t|
| _|rq|   d
S d
S )a  Vector quantization using random projection proposed in BEST-RQ paper:
        'Self-Supervised Learning with Random-Projection Quantizer for Speech Recognition'

         Args:
            feat_in: input feature dimension
            code_dim: dimension of the codebook features
            num_classes: number of classes
            num_books: number of codebooks
            dist_fn: distance function to use, one of "l2" or "cosine"
            time_ahead: if Ture, the input is of shape (B, T, D), otherwise (B, D, T)
            freeze: whether to freeze the projection matrix
            squeeze_single: if True, squeeze codebook dimension if num_books is 1
        Unknown distance function , must be one of F)biasr   r   )meanstddimN)super__init__DIST_FN_LIST
ValueErrorr   r   r   r   r   r   r   r   r   Linearprojtorchinitxavier_normal_weightrandndoublenormal_F	normalize	Parameter	codebooksr   )selfr   r   r   r   r   r   r   r   r   r.   	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/modules/ssl_modules/quantizers.pyr      s(   

 z(RandomProjectionVectorQuantizer.__init__c                 C   s&   | j rdtdt iS dtdt iS )z*Returns definitions of module input ports.input_signalBTDr6   r8   r7   )r   r   r   r/   r2   r2   r3   input_typesN   s   z+RandomProjectionVectorQuantizer.input_typesc                 C   s   | j r%| jdkr| jrtdt tdt dS tdt tdt dS | jdkr:| jr:tdt tdt dS tdt tdt dS )	z+Returns definitions of module output ports.r   r5   )r6   r7   )xqxid)r6   r7   r8   H)r6   r7   r>   r9   )r6   r8   r7   r>   )r   r   r   r   r   r   r:   r2   r2   r3   output_typesU   s   







z,RandomProjectionVectorQuantizer.output_typesc           	      C   s  | j s	|dd}| \}}}| jdkr&| ||| j d}|| j }| |}tj|	||| j
| jdd}| jdkrPtd|| j}|jddd }n.| jdkrr|d| jdddd }|jd	djdd}ntd
| j d| j || jtj| j
|jddd  }|dd 	d|}t|	d| j	d| j	||| j| j
}| j s|dd}| j
dkr| jr|d}|d}||fS )z
        Args:
            input_signal: input features of shape (B, T, D) or (B, D, T)
        Returns:
            xq: quantized features of shape (B, T, D, N) or (B, D, T, N)
            xid: quantized tokens of shape (B, T, N)
        r      r   r   r   zbtdh,dch->btdcr
   r   r   r   )device)r   	transposesizer   
contiguousreshaper#   r+   r,   viewr   r   r   r$   einsumr.   max	unsqueezenormargminr!   r    r   arangerB   	embeddingr   squeeze)	r/   r4   r6   r7   _xr=   xid2r<   r2   r2   r3   forwardl   s4   	




$(

z'RandomProjectionVectorQuantizer.forward)r   FTFr   )__name__
__module____qualname__r    intstrboolr   propertyr;   r?   r   rS   __classcell__r2   r2   r0   r3   r	      s@    	
3

r	   )r$   torch.nn.functionalr   
functionalr+   	nemo.corer   nemo.core.classesr   r   nemo.core.neural_typesr   r   r   r	   r2   r2   r2   r3   <module>   s   