o
    @Ti                  
   @   s   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 ddl
mZ e Zejejejejejejejejejd	ZG dd dejZdS )	    N)Wav2Vec2Model)ConvLayerBlock   )
get_logger)	wav2vec2_basewav2vec2_largewav2vec2_large_lv60khubert_basehubert_largehubert_xlarge
wavlm_basewavlm_base_pluswavlm_largec                       s   e Zd ZddededB def fddZed	efd
dZed	ee	eef  fddZ
ded	efddZe 			ddejdejdB dedB ded	eej f
ddZ  ZS )SSLFeatureExtractorr   N>  
model_nameoutput_layersample_ratec              	      s   t    |dur|nd| _|tvr td| dtt  t| }| | _| j	  |j
d | _|j| _|| jkrYtd| d| j d| d	 tjj|| jd
| _dS d| _dS )z
        Args:
            model_name: Name of the SSL model to use
            output_layer: Which layer's features to extract (None for last layer), 1-based indexing
            sample_rate: Sample rate of input audio
        NzUnknown model: z. Available models: encoder_embed_dimzResampling from z to z required by .)	orig_freqnew_freq)super__init__r   MODEL_REGISTRY
ValueErrorlistkeys	get_modelmodeleval_paramsfeature_dimr   ssl_sample_rateloggerdebug
torchaudio
transformsResample	resampler)selfr   r   r   bundle	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/linacodec/module/ssl_extractor.pyr      s   



 
zSSLFeatureExtractor.__init__returnc                 C   s    d}| j D ]\}}||9 }q|S )z5Get the hop size of the model's convolutional layers.   )conv_config)r+   hop_size_strider/   r/   r0   r4   5   s   
zSSLFeatureExtractor.hop_sizec                 C   s*   g }| j jjD ]}||j|jf q|S )z?Get the configuration of the convolutional layers in the model.)r    feature_extractorconv_layersappendkernel_sizer6   )r+   r8   layerr/   r/   r0   r3   =   s   zSSLFeatureExtractor.conv_configdesired_output_lengthc                 C   s,   |}t | jD ]\}}|d | | }q|S )zMCalculate the minimum input length required to produce a given output length.r2   )reversedr3   )r+   r<   lengthr:   r6   r/   r/   r0   get_minimum_input_lengthF   s   z,SSLFeatureExtractor.get_minimum_input_lengthFwaveformlengths
num_layersreturn_lengthsc                 C   sV   |  dkr|d}| jdur| |}| jj|||p| jd\}}|r)||fS |S )aS  
        Args:
            waveform: (batch_size, num_samples)
            lengths: Optional tensor of sequence lengths for each batch item (used for attention masking)

        Returns:
            features: List of feature tensors for each layer (batch_size, frame, dim)
            lengths: Sequence lengths for each batch item
        r2   r   N)rB   )dim	unsqueezer*   r    extract_featuresr   )r+   r@   rA   rB   rC   featuresfeature_lengthsr/   r/   r0   forwardM   s   



zSSLFeatureExtractor.forward)r   Nr   )NNF)__name__
__module____qualname__strintr   propertyr4   r   tupler3   r?   torchno_gradTensorboolrI   __classcell__r/   r/   r-   r0   r      s,     r   )rQ   torch.nnnnr'   torchaudio.pipelines	pipelinestorchaudio.models.wav2vec2r   %torchaudio.models.wav2vec2.componentsr   utilr   r%   WAV2VEC2_BASEWAV2VEC2_LARGEWAV2VEC2_LARGE_LV60KHUBERT_BASEHUBERT_LARGEHUBERT_XLARGE
WAVLM_BASEWAVLM_BASE_PLUSWAVLM_LARGEr   Moduler   r/   r/   r/   r0   <module>   s&    