"""
Contrastive Language-Audio Pretraining Model from LAION
--------------------------------------------------------
Paper: https://arxiv.org/abs/2211.06687
Authors (equal contributions): Ke Chen, Yusong Wu, Tianyu Zhang, Yuchen Hui
Support: LAION
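
Example
-------
A minimal usage sketch (the audio path below is a placeholder):

    import laion_clap

    model = laion_clap.CLAP_Module(enable_fusion=False)
    model.load_ckpt()  # download and load the default pretrained checkpoint
    audio_embed = model.get_audio_embedding_from_filelist(['/path/to/audio.wav'])
    text_embed = model.get_text_embedding(['a dog barking', 'rain falling on a roof'])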
    N)create_model   )get_audio_features)int16_to_float32float32_to_int16)RobertaTokenizer)load_state_dictc                       sP   e Zd Zdd fddZdd	 ZdddZdddZdddZdddZ  Z	S )CLAP_ModuleFN
HTSAT-tinyrobertareturnc           	         s   t t|   |du rtj rdnd}d}|r'd}t||||||d\}}nt|||||d\}}|| _|| _|| _	t
d| _dS )	a  Initialize CLAP Model

        Parameters
        ----------
        enable_fusion: bool
            if True, the fusion CLAP model is created; otherwise the non-fusion CLAP model (default: False)
        device: str
            if None, the device (GPU or CPU) is detected automatically
        amodel: str
            audio encoder architecture (default: HTSAT-tiny)
        tmodel: str
            text encoder architecture (default: roberta)
        """
        super(CLAP_Module, self).__init__()
        if device is None:
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        precision = 'fp32'

        if enable_fusion:
            fusion_type = 'aff_2d'
            model, model_cfg = create_model(amodel, tmodel, precision=precision, device=device,
                                            enable_fusion=enable_fusion, fusion_type=fusion_type)
        else:
            model, model_cfg = create_model(amodel, tmodel, precision=precision, device=device,
                                            enable_fusion=enable_fusion)
        self.enable_fusion = enable_fusion
        self.model = model
        self.model_cfg = model_cfg
        self.tokenize = RobertaTokenizer.from_pretrained('roberta-base')

    def tokenizer(self, text):
        # Tokenize a (batch of) text with the RoBERTa tokenizer, padded and
        # truncated to the 77-token context length used by the text encoder.
        result = self.tokenize(
            text,
            padding='max_length',
            truncation=True,
            max_length=77,
            return_tensors='pt',
        )
        return result

    def load_ckpt(self, ckpt=None, model_id=-1, verbose=True):
        """Load the pretrained checkpoint of CLAP model

        Parameters
        ----------
        ckpt: str
            if ckpt is specified, the model will load this ckpt; otherwise the model will download the default ckpt from HuggingFace.
            For the fusion model, it will download the 630k+audioset fusion ckpt (id=3); for the non-fusion model, the 630k+audioset ckpt (id=1).
        model_id:
            if model_id is specified, you can download our best ckpt, as:
                id = 0 --> 630k non-fusion ckpt
                id = 1 --> 630k+audioset non-fusion ckpt
                id = 2 --> 630k fusion ckpt
                id = 3 --> 630k+audioset fusion ckpt
            Note that if your model is specified as a non-fusion model but you download a fusion ckpt, you will face an error.
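
        Examples
        --------
        Typical call patterns (the local checkpoint path below is a placeholder):

            model.load_ckpt()                              # default: fusion -> id=3, non-fusion -> id=1
            model.load_ckpt(model_id=0)                    # 630k non-fusion ckpt
            model.load_ckpt(ckpt='/path/to/630k-best.pt')  # load a user-specified ckpt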
        """
        download_link = 'https://huggingface.co/lukewys/laion_clap/resolve/main/'
        download_names = [
            '630k-best.pt',
            '630k-audioset-best.pt',
            '630k-fusion-best.pt',
            '630k-audioset-fusion-best.pt'
        ]
        if ckpt is not None:
            print(f'Load the specified checkpoint {ckpt} from users.')
        else:
            print('Load our best checkpoint in the paper.')
            if model_id == -1:
                # pick the 630k+audioset ckpt matching the fusion setting
                model_id = 3 if self.enable_fusion else 1
            package_dir = os.path.dirname(os.path.realpath(__file__))
            weight_file_name = download_names[model_id]
            ckpt = os.path.join(package_dir, weight_file_name)
            if os.path.exists(ckpt):
                print('The checkpoint is already downloaded')
            else:
                print('Downloading laion_clap weight files...')
                ckpt = wget.download(download_link + weight_file_name, os.path.dirname(ckpt))
                print('Download completed!')
        print('Load Checkpoint...')
        ckpt = load_state_dict(ckpt, skip_params=True)
        self.model.load_state_dict(ckpt)
        if verbose:
            param_names = [n for n, p in self.model.named_parameters()]
            for n in param_names:
                print(n, '\t', 'Loaded' if n in ckpt else 'Unloaded')

    def get_audio_embedding_from_filelist(self, x, use_tensor=False):
        """get audio embeddings from the audio file list

        Parameters
        ----------
        x: List[str] (N,):
            an audio file list to extract features; audio files can have different lengths (as we have the feature fusion mechanism)
        use_tensor: boolean:
            if True, it will return the torch tensor, preserving the gradient (default: False).
        Returns
        ----------
        audio_embed : numpy.ndarray | torch.Tensor (N,D):
            audio embeddings extracted from the audio files
        """
        self.model.eval()
        audio_input = []
        for f in x:
            # load the waveform of shape (T,), resampled to 48 kHz
            audio_waveform, _ = librosa.load(f, sr=48000)
            # quantize to 16-bit and back to reproduce training conditions
            audio_waveform = int16_to_float32(float32_to_int16(audio_waveform))
            audio_waveform = torch.from_numpy(audio_waveform).float()
            temp_dict = {}
            temp_dict = get_audio_features(
                temp_dict, audio_waveform, 480000,
                data_truncating='fusion' if self.enable_fusion else 'rand_trunc',
                data_filling='repeatpad',
                audio_cfg=self.model_cfg['audio_cfg'],
                require_grad=audio_waveform.requires_grad
            )
            audio_input.append(temp_dict)
        audio_embed = self.model.get_audio_embedding(audio_input)
        if not use_tensor:
            audio_embed = audio_embed.detach().cpu().numpy()
        return audio_embed

    def get_audio_embedding_from_data(self, x, use_tensor=False):
        """get audio embeddings from the audio data

        Parameters
        ----------
        x: np.ndarray | torch.Tensor (N,T):
            audio data, must be mono audio tracks.
        use_tensor: boolean:
            if True, x should be a tensor input and the output will be a tensor, preserving the gradient (default: False).
            Note that if use_tensor is set to True, the audio waveform will not be quantized (otherwise the gradient would not be preserved).
        Returns
        ----------
        audio_embed: numpy.ndarray | torch.Tensor (N,D):
            audio embeddings extracted from the audio data
        """
        self.model.eval()
        audio_input = []
        for audio_waveform in x:
            if not use_tensor:
                # quantize the waveform; skipped for tensor input to keep the gradient
                audio_waveform = int16_to_float32(float32_to_int16(audio_waveform))
                audio_waveform = torch.from_numpy(audio_waveform).float()
            temp_dict = {}
            temp_dict = get_audio_features(
                temp_dict, audio_waveform, 480000,
                data_truncating='fusion' if self.enable_fusion else 'rand_trunc',
                data_filling='repeatpad',
                audio_cfg=self.model_cfg['audio_cfg'],
                require_grad=audio_waveform.requires_grad
            )
            audio_input.append(temp_dict)
        audio_embed = self.model.get_audio_embedding(audio_input)
        if not use_tensor:
            audio_embed = audio_embed.detach().cpu().numpy()
        return audio_embed

    def get_text_embedding(self, x, tokenizer=None, use_tensor=False):
        """get text embeddings from texts

        Parameters
        ----------
        x: List[str] (N,):
            text list
        tokenizer: func:
            the tokenizer function; if not provided (None), the default RoBERTa tokenizer will be used.
        use_tensor: boolean:
            if True, the output will be a tensor, preserving the gradient (default: False).
        Returns
        ----------
        text_embed : numpy.ndarray | torch.Tensor (N,D):
            text embeddings extracted from texts
        """
        self.model.eval()
        if tokenizer is not None:
            text_input = tokenizer(x)
        else:
            text_input = self.tokenizer(x)
        text_embed = self.model.get_text_embedding(text_input)
        if not use_tensor:
            text_embed = text_embed.detach().cpu().numpy()
        return text_embed
%$r	   )__doc__r9   r7   r   rW   clap_moduler   training.datar   r   r   transformersr   r@   clap_module.factoryr   nnModuler	   r"   r"   r"   r#   <module>   s    