o
    ϯi                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlZddlZddlm  mZ ddlmZ dd	lmZ ddlZdd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! G dd dej"Z#G dd dej"Z$G dd dej"Z%G dd dej"Z&G dd dej'Z'G dd dej"Z(G dd dej"Z)G dd dej"Z*G dd  d ej"Z+eG d!d" d"Z,eG d#d$ d$Z-eG d%d& d&Z.G d'd( d(ej"Z/d)ej"fd*d+Z0d7d.e1d/e2d0e3fd1d2Z4d3e5d4fd5d6Z6dS )8z CLAP Model

Adapted from CLIP: https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
Adapted to the Audio Task.
    )OrderedDict)	dataclass)audio)TupleUnionCallableOptionalN)nn   )	TimmModel)freeze_batch_norm_2d)create_pann_model)create_htsat_model)	BertModelRobertaModel	BartModel)BatchEncodingc                       s4   e Zd Zg de df fdd	Zdd Z  ZS )	MLPLayers)   r   r   皙?c                    s   t t|   || _|| _g }t|d d |dd  D ]\}}|t|| || j |t	| j q|d d }tj
| | _d S )Nr
   )superr   __init__nonlindropoutzipappendr	   LinearDropout
Sequential
sequential)selfunitsr   r   sequenceu0u1	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/laion_clap/clap_module/model.pyr      s   "zMLPLayers.__init__c                 C   s   |  |}|S N)r!   )r"   Xr)   r)   r*   forward*   s   
zMLPLayers.forward)__name__
__module____qualname__r	   ReLUr   r-   __classcell__r)   r)   r'   r*   r      s    r   c                       s2   e Zd ZdZd fdd	ZdejfddZ  ZS )	
Bottleneck   r
   c                    s  t    tj||ddd| _t|| _tj||dddd| _t|| _|dkr/t	|nt
 | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksb||tj krttdt	|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr
   F)bias   )paddingr5   Tinplacez-10)strider5   1)r   r   r	   Conv2dconv1BatchNorm2dbn1conv2bn2	AvgPool2dIdentityavgpool	expansionconv3bn3r1   relu
downsampler;   r3   r    r   )r"   inplanesplanesr;   r'   r)   r*   r   2   s:   


zBottleneck.__init__xc                 C   st   |}|  | | |}|  | | |}| |}| | |}| jd ur/| |}||7 }|  |}|S r+   )	rI   r@   r>   rB   rA   rE   rH   rG   rJ   )r"   rM   identityoutr)   r)   r*   r-   Z   s   



zBottleneck.forwardr
   )	r.   r/   r0   rF   r   torchTensorr-   r2   r)   r)   r'   r*   r3   /   s    (r3   c                	       s:   e Zd Z	d
dedededef fddZdd	 Z  ZS )AttentionPool2dNspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r
   g      ?)r   r   r	   	ParameterrQ   randnpositional_embeddingr   k_projq_projv_projc_projrV   )r"   rT   rU   rV   rW   r'   r)   r*   r   k   s   

zAttentionPool2d.__init__c              	   C   s4  | |jd |jd |jd |jd  ddd}tj|jddd|gdd}|| jd d d d d f |j }t	j
di d|d	|d
|d|jd d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddddd| jjd| jjddd| jdd\}}|d S )Nr   r
   rX   r6   T)dimkeepdimr`   querykeyvalueembed_dim_to_checkr   rV   q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weighttrainingneed_weightsr)   )reshapeshapepermuterQ   catmeanr[   todtypeFmulti_head_attention_forwardrV   r]   weightr\   r^   r5   r_   rs   )r"   rM   _r)   r)   r*   r-   x   s^   *$

	
zAttentionPool2d.forwardr+   )r.   r/   r0   intr   r-   r2   r)   r)   r'   r*   rS   j   s    rS   c                       sN   e Zd ZdZd fdd	ZdddZd	d
 ZdddZdd Zdd Z	  Z
S )ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       @   c                    s>  t    || _|| _tjd|d ddddd| _t|d | _tj|d |d dddd| _	t|d | _
tj|d |dddd| _t|| _td| _tjdd| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _|   d S )Nr6   rX   r
   F)kernel_sizer;   r7   r5   )r   r7   r5   Tr8   r   )r;   r4          )r   r   rW   
image_sizer	   r=   r>   r?   r@   rA   rB   rG   rH   rC   rE   r1   rI   	_inplanes_make_layerlayer1layer2layer3layer4rS   attnpoolinit_parameters)r"   layersrW   headsr   widthrU   r'   r)   r*   r      s.   
zModifiedResNet.__init__r
   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr
   )r3   r   rF   ranger   r	   r    )r"   rL   blocksr;   r   r   r)   r)   r*   r      s
   
zModifiedResNet._make_layerc                 C   s   | j d ur8| j jjd }tjj| j jj|d tjj| j jj|d tjj| j j	j|d tjj| j jj|d | j
| j| j| jfD ]}| D ]\}}|drWtj| qHqBd S )N      ࿩stdz
bn3.weight)r   r_   in_featuresr	   initnormal_r]   r~   r\   r^   r   r   r   r   named_parametersendswithzeros_)r"   r   resnet_blocknameparamr)   r)   r*   r      s   

zModifiedResNet.init_parametersr   Fc                 C   s8   |dksJ d|   D ]}d|_q|rt|  d S d S Nr   z6partial locking not currently supported for this modelF)
parametersrequires_gradr   r"   unlocked_groupsfreeze_bn_statsr   r)   r)   r*   lock   s   
zModifiedResNet.lockc                 C   sL   | j | jf| j| jf| j| jffD ]\}}| |||}q| |}|S r+   )r>   r@   rA   rB   rG   rH   rI   rE   )r"   rM   convbnr)   r)   r*   stem   s   



zModifiedResNet.stemc                 C   s@   |  |}| |}| |}| |}| |}| |}|S r+   )r   r   r   r   r   r   r"   rM   r)   r)   r*   r-      s   





zModifiedResNet.forward)r   r   rP   r   F)r.   r/   r0   __doc__r   r   r   r   r   r-   r2   r)   r)   r'   r*   r      s    
	
	
r   c                   @   s    e Zd ZdZdejfddZdS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.rM   c                 C   s*   |j }t|| j| j| j| j}||S r+   )r{   r|   
layer_normnormalized_shaper~   r5   epsrz   )r"   rM   	orig_typer)   r)   r*   r-      s   
zLayerNorm.forwardN)r.   r/   r0   r   rQ   rR   r-   r)   r)   r)   r*   r      s    r   c                   @   s   e Zd ZdejfddZdS )	QuickGELUrM   c                 C   s   |t d|  S )NgZd;?)rQ   sigmoidr   r)   r)   r*   r-      s   zQuickGELU.forwardN)r.   r/   r0   rQ   rR   r-   r)   r)   r)   r*   r      s    r   c                       sh   e Zd Zejfdededef fddZddej	de
ej	 fd	d
Zddej	de
ej	 fddZ  ZS )ResidualAttentionBlockd_modeln_head	act_layerc              
      sl   t    t||| _t|| _ttdt	||d fd| fdt	|d |fg| _
t|| _d S )Nc_fcr4   gelur_   )r   r   r	   MultiheadAttentionattnr   ln_1r    r   r   mlpln_2)r"   r   r   r   r'   r)   r*   r     s   

	zResidualAttentionBlock.__init__NrM   	attn_maskc                 C   s   | j |||d|dd S )NF)rt   r   r   )r   r"   rM   r   r)   r)   r*   	attention  s   z ResidualAttentionBlock.attentionc                 C   s0   || j | ||d }|| | | }|S Nr   )r   r   r   r   r   r)   r)   r*   r-     s   zResidualAttentionBlock.forwardr+   )r.   r/   r0   r	   GELUr   r   r   rQ   rR   r   r   r-   r2   r)   r)   r'   r*   r     s     $r   c                	       sP   e Zd Zejfdedededef fddZddej	d	e
ej	 fd
dZ  ZS )Transformerr   r   r   r   c                    s<   t    | _|| _t fddt|D | _d S )Nc                    s   g | ]	}t  d qS )r   )r   ).0r   r   r   r   r)   r*   
<listcomp>%  s    z(Transformer.__init__.<locals>.<listcomp>)r   r   r   r   r	   
ModuleListr   	resblocks)r"   r   r   r   r   r'   r   r*   r     s   

zTransformer.__init__NrM   r   c                 C   s   | j D ]}|||d}q|S r   )r   )r"   rM   r   rr)   r)   r*   r-   +  s   
zTransformer.forwardr+   )r.   r/   r0   r	   r   r   r   r   rQ   rR   r   r-   r2   r)   r)   r'   r*   r     s    $r   c                       sZ   e Zd Zejfdededededededef fdd	ZdddZde	j
fddZ  ZS )VisualTransformerr   
patch_sizer   r   r   rW   r   c           	         s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t||||d| _t|| _t|t	|| | _d S )Nr6   F)in_channelsout_channelsr   r;   r5   r   rX   r
   r   )r   r   r   rW   r	   r=   r>   rY   rQ   rZ   class_embeddingr[   r   ln_prer   text_branchln_postproj)	r"   r   r   r   r   r   rW   r   scaler'   r)   r*   r   2  s&   



zVisualTransformer.__init__r   Fc                 C   s(   |dksJ d|   D ]}d|_qd S r   )r   r   r   r)   r)   r*   r   S  s   
zVisualTransformer.lockrM   c              	   C   s   |  |}||jd |jd d}|ddd}tj| j|jtj	|jd d|jd |j|j
d |gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urr|| j }|S )Nr   r
   r   rX   r{   devicerb   )r>   ru   rv   rw   rQ   rx   r   rz   r{   zerosr   r[   r   r   r   r   r   r)   r)   r*   r-   Z  s,   





zVisualTransformer.forwardr   )r.   r/   r0   r	   r   r   r   r   r   rQ   rR   r-   r2   r)   r)   r'   r*   r   1  s&    	
!r   c                   @   s   e Zd ZU dZeeeeeef ef ed< dZeed< dZ	eed< dZ
eeeef ef ed< d	Zeed
< dZeed< dZeed< dZeed< d	S )CLAPVisionCfg   r      r      r   r   r   Ntimm_model_nameFtimm_model_pretrainedavg	timm_poollinear	timm_proj)r.   r/   r0   r   r   r   r   __annotations__r   r   r   r   strr   boolr   r   r)   r)   r)   r*   r   w  s   
  


r   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dS )CLAPAudioCfpPANN
model_typeCnn14
model_namei  sample_ratei   audio_lengthwindow_sizehop_size2   fmini6  fmaxi  	class_numr   mel_binsi S clip_samplesN)r.   r/   r0   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r)   r)   r)   r*   r     s   
 r   c                   @   s>   e Zd ZU eed< eed< eed< eed< eed< eed< dS )CLAPTextCfgcontext_length
vocab_sizer   r   r   r   N)r.   r/   r0   r   r   r   r)   r)   r)   r*   r     s   
 r   c                       s   e Zd Z					d"dedededed	ed
ededef fddZdd Z	dd Z
dd Zdd Zd#ddZdd Zdd Zdd Zd$d d!Z  ZS )%CLAPFNoner   rI   rU   	audio_cfgtext_cfg
quick_geluenable_fusionfusion_typejoint_embed_shapemlp_actc	                    sl  t    t|trtdi |}t|trtdi |}|| _|| _|| _|| _	|| _
|| _|j| _|r7tntj}	|dkrCt }
n|dkrLt }
nt|jdkr[t|||| _n |jdkrht|||| _ntd|j d td|j d|jdkrt|j|j|j|	d	| _|j| _t|j|j| _ t!t"#| j|j| _$t%|j| _&t'| j
| j
| j
gd
d| _(t)t*|j| j
|
t*| j
| j
| _+n|jdkrt,-d| _t'| j
| j
| j
gd
d| _(t)t*d| j
|
t*| j
| j
| _+nm|jdkr&t.-d| _t'| j
| j
| j
gd
d| _(t)t*d| j
|
t*| j
| j
| _+n@|jdkrSt/-d| _t'| j
| j
| j
gd
d| _(t)t*d| j
|
t*| j
| j
| _+ntd|j d td|j d|j| _0t'| j
| j
| j
gd
d| _1t)t*|| j
|
t*| j
| j
| _2t!t"3g t45d | _6t!t"3g t45d | _7| j8d| 9 dd | :  d S )NrI   r   r   HTSATzModel config for 
 not found not found.transformer)r   r   r   r   r   )r#   r   bertzbert-base-uncasedr   robertazroberta-basebartzfacebook/bart-base$I$I,@r   F)
persistentr)   );r   r   
isinstancedictr   r   r   r   r   r   r  r  r   r   r	   r   r1   NotImplementedErrorr   r   audio_branchr   loggingerrorRuntimeErrorr   r   r   r   r   r   	Embeddingtoken_embeddingrY   rQ   emptyr[   r   ln_finalr   text_transformr    r   text_projectionr   from_pretrainedr   r   text_branch_typeaudio_transformaudio_projectiononesnploglogit_scale_alogit_scale_tregister_bufferbuild_attention_maskinit_text_branch_parameters)r"   rU   r   r   r   r   r   r  r  r   mlp_act_layerr'   r)   r*   r     s   








zCLAP.__init__c                 C   sH  | j dkrftjj| jjdd tjj| jdd | jjd d| jj	 d  }| jjd }d| jj d }| jj
D ]-}tjj|jj|d tjj|jjj|d tjj|jjj|d tjj|jjj|d q8| j dksp| j dkrz| jjjjjd	 }n| j d
kr| jjjjd	 }n| jj}tj| jtd tj| jtd d S )Nr  g{Gz?r   g{Gz?r   rX   r  r  r   r	  r
  )r  r	   r   r   r  r~   r[   r   r   r   r   r   rj   out_projr   r   r_   
embeddingsword_embeddingsrv   shared	constant_r   r  r  r!  )r"   proj_stdattn_stdfc_stdblockr   r)   r)   r*   r$  '  s(   


z CLAP.init_text_branch_parametersc                 C   s,   t | j| j}|td |d |S )Nz-infr
   )rQ   r  r   fill_floattriu_)r"   maskr)   r)   r*   r#  E  s   
zCLAP.build_attention_maskc                 C   s   | j |d |dS )N)mixup_lambdar   )r  )r"   r   r   r)   r)   r*   encode_audioM  s   zCLAP.encode_audioc                 C   s  | j dkrF|j|dd}| |}|| j }|ddd}| j|| jd}|ddd}| |}| |t	
|jd |jdd	f }|S | j d
krq| j|d j|dd|d j|dd|d j|dddd }| |}|S | j dkr| j|d j|dd|d j|dddd }| |}|S | j dkrt	j| j|d j|dd|d j|dddd dd}| |}|S td| j  d td| j  d)Nr  T)r   non_blockingr
   r   rX   r   r   rb   r  	input_idsattention_masktoken_type_ids)r6  r7  r8  pooler_outputr  )r6  r7  r	  encoder_last_hidden_state)axiszModel type r  r  )r  rz   r  r[   rw   r   r   r  r  rQ   arangerv   argmaxry   r  r  r  )r"   textr   rM   r)   r)   r*   encode_textZ  sd   



&!

	



zCLAP.encode_textNc                 C   s   |du r|dur|j }n|dur|j }|du r%|du r%| j | j fS |du r0| j||dS |du r@| | j||dd S | | j||dd }tj|dd}| j||d}tj|dd}| 	|}| 
|}||||| j | j fS )a*  Forward audio and text into the CLAP

        Parameters
        ----------
        audio: torch.Tensor (batch_size, audio_length)
            the time-domain audio input / the batch of mel_spec and longer list.
        text: torch.Tensor () // need to add
            the text token input
        Nr   	embeddingr   rb   )r   r   expr!  r?  r  r4  r|   	normalizer  r  )r"   r   r>  r   audio_featurestext_featuresaudio_features_mlptext_features_mlpr)   r)   r*   r-     s4   


zCLAP.forwardc                 C   s   | j  | j fS r+   )r   rB  r!  )r"   r)   r)   r*   get_logit_scale  s   zCLAP.get_logit_scalec                 C   sJ   t |  j}|D ]}|| |||< q	| j||d}tj|dd}|S )a	  Get the text embedding from the model

        Parameters
        ----------
        data: torch.Tensor 
            a tensor of text embedding

        Returns
        ----------
        text_embed: torch.Tensor
            a tensor of text_embeds (N, D)

        r@  r   rb   )nextr   r   rz   r?  r|   rC  )r"   datar   ktext_embedsr)   r)   r*   get_text_embedding  s   zCLAP.get_text_embeddingc                    s|   t |  j}i }|d  }|D ] tj fdd|D dd|| < q| j||dd }| |}t	j
|dd}|S )a,  Get the audio embedding from the model

        Parameters
        ----------
        data: a list of dict
            the audio input dict list from 'get_audio_feature' method

        Returns
        ----------
        audio_embed: torch.Tensor
            a tensor of audio_embeds (N, D)

        r   c                    s   g | ]	}|   d qS )r   )	unsqueeze)r   drK  r)   r*   r     s    z,CLAP.get_audio_embedding.<locals>.<listcomp>rb   r@  rA  r   )rI  r   r   keysrQ   rx   rz   r4  r  r|   rC  )r"   rJ  r   
input_dictrQ  audio_embedsr)   rP  r*   get_audio_embedding  s   (
zCLAP.get_audio_embeddingc                    s6  j rJ di }jjdkr& jdd}j||dt jdd|t< |S jjdkrt }jj| }|dkrC 	| t }|du rLt
||}|jjkr fd	d
td|jj |D }| jj d   t|}j||dt |t< |S  jdd}j||dt jdd|t< |S )a	  Forward one audio and produce the audio embedding

        Parameters
        ----------
        audio:  (audio_length)
            the time-domain audio input, notice that it must be only one input
        hopsize: int
            the overlap hopsize as the sliding window

        Returns
        ----------
        output_dict: {
            key: [n, (embedding_shape)] if "HTS-AT"
            or
            key: [(embedding_shape)] if "PANN"
        }
            the list of key values of the audio branch

        z,the inference mode must be run at eval stager   r   rb   r@  r  r
   Nc                    s$   g | ]} ||j j   qS r)   )r   r   clone)r   posr   r"   r)   r*   r     s    z$CLAP.audio_infer.<locals>.<listcomp>)rs   r   r   rN  r4  rd   squeezelenr   repeatminr   r   rU  rQ   stack)r"   r   hopsizer   output_dictaudio_input	audio_lenrK  r)   rW  r*   audio_infer  s6   


zCLAP.audio_infer)FFr   r   rI   r+   )NN)r.   r/   r0   r   r   r   r   r   r   r$  r#  r4  r?  r-   rH  rM  rT  ra  r2   r)   r)   r'   r*   r     sB    	 
0-r   modelc                 C   s   dd }|  | dS )z+Convert applicable model parameters to fp16c                 S   s   t | tjtjtjfr | jj | j_| jd ur | jj | j_t | tj	rGg dd dD dddD ]}t
| |}|d urF|j |_q5dD ]}t| |r_t
| |}|d ur_|j |_qId S )Nc                 S   s   g | ]}| d qS )_proj_weightr)   )r   sr)   r)   r*   r   ,  s    zMconvert_weights_to_fp16.<locals>._convert_weights_to_fp16.<locals>.<listcomp>)inqrK  vrk   rl   rm   )r  r   )r  r	   Conv1dr=   r   r~   rJ  halfr5   r   getattrhasattr)lattrtensorr   r)   r)   r*   _convert_weights_to_fp16$  s2   



z9convert_weights_to_fp16.<locals>._convert_weights_to_fp16N)apply)rb  ro  r)   r)   r*   convert_weights_to_fp16!  s   rq  Fr   
state_dictr   r   c                 C   s  |d }|d }|d }| d j d }| d j d }| d j d }	|	d }
ttd	d
 | D }tdi |}tdi |}t|||d||d}| d | d< | d | d< t|  d d  }|D ]}|drm| 	|d  q`dD ]}| 	|d  qp|j
| dd | S )NrU   r   r   r[   r   ztoken_embedding.weightzln_final.weightr   c                 s   s(    | ]}| d r|dd V  qdS )ztransformer.resblocks.rX   N)
startswithsplit)r   rK  r)   r)   r*   	<genexpr>I  s    
z5build_model_from_openai_state_dict.<locals>.<genexpr>T)r   r   r   r   r   logit_scaler   r!  zvisual.)rw  input_resolutionr   r   F)strictr)   )rv   rY  setr   r   r   listrQ  rt  popload_state_dicteval)rr  	model_cfgr   r   rU   r   r   r   r   transformer_widthtransformer_headstransformer_layersrb  pop_keysrd   r)   r)   r*   "build_model_from_openai_state_dict?  sB   
r     cpuc                 C   sh   |    | jj}tj||f|d}tj|| jftj|d}tjj	| t
||f|f|fdd} || j_| S )Nr@  r   )r-   r?  encode_image)inputs)r~  r   r   rQ   r  r   r   r   jittrace_moduler  )rb  
batch_sizer   r   example_audioexample_textr)   r)   r*   trace_modell  s    r  )Fr   )7r   collectionsr   dataclassesr   
email.mimer   typingr   r   r   r   numpyr  rQ   torch.nn.functionalr	   
functionalr|   
timm_modelr   r  utilsr   
pann_modelr   htsatr   transformersr   r   r   $transformers.tokenization_utils_baser   Moduler   r3   rS   r   r   r   r   r   r   r   r   r   r   rq  r  r   r   r  r   r  r)   r)   r)   r*   <module>   sJ    ;/[	F	  }-