o
    ϯiR                     @   s  d dl Z de jd< d dlZd dlmZ d dlm  mZ d dlm	Z	m
Z
 d dlmZ ddlmZmZmZ ddlmZmZmZ d	d
 Zdd ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdddZdS )    Nz/tmp/NUMBA_CACHE_DIR)SpectrogramLogmelFilterBank)SpecAugmentation   )do_mixupinterpolatepad_framewise_output)iAFFAFFDAFc                 C   s<   t j| j t| dr| jdur| jjd dS dS dS )z,Initialize a Linear or Convolutional layer. biasN        )nninitxavier_uniform_weighthasattrr   datafill_)layer r   U/home/ubuntu/.local/lib/python3.10/site-packages/laion_clap/clap_module/pann_model.py
init_layer   s   

r   c                 C   s    | j jd | jjd dS )zInitialize a Batchnorm layer. r         ?N)r   r   r   r   )bnr   r   r   init_bn   s   r   c                       .   e Zd Z fddZdd Zd	ddZ  ZS )
	ConvBlockc                    sb   t t|   tj||ddddd| _tj||ddddd| _t|| _t|| _	| 
  d S )N)   r   r   r   Fin_channelsout_channelskernel_sizestridepaddingr   )superr   __init__r   Conv2dconv1conv2BatchNorm2dbn1bn2init_weightselfr"   r#   	__class__r   r   r(   !   s   zConvBlock.__init__c                 C   s,   t | j t | j t| j t| j d S N)r   r*   r+   r   r-   r.   r1   r   r   r   r/   4   s   


zConvBlock.init_weight   r7   avgc                 C   s   |}t | | |}t | | |}|dkr%t j||d}|S |dkr2t j||d}|S |dkrJt j||d}t j||d}|| }|S tdNmax)r$   r8   zavg+maxzIncorrect argument!)	Frelu_r-   r*   r.   r+   
max_pool2d
avg_pool2d	Exceptionr1   input	pool_size	pool_typexx1x2r   r   r   forward;   s   
zConvBlock.forwardr6   r8   __name__
__module____qualname__r(   r/   rG   __classcell__r   r   r2   r   r       s    r   c                       r   )
ConvBlock5x5c                    s>   t t|   tj||ddddd| _t|| _|   d S )N   rP   r    r6   Fr!   )	r'   rN   r(   r   r)   r*   r,   r-   r/   r0   r2   r   r   r(   O   s   zConvBlock5x5.__init__c                 C   s   t | j t| j d S r4   )r   r*   r   r-   r5   r   r   r   r/   \   s   
zConvBlock5x5.init_weightr6   r8   c                 C   s   |}t | | |}|dkrt j||d}|S |dkr't j||d}|S |dkr?t j||d}t j||d}|| }|S tdr9   )r;   r<   r-   r*   r=   r>   r?   r@   r   r   r   rG   a   s   
zConvBlock5x5.forwardrH   rI   r   r   r2   r   rN   N   s    rN   c                       s6   e Zd Zd fdd	Zdd Zdd Zd	d
 Z  ZS )AttBlocklinearr   c                    sb   t t|   || _|| _tj||ddddd| _tj||ddddd| _t	|| _
|   d S )Nr   r   Tr!   )r'   rQ   r(   
activationtemperaturer   Conv1dattclaBatchNorm1dbn_attinit_weights)r1   n_inn_outrS   rT   r2   r   r   r(   t   s   zAttBlock.__init__c                 C   s"   t | j t | j t| j d S r4   )r   rV   rW   r   rY   r5   r   r   r   rZ         

zAttBlock.init_weightsc                 C   sJ   t jt | |dddd}| | |}t j|| dd}|||fS )Ni
   dimr7   )torchsoftmaxclamprV   nonlinear_transformrW   sum)r1   rD   norm_attrW   r   r   r   rG      s   
zAttBlock.forwardc                 C   s&   | j dkr|S | j dkrt|S d S )NrR   sigmoid)rS   rb   rh   )r1   rD   r   r   r   re      s
   


zAttBlock.nonlinear_transform)rR   r   )rJ   rK   rL   r(   rZ   rG   re   rM   r   r   r2   r   rQ   s   s
    rQ   c                       2   e Zd Z	d
 fdd	Zdd Zddd	Z  ZS )Cnn14FNonec
                    s.  t t|   d}
d}d}d}d}d }|| _|	| _t||||
||dd| _t||||||||dd	| _t	dd	d
d	d| _
td| _| jrR| jdkrRtddd| _ntddd| _tddd| _tddd| _tddd| _tddd| _tddd| _tjdddd| _tjd|dd| _| jr| jdv rttjddddd	dtd| _| jdkrt | _n| jdkrtddd| _n| jdkrtddd| _| jr| jdv rttj ddd d!d"dtdtj!dd#| _"| jd$krt | _n| jd%krtdd&d| _n| jd'krtdd&d| _| #  d S )(NhannTreflectr   绽|=n_fft
hop_length
win_lengthwindowcenterpad_modefreeze_parameters	srrp   n_melsfminfmaxrefamintop_dbrv   @   r7      time_drop_widthtime_stripes_numfreq_drop_widthfreq_stripes_numchannel_map   r"   r#   r               i   r   daf_1daff_1diaff_1drP   r   r$   r%   r&   r   r   1D)channelstyper   daf_2daff_2diaff_2drO   )   r7   r6   )inplacer   r   2Dr   )$r'   rj   r(   enable_fusionfusion_typer   spectrogram_extractorr   logmel_extractorr   spec_augmenterr   r,   bn0r   conv_block1conv_block2conv_block3conv_block4conv_block5conv_block6Linearfc1fc_audioset
SequentialrU   rX   
mel_conv1dr   fusion_modelr   r
   r)   ReLU
mel_conv2dr/   r1   sample_ratewindow_sizehop_sizemel_binsrz   r{   classes_numr   r   rs   rt   ru   r|   r}   r~   r2   r   r   r(      sn   






zCnn14.__init__c                 C   "   t | j t| j t| j d S r4   r   r   r   r   r   r5   r   r   r   r/      r]   zCnn14.init_weightNc           !   	   C   s  | j r|d  dkrd|d td|d jd d< | j sA| |d j|dd}| |}|dd}| 	|}|dd}n|d j|dd}|d	 j|dd}t
|d }|dd}| 	|}|dd}| jd
v r(|ddddddddf   }t|dkr%||ddddddf   }| \}	}
}}||	|
 ||}t|d }| |}||	|
||d}t|d d}|d|k rtj|tj|	|||d f|dgdd}n|ddddd|f }|dd }| || |||< |d dddddddf }n|}n| jdv r0|}| jr9| |}| jrG|durGt||}| j r| jdv r|ddddddddf }|j\}}}}| j|ddd}t|dkr||ddddddf  }|d}|j\}}}}||| d||}| |}||||d|d|d}|d dd}| \}}}}|d|k rtj|tj||||d |f|jdgdd}n|ddddd|ddf }| || |||< |}n| j|ddd}tj|d| jd}| j |ddd}tj|d| jd}| j!|ddd}tj|d| jd}| j"|ddd}tj|d| jd}| j#|ddd}tj|d| jd}| j$|ddd}tj|d| jd}tj%|dd}tj&|dddd}tj'|dddd}|| }|dd}t(| )|}t*|d}tj+|dd\}}tj%|dd}|| }tj|d| jd}t(| )|}tj|d| jd}t,| -|}|||d} | S ) )
        Input: (batch_size, data_length)longerr   T)r   waveform)devicenon_blockingr   r   
mel_fusionr   N)r   r7   r   r_   )r   r7   r   r   r7   )r   r`   )r   r   r   r   r   r6   r8   rB   rC   )r   r7   r   r   r   皙?ptrainingr    r             ?clipwise_output	embeddingfine_grained_embedding).r   rf   rb   randintshaper   tor   	transposer   wherer   clone
contiguouslensizeviewpermuter   flattencatzerossqueezer   r   r   r   r   r   r   r;   dropoutr   r   r   r   r   mean
max_pool1d
avg_pool1dr<   r   r   r:   rh   r   )!r1   rA   mixup_lambdar   rD   longer_listlonger_list_idxnew_xfusion_x_localFBFCFTFFglobal_xBCHWlocal_xTHTBTC_TW	latent_x1	latent_x2latent_xlatent_outputrE   rF   r   r   output_dictr   r   r   rG      s    


($
.(

  

$2 
zCnn14.forwardFrk   NNrI   r   r   r2   r   rj      s
    Grj   c                       ri   )Cnn6Frk   c
                    s   t t|   d}
d}d}d}d}d }|| _|	| _t||||
||dd| _t||||||||dd	| _t	dd	d
d	d| _
td| _tddd| _tddd| _tddd| _tddd| _tjdddd| _tjd|dd| _|   d S )Nrl   Trm   r   rn   ro   rw   r   r7   r   r   r   r   r   r   r   r   )r'   r   r(   r   r   r   r   r   r   r   r   r   r,   r   rN   r   r   r   r   r   r   r   r/   r   r2   r   r   r(   N  s8   zCnn6.__init__c                 C   r   r4   r   r5   r   r   r   r/   w  r]   zCnn6.init_weightNc                 C   s  |  |}| |}|dd}| |}|dd}| jr#| |}| jr/|dur/t||}| j|ddd}tj	|d| jd}| j
|ddd}tj	|d| jd}| j|ddd}tj	|d| jd}| j|ddd}tj	|d| jd}tj|dd	}tj|dddd
}tj|dddd
}|| }|dd}t| |}t|d}tj|dd	\}	}
tj|dd	}|	| }tj	|d| jd}t| |}tj	|d| jd}t| |}|||d}|S )r   r   r   Nr6   r8   r   r   r   r`   r   r7      r   r   )r   r   r   r   r   r   r   r   r;   r   r   r   r   rb   r   r   r   r<   r   r   r:   rh   r   r1   rA   r   r   rD   r   r   r   r   rE   r   rF   r   r   r   r   r   r   rG   |  sB   





zCnn6.forwardr   r   rI   r   r   r2   r   r   M  s
    )r   c                       ri   )Cnn10Frk   c
                    s   t t|   d}
d}d}d}d}d }|| _|	| _t||||
||dd| _t||||||||dd	| _t	dd	d
d	d| _
td| _tddd| _tddd| _tddd| _tddd| _tddd| _tjdddd| _tjd|dd| _|   d S )Nrl   Trm   r   rn   ro   rw   r   r7   r   r   r   r   r   r   r   r   r   )r'   r   r(   r   r   r   r   r   r   r   r   r   r,   r   r   r   r   r   r   r   r   r   r   r/   r   r2   r   r   r(     s:   zCnn10.__init__c                 C   r   r4   r   r5   r   r   r   r/     r]   zCnn10.init_weightNc                 C   s  |  |}| |}|dd}| |}|dd}| jr#| |}| jr/|dur/t||}| j|ddd}tj	|d| jd}| j
|ddd}tj	|d| jd}| j|ddd}tj	|d| jd}| j|ddd}tj	|d| jd}| j|ddd}tj	|d| jd}tj|dd	}tj|dddd
}tj|dddd
}|| }|dd}t| |}t|d}tj|dd	\}	}
tj|dd	}|	| }tj	|d| jd}t| |}tj	|d| jd}t| |}|||d}|S )r   r   r   Nr6   r8   r   r   r   r`   r   r7   r   r   r   )r   r   r   r   r   r   r   r   r;   r   r   r   r   r   rb   r   r   r   r<   r   r   r:   rh   r   r   r   r   r   rG     sF   





zCnn10.forwardr   r   rI   r   r   r2   r   r     s
    *r   Frk   c                 C   sR   zt | j}|| j| j| j| j| j| j| j||d	}|W S    t	d| j d)N)	r   r   r   r   rz   r{   r   r   r   zImport Model for z7 not found, or the audio cfg parameters are not enough.)
eval
model_namer   r   r   r   rz   r{   	class_numRuntimeError)	audio_cfgr   r   
ModelProtomodelr   r   r   create_pann_model  s    
r  r   )osenvironrb   torch.nnr   torch.nn.functional
functionalr;   torchlibrosa.stftr   r   torchlibrosa.augmentationr   utilsr   r   r	   feature_fusionr
   r   r   r   r   Moduler   rN   rQ   rj   r   r   r  r   r   r   r   <module>   s&   
	.% <_b