o
    iB                    @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlm  mZ ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z) e$*e+Z,dd Z-dd Z.dd Z/diddZ0de
j1de
j1fddZ2ee!ddG dd de Z3ee!ddG d d! d!e Z4ee!G d"d# d#e Z5G d$d% d%ej6Z7G d&d' d'ej6Z8G d(d) d)ej6Z9G d*d+ d+ej6Z:G d,d- d-ej6Z;G d.d/ d/ej6Z<G d0d1 d1ej6Z=G d2d3 d3ej6Z>G d4d5 d5ej6Z?G d6d7 d7eZ@G d8d9 d9ej6ZAG d:d; d;ej6ZBG d<d= d=ej6ZCG d>d? d?ej6ZD	@	djdAej6dBe
j1dCe
j1dDe
j1dEee
j1 dFeEdGeEdHee
j1 fdIdJZFG dKdL dLej6ZGG dMdN dNej6ZHG dOdP dPej6ZIG dQdR dRej6ZJG dSdT dTej6ZKG dUdV dVeZLG dWdX dXej6ZMG dYdZ dZej6ZNe!G d[d\ d\eZOG d]d^ d^eOZPe!d_dG d`da daeOZQe!G dbdc dceOZRe!G ddde deeOZSe!G dfdg dgeOZTg dhZUdS )kzPyTorch CLAP model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )ClapAudioConfig
ClapConfigClapTextConfigc                 C   sJ   | j \}}}| dddddddf dd|d}|||| |}|S )ae  
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampled r'   c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/clap/modeling_clap.pyinterpolate+   s   
(r)   c                 C   sR   | j \}}}}| ||| ||| ||} | dddddd d|||}|S )aR  
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    r   r   r            r   viewpermute
contiguous)r!   window_sizer#   heightwidthnum_channelswindowsr'   r'   r(   window_partition<   s   $r7   c                 C   sN   | j d }| d|| || |||} | dddddd d|||} | S )a  
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    r-   r   r   r   r*   r+   r,   r.   )r6   r2   r3   r4   r5   r'   r'   r(   window_reverseQ   s   
$r8   c                 C   s6   |  | }tj|dd|| | }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxpast_key_values_lengthmaskincremental_indicesr'   r'   r(   "create_position_ids_from_input_idse   s   rF   logitsreturnc                 C   s"   t jt| | jd}tj| |S )Ndevice)r=   arangelenrJ   r   
functionalcross_entropy)rG   labelsr'   r'   r(   contrastive_lossw   s   rP   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                   @   j   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dS )ClapTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedslast_hidden_state.r!   
attentions)__name__
__module____qualname____doc__rT   r   r=   FloatTensor__annotations__rU   r!   tuplerV   r'   r'   r'   r(   rS   |   s   
 rS   zT
    ClapAudio model output to mimic the output of the original implementation.
    c                   @   rR   )ClapAudioModelOutputz
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    Naudio_embedsrU   .r!   rV   )rW   rX   rY   rZ   r_   r   r=   r[   r\   rU   r!   r]   rV   r'   r'   r'   r(   r^      s   
 r^   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )
ClapOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    Nlosslogits_per_audiologits_per_textrT   r_   text_model_outputaudio_model_outputrH   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))rd   re   N)getattrto_tuple).0kselfr'   r(   	<genexpr>   s
    
z&ClapOutput.to_tuple.<locals>.<genexpr>)r]   keysrj   r'   rj   r(   rg      s   zClapOutput.to_tuple)rW   rX   rY   rZ   ra   r   r=   r[   r\   rb   rc   rT   r_   rd   r   re   r]   r   rg   r'   r'   r'   r(   r`      s   
 r`   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )ClapDropPathz
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    Nc                    s   t    || _d S N)super__init__	drop_prob)rk   rr   	__class__r'   r(   rq      s   

zClapDropPath.__init__c                 C   sj   | j dks| js
|S d| j  }|jd fd|jd   }|tj||j|jd }|  |	|| }|S )N        r   r   )r   dtyperJ   )
rr   trainingr   ndimr=   randrw   rJ   floor_div)rk   r!   	keep_probr   random_tensoroutputr'   r'   r(   forward   s   
zClapDropPath.forwardro   )rW   rX   rY   rZ   rq   r   __classcell__r'   r'   rs   r(   rn      s    rn   c                       s.   e Zd ZdZdef fddZdd Z  ZS )ClapAudioAFFBlockz
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    configc                    s   t    |j}|j}t|| }ttj||ddddt|tj	ddtj||ddddt|| _
ttdtj||ddddt|tj	ddtj||ddddt|| _t | _d S )Nr   r   kernel_sizestridepaddingT)inplace)rp   rq   patch_embeds_hidden_sizeaff_block_rr<   r   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)rk   r   channelsdownsize_ratiointer_channelsrs   r'   r(   rq      s(   


	zClapAudioAFFBlock.__init__c                 C   sF   || }|  || | }| |}d| | d| d|   }|S )Nr*   r   )r   r   r   )rk   r!   residualattention_inputfused_layer_outputr   r'   r'   r(   r      s
   
zClapAudioAFFBlock.forwardrW   rX   rY   rZ   r   rq   r   r   r'   r'   rs   r(   r      s    r   c                       s0   e Zd ZdZdef fddZdddZ  ZS )	ClapAudioPatchEmbedz
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    r   c                    s  t    t|jtr|j|jfn|j}t|jtr |j|jfn|j}t|jtr/|j|jfn|j}|| _|| _|d |d  |d |d  f| _| jd | jd  | _	|j
| _|j| _|d |d  d |d |d  d f}| jrz|jdkrzdnd}tj|j| |j|||d| _|jrt|jnt | _| jrt|| _tj|j|j|d |d d f|d |d d f|d| _d S d S )Nr   r   r*   channel_mapr+   r   r   )rp   rq   
isinstance	spec_sizer<   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)rk   r   r   r   r   r   scale_factorrs   r'   r(   rq     s@   
"(
zClapAudioPatchEmbed.__init__Nc              
   C   s  | j r|d d ddd d d d f }|j\}}}}|| jd ks(|| jd kr?td| d| d| jd  d| jd  d	| |}|d}t|dkr||dd d d d d f  }	|	j\}}}}|	|| d||}	| 	|	}	|	j\}
}}}|	|||||}	|	
d d	}	|	d}tjj|	d|| fd
d}	| || |	||< |}n1|j\}
}
}}|| jd ks|| jd krtd| d| d| jd  d| jd  d	| |}| jr|ddd}| |}|S )Nr   r   zInput audio size (*z) doesn't match model (z).r-   )r   r*   r   r   r+   r   constantr*   )r   r   r   
ValueErrorr   sizerL   r1   r/   r   r0   r   r=   r   rM   padr   	transposer   )rk   r!   is_longer_idxglobal_hidden_statesr#   r5   r3   r4   output_widthlocal_hidden_states_featureslocal_widthr'   r'   r(   r   9  sF    (

 

(

zClapAudioPatchEmbed.forwardro   r   r'   r'   rs   r(   r   	  s    *r   c                       Z   e Zd Z fddZ			ddejdeej deej dee d	e	ej f
d
dZ
  ZS )ClapAudioSelfAttentionc                    s
  t    || dkrtd| d| d|| _t|| | _| j| j | _t|tj	j
r0|n||f| _ttd| jd  d d| jd  d  || _t| jd }t| jd }tt||gdd}t|d}|d d d d d f |d d d d d f  }	|	ddd }	|	d d d d df  | jd d 7  < |	d d d d df  | jd d 7  < |	d d d d df  d| jd  d 9  < |	d	}
| d
|
 tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _t|j| _ d S )Nr   The hidden size (6) is not a multiple of the number of attention heads ()r*   r   ij)indexingr-   relative_position_indexbias)!rp   rq   r   num_attention_headsr<   attention_head_sizeall_head_sizer   collectionsabcIterabler2   r   	Parameterr=   zerosrelative_position_bias_tablerK   stackr   r   r0   r1   sumregister_bufferLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropout)rk   r   r:   	num_headsr2   coords_hcoords_wcoordscoords_flattenrelative_coordsr   rs   r'   r(   rq   m  s8   
*,((,
zClapAudioSelfAttention.__init__NFr!   attention_mask	head_maskoutput_attentionsrH   c                 C   s  |j \}}}||d| jf}| ||dd}	| ||dd}
| ||dd}t|	|
dd}|t	
| j }| j| jd }|| jd | jd  | jd | jd  d}|ddd }||d }|d ur|j d }||| || j||}||dd }|d| j||}tjj|dd}| |}|d ur|| }t||}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr-   r   r*   r   r9   r   )r   r   r   r/   r   r   r   r=   matmulmathsqrtr   r   r2   r0   r1   	unsqueezer   r   rM   softmaxr   r   r   )rk   r!   r   r   r   r#   r:   r5   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputsr'   r'   r(   r     s@   &


zClapAudioSelfAttention.forwardNNFrW   rX   rY   rq   r=   Tensorr   r[   boolr]   r   r   r'   r'   rs   r(   r   l  s     (r   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )ClapAudioSelfOutputc                    s*   t    t||| _t|j| _d S ro   )rp   rq   r   r   denser   r   r   rk   r   r:   rs   r'   r(   rq     s   
zClapAudioSelfOutput.__init__r!   input_tensorrH   c                 C      |  |}| |}|S ro   r   r   rk   r!   r   r'   r'   r(   r     s   

zClapAudioSelfOutput.forwardrW   rX   rY   rq   r=   r   r   r   r'   r'   rs   r(   r     s    $r   c                       b   e Zd Z fddZdd Z			ddejdeej d	eej d
ee	 de
ej f
ddZ  ZS )ClapAudioAttentionc                    s2   t    t||||| _t||| _t | _d S ro   )rp   rq   r   rk   r   r   setpruned_heads)rk   r   r:   r   r2   rs   r'   r(   rq     s   
zClapAudioAttention.__init__c                 C      t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S Nr   r   r9   rL   r   rk   r   r   r  r   r   r   r   r   r   r   unionrk   headsindexr'   r'   r(   prune_heads     zClapAudioAttention.prune_headsNFr!   r   r   r   rH   c                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   rk   r   )rk   r!   r   r   r   self_outputsattention_outputr   r'   r'   r(   r     s   zClapAudioAttention.forwardr   rW   rX   rY   rq   r  r=   r   r   r[   r   r]   r   r   r'   r'   rs   r(   r    s"    r  c                       2   e Zd Z fddZdejdejfddZ  ZS )ClapAudioIntermediatec                    sJ   t    t|t|j| | _t|jt	rt
|j | _d S |j| _d S ro   )rp   rq   r   r   r<   	mlp_ratior   r   
hidden_actstrr	   intermediate_act_fnr   rs   r'   r(   rq     s
   
zClapAudioIntermediate.__init__r!   rH   c                 C   r   ro   r   r  rk   r!   r'   r'   r(   r   
     

zClapAudioIntermediate.forwardr  r'   r'   rs   r(   r        r  c                       r  )ClapAudioOutputc                    s4   t    tt|j| || _t|j| _	d S ro   )
rp   rq   r   r   r<   r  r   r   hidden_dropout_probr   r   rs   r'   r(   rq     s   
zClapAudioOutput.__init__r!   rH   c                 C   r   ro   r   r  r'   r'   r(   r     r  zClapAudioOutput.forwardr  r'   r'   rs   r(   r        r  c                       s   e Zd Zd fdd	Zdd Zdd Zd	d
 Z			ddejde	e
e
f deej dee dee de	ejejf fddZ  ZS )ClapAudioLayerru   r   c                    s   t    |j| _|| _|j| _|| _tj||jd| _	t
|||| jd| _|dkr.t|nt | _tj||jd| _t||| _t||| _d S )Neps)r2   ru   )rp   rq   chunk_size_feed_forward
shift_sizer2   input_resolutionr   r   layer_norm_epslayernorm_beforer  	attentionrn   r   	drop_pathlayernorm_afterr  intermediater  r   )rk   r   r:   r&  r   drop_path_rater%  rs   r'   r(   rq     s   
zClapAudioLayer.__init__c                 C   sD   t || jkr td| _tj rt t|nt || _d S d S Nr   )minr2   r   r%  r=   jit
is_tracingtensor)rk   r&  r'   r'   r(   set_shift_and_window_size,  s
   
 z(ClapAudioLayer.set_shift_and_window_sizec              	   C   s  | j dkrtjd||df||d}td| j t| j | j  t| j  d f}td| j t| j | j  t| j  d f}d}|D ]}	|D ]}
||d d |	|
d d f< |d7 }qEqAt|| j}|d| j| j }|d|d }||dkd|dkd}|S d }|S )Nr   r   rv   r-   r*   g      Yru   )	r%  r=   r   slicer2   r7   r/   r   masked_fill)rk   r3   r4   rw   rJ   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_maskr'   r'   r(   get_attn_mask4  s.   

zClapAudioLayer.get_attn_maskc                 C   sR   | j || j   | j  }| j || j   | j  }ddd|d|f}tj||}||fS r.  )r2   r   rM   r   )rk   r!   r3   r4   	pad_right
pad_bottom
pad_valuesr'   r'   r(   	maybe_padP  s
   zClapAudioLayer.maybe_padNFr!   input_dimensionsr   r   always_partitionrH   c                 C   s  |s|  | n	 |\}}| \}}	}
|}| |}|||||
}| |||\}}|j\}	}}}	| jdkrGtj|| j | j fdd}n|}t	|| j
}|d| j
| j
 |
}| j|||j|jd}| j||||d}|d }|d| j
| j
|
}t|| j
||}| jdkrtj|| j| jfdd}n|}|d dkp|d dk}|r|d d d |d |d d f  }|||| |
}|| | }| |}| |}|| | }|r||d	 f}|S |f}|S )
Nr   )r   r*   )shiftsdimsr-   rv   )r   r   r,   r   )r3  r   r(  r/   rB  r   r%  r=   rollr7   r2   r>  rw   rJ   r)  r8   r1   r*  r+  r,  r   )rk   r!   rC  r   r   rD  r3   r4   r#   r   r   shortcutrA  
height_pad	width_padshifted_hidden_stateshidden_states_windowsr=  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputsr'   r'   r(   r   W  sN   


$

zClapAudioLayer.forward)ru   r   NFF)rW   rX   rY   rq   r3  r>  rB  r=   r   r]   r<   r   r[   r   r   r   r'   r'   rs   r(   r!    s*    
r!  c                       sd   e Zd Z fddZ			ddejdeeef deej	 dee
 d	ee
 d
eej fddZ  ZS )ClapAudioStagec                    sh   t     | _| _t fddt|D | _|d ur,|tjd| _	nd | _	d| _
d S )Nc              
      s:   g | ]}t  | |d  dkrdn jd  dqS )r*   r   )r   r:   r&  r   r-  r%  )r!  r2   rh   ir   r:   r*  r&  r   r'   r(   
<listcomp>  s    	z+ClapAudioStage.__init__.<locals>.<listcomp>)r:   
norm_layerF)rp   rq   r   r:   r   
ModuleListrangeblocksr   
downsamplepointing)rk   r   r:   r&  depthr   r*  r]  rs   rW  r(   rq     s   
	
zClapAudioStage.__init__NFr!   rC  r   r   rD  rH   c                 C   s   |\}}t | jD ]\}}	|d ur|| nd }
|	|||
||}|d }q	|}| jd urE|d d |d d }}||||f}| ||}n||||f}|||f}|rZ||dd  7 }|S )Nr   r   r*   )	enumerater\  r]  )rk   r!   rC  r   r   rD  r3   r4   rV  layer_modulelayer_head_maskrR  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputsr'   r'   r(   r     s"   



zClapAudioStage.forwardrS  )rW   rX   rY   rq   r=   r   r]   r<   r   r[   r   r   r   r'   r'   rs   r(   rT    s$    
rT  c                	       sh   e Zd ZdZejfdee dedejddf fddZ	d	d
 Z
dejdeeef dejfddZ  ZS )ClapAudioPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    r&  r:   rY  rH   Nc                    sB   t    || _|| _tjd| d| dd| _|d| | _d S )Nr+   r*   Fr   )rp   rq   r&  r:   r   r   	reductionr   )rk   r&  r:   rY  rs   r'   r(   rq     s
   
zClapAudioPatchMerging.__init__c                 C   sF   |d dkp|d dk}|r!ddd|d d|d f}t j||}|S )Nr*   r   r   )r   rM   r   )rk   input_featurer3   r4   
should_padrA  r'   r'   r(   rB    s
   zClapAudioPatchMerging.maybe_padrj  rC  c                 C   s   |\}}|j \}}}|||||}| |||}|d d dd ddd dd d f }|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }t||	|
|gd}||dd| }| |}| |}|S )Nr   r*   r   r-   r+   )r   r/   rB  r=   catr   ri  )rk   rj  rC  r3   r4   r#   r:   r5   input_feature_0input_feature_1input_feature_2input_feature_3r'   r'   r(   r     s   $$$$

zClapAudioPatchMerging.forward)rW   rX   rY   rZ   r   r   r]   r<   Modulerq   rB  r=   r   r   r   r'   r'   rs   r(   rh    s
    **rh  c                       s   e Zd Z fddZdd Z							ddeej d	eej d
ee dee dee dee dee de	e
ef fddZ  ZS )ClapAudioEncoderc                    s  t    t j_ _t _ j_jj	_	 j
_
 j
 j _t jdjd   _dd tjd jt jddD jjfddtjD _t fd	dtjD _d
_t j_tj_ j_td_ d S )Nr*   r   c                 S   s   g | ]}|  qS r'   )item)rh   xr'   r'   r(   rX        z-ClapAudioEncoder.__init__.<locals>.<listcomp>r   cpurI   c                    s,   g | ]} d  d|   d d|  fqS )r   r*   r   r'   rU  )r   r'   r(   rX     s   , c                    s|   g | ]:}t  t jd |  j|  j|  j| t jd| t jd|d   |jd k r8tnddqS )r*   Nr   )r   r:   r&  r_  r   r*  r]  )	rT  r<   r   input_resolutionsdepthsr   r   
num_layersrh  )rh   i_layer)r   r-  rk   r'   r(   rX  #  s    
*F)!rp   rq   rL   rx  ry  r   r   patch_embedr   r   r   num_mel_bins
freq_ratior<   r   num_featuresr=   linspacer-  r   r   r[  rw  r   rZ  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpoolrk   r   rs   )r   r-  r   rk   r(   rq     s,   


$
zClapAudioEncoder.__init__c                 C   s   |j \}}}}t| j| j }| j| j }||ks||kr!td||k r1tjj|||fddd}||k rAtjj|||fddd}|j \}}}	}
|||| j |	| j |
}|	dddd
 }||||
| j |	| j }|S )	z
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r   r*   )r   r<   r   r}  r   r   rM   r)   r    r0   r1   )rk   normalized_input_featuresr   r$   freq_length
spec_widthspec_heightbatchr   timefreqr'   r'   r(   reshape_mel2img8  s,   z ClapAudioEncoder.reshape_mel2imgNFT	is_longerr   r   output_hidden_states(output_hidden_states_before_downsamplingrD  return_dictrH   c	           $      C   s  | dd}| |}	|	 dd}	d }
| jr%||j}t|dkd }
| |	}|jd }| 	||
}|r9dnd }|r?dnd }|rEdnd }| j
d }|rq|j\}}}|j|g||R  }|dddd}||f7 }||f7 }t| jD ]\}}|d ur|| nd }| j
| }||||||}|d }|d }|d }|d |d f}|r|r|j\}}}|j|g|d |d f|R  }|dddd}||f7 }||f7 }n'|r|s|j\}}}|j|g||R  }|dddd}||f7 }||f7 }|r||dd  7 }qv| |}|j\}}}|dt| jd   | jd  }|dt| jd   | jd  }|ddd ||||}|j\}}} }!| | j }"|||| |" |"|!}|ddddd |||"d}| t|d}#t|#d}#|std	d
 ||#||fD S t||#||dS )Nr   r   r   r*   r'   r   r-   r+   c                 s   s    | ]	}|d ur|V  qd S ro   r'   )rh   vr'   r'   r(   rl     s    z+ClapAudioEncoder.forward.<locals>.<genexpr>rU   pooler_outputr!   rV   )r   r  r   torJ   r=   wherer  r   r{  rw  r/   r0   r`  r  r   rL   rx  r   r1   r    r}  r  r   r]   r   )$rk   input_featuresr  r   r   r  r  rD  r  r  is_longer_list_idxis_longer_listr!   
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsrC  r#   r   hidden_sizereshaped_hidden_staterV  ra  rb  rR  rc  rf  rU   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputr'   r'   r(   r   \  s   











  
 zClapAudioEncoder.forward)NNFFFFT)rW   rX   rY   rq   r  r   r=   r[   r   r   r]   r^   r   r   r'   r'   rs   r(   rr    s6    ('	

rr  c                       s2   e Zd Zdeeef f fddZdd Z  ZS )ClapProjectionLayerr   c                    sH   t    || _|j}|j}t||| _t|j	 | _
t||| _d S ro   )rp   rq   r   r  projection_dimr   r   linear1r	   projection_hidden_act
activationlinear2)rk   r   r  r  rs   r'   r(   rq     s   
zClapProjectionLayer.__init__c                 C   s"   |  |}| |}| |}|S ro   )r  r  r  r  r'   r'   r(   r     s   


zClapProjectionLayer.forward)	rW   rX   rY   r   r   r   rq   r   r   r'   r'   rs   r(   r    s    
r  c                       s4   e Zd ZdZ fddZ	d
ddZdd	 Z  ZS )ClapTextEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _t|dd| _| jdt|jddd | jd	tj| j tjd
dd |j| _tj|j|j| jd| _	d S )N)rB   r"  position_embedding_typeabsoluteposition_ids)r   r-   T)
persistenttoken_type_ids)rw   )rp   rq   r   	Embedding
vocab_sizer  pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsr   r'  r   r  r   rf   r  r   r=   rK   expandr   r  r   r@   rB   r  rs   r'   r(   rq     s"   
zClapTextEmbeddings.__init__Nr   c                 C   s   |d u r|d urt || j|}n| |}|d ur| }n| d d }|d }|d u rTt| drI| jd d d |f }||d |}	|	}ntj|tj	| j
jd}|d u r]| |}| |}
||
 }| jdkrt| |}||7 }| |}| |}|S )Nr-   r   r  r   rv   r  )rF   rB   &create_position_ids_from_inputs_embedsr   hasattrr  r  r=   r   r@   r  rJ   r  r  r  r  r   r   )rk   rA   r  r  inputs_embedsrC   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  
embeddingsr  r'   r'   r(   r     s0   








zClapTextEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr-   r   rv   r   )r   r=   rK   rB   r@   rJ   r   r  )rk   r  r  sequence_lengthr  r'   r'   r(   r  .  s   	z9ClapTextEmbeddings.create_position_ids_from_inputs_embeds)NNNNr   )rW   rX   rY   rZ   rq   r   r  r   r'   r'   rs   r(   r    s    
(r  ru   moduler   r   r   r   scalingr   r   c                 K   s   t ||dd| }	|d ur'|d d d d d d d |jd f }
|	|
 }	tjj|	dt jd|j	}	tjj
|	|| jd}	|d urM|	|dddd }	t |	|}|dd }||	fS )Nr*   r   r   r-   )r:   rw   )prx   r   )r=   r   r   r   r   rM   r   float32r  rw   r   rx   r/   r1   )r  r   r   r   r   r  r   r   kwargsattn_weightscausal_maskattn_outputr'   r'   r(   eager_attention_forwardA  s   &r  c                       r   )ClapTextSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _t
|j| _|j| _| jd | _d S )Nr   embedding_sizer   r   r         )rp   rq   r  r   r  r   r   r<   r   r   r   r   r   r   r   r   r   r   attention_dropoutr  r  rs   r'   r(   rq   ^  s"   

zClapTextSelfAttention.__init__NFr!   r   r   r   rH   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}	| ||dd}
t}| jj	dkrCt
| jj	 }|| ||	|
|f| jsOdn| j| j|d|\}}|jg |dR   }|rp||f}|S |f}|S )Nr-   r   r*   eagerru   )r   r  r   )r   r   r   r/   r   r   r   r  r   _attn_implementationr   rx   r  r  r    r1   )rk   r!   r   r   r   r  r  r   query_states
key_statesvalue_statesattention_interfacer  r  r   r'   r'   r(   r   s  s4   	
zClapTextSelfAttention.forwardr   r   r'   r'   rs   r(   r  ]  s     r  c                       r   )ClapTextSelfOutputc                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S Nr"  )rp   rq   r   r   r  r   r   r'  r   r  r   r  rs   r'   r(   rq        
zClapTextSelfOutput.__init__r!   r   rH   c                 C   &   |  |}| |}| || }|S ro   r   r   r   r  r'   r'   r(   r        

zClapTextSelfOutput.forwardr  r'   r'   rs   r(   r        $r  c                       r  )ClapTextAttentionc                    s*   t    t|| _t|| _t | _d S ro   )rp   rq   r  rk   r  r   r  r  r  rs   r'   r(   rq     s   


zClapTextAttention.__init__c                 C   r  r  r	  r  r'   r'   r(   r    r  zClapTextAttention.prune_headsNFr!   r   r   r   rH   c           	      K   s@   | j |f|||d|}| |d |}|f|dd   }|S N)r   r   r   r   r   r  )	rk   r!   r   r   r   r  r  r  r   r'   r'   r(   r     s   zClapTextAttention.forwardr   r  r'   r'   rs   r(   r    s"    r  c                       r  )ClapTextIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S ro   )rp   rq   r   r   r  intermediate_sizer   r   r  r  r	   r  r  rs   r'   r(   rq     s
   
zClapTextIntermediate.__init__r!   rH   c                 C   r   ro   r  r  r'   r'   r(   r     r  zClapTextIntermediate.forwardr  r'   r'   rs   r(   r    r  r  c                       r   )ClapTextOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r  )rp   rq   r   r   r  r  r   r   r'  r   r  r   r  rs   r'   r(   rq     r  zClapTextOutput.__init__r!   r   rH   c                 C   r  ro   r  r  r'   r'   r(   r     r  zClapTextOutput.forwardr  r'   r'   rs   r(   r    r  r  c                       sb   e Zd Z fddZ			ddejdeej deej dee d	e	ej f
d
dZ
dd Z  ZS )ClapTextLayerc                    s:   t    |j| _d| _t|| _t|| _t|| _	d S )Nr   )
rp   rq   r$  seq_len_dimr  r)  r  r,  r  r   r  rs   r'   r(   rq     s   


zClapTextLayer.__init__NFr!   r   r   r   rH   c           
      K   sP   | j |f|||d|}|d }|dd  }t| j| j| j|}	|	f| }|S r  )r)  r   feed_forward_chunkr$  r  )
rk   r!   r   r   r   r  self_attention_outputsr  r   rQ  r'   r'   r(   r     s    
zClapTextLayer.forwardc                 C   s   |  |}| ||}|S ro   )r,  r   )rk   r  intermediate_outputrQ  r'   r'   r(   r    s   
z ClapTextLayer.feed_forward_chunkr   )rW   rX   rY   rq   r=   r   r   r[   r   r]   r   r  r   r'   r'   rs   r(   r    s"    
r  c                       sz   e Zd Z fddZe					ddejdeej deej d	ee	 d
ee	 dee	 de
eej ef fddZ  ZS )ClapTextEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r'   )r  rU  r   r'   r(   rX  !  ru  z,ClapTextEncoder.__init__.<locals>.<listcomp>F)	rp   rq   r   r   rZ  r[  num_hidden_layerslayerr  r  rs   r  r(   rq     s   
 
zClapTextEncoder.__init__NFTr!   r   r   r   r  r  rH   c                 K   s   |rdnd }|r
dnd }	t | jD ].\}
}|r||f }|d ur$||
 nd }|d||||d|}|d }|r?|	|d f }	q|rG||f }t|||	dS )Nr'   )r!   r   r   r   r   r   )rU   r!   rV   )r`  r  r   )rk   r!   r   r   r   r  r  r  r  r  rV  ra  rb  rR  r'   r'   r(   r   $  s2   

zClapTextEncoder.forward)NNFFT)rW   rX   rY   rq   r   r=   r   r   r[   r   r   r]   r   r   r   r'   r'   rs   r(   r    s.    	r  c                       r  )ClapTextPoolerc                    s*   t    t|j|j| _t | _d S ro   )rp   rq   r   r   r  r   Tanhr  r  rs   r'   r(   rq   P  s   
zClapTextPooler.__init__r!   rH   c                 C   s(   |d d df }|  |}| |}|S r.  )r   r  )rk   r!   first_token_tensorpooled_outputr'   r'   r(   r   U  s   

zClapTextPooler.forwardr  r'   r'   rs   r(   r  O  r   r  c                   @   s.   e Zd ZU eed< dZdZdejfddZ	dS )ClapPreTrainedModelr   clapFr  c                 C   sZ  | j j}t|tr#|jjjjd|d d |jjjjd|d d dS t|t	rB|j
jt| j j |jjt| j j dS t|tjrU|jjjd|d d dS t|tjtjfrm|jj  |jjd dS t|tjtjfr| j jd d| j j d  | }tjj|j|d |jdur|jj  dS dS t|tr|jj  dS dS )	zInitialize the weightsru   g{Gz?)meanstdg      ?r  r*   )r  N)r   initializer_factorr   r  r  weightdatanormal_r  	ClapModellogit_scale_afill_r   loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   zero_r   r   r  r  initr   r   )rk   r  factorin_proj_stdr'   r'   r(   _init_weightsd  s*   

 

z!ClapPreTrainedModel._init_weightsN)
rW   rX   rY   r   r\   base_model_prefixsupports_gradient_checkpointingr   rq  r  r'   r'   r'   r(   r  ^  s
   
 r  c                       s   e Zd ZU eed< dZdef fddZdejfddZ	e
					ddeej d	eej d
ee dee dee deeef fddZ  ZS )ClapAudioModelr   r  c                    s"   t  | t|| _|   d S ro   )rp   rq   rr  audio_encoder	post_initr  rs   r'   r(   rq     s   
zClapAudioModel.__init__rH   c                 C   
   | j jjS ro   )r
  r{  r   rj   r'   r'   r(   get_input_embeddings     
z#ClapAudioModel.get_input_embeddingsNr  r   r  r  c                 C   sP   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j|||||dS )ae  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```Nr  r  r   r  r  )r   use_return_dictr   r  r
  )rk   r  r  r   r  r  r'   r'   r(   r     s   zClapAudioModel.forwardNNNNN)rW   rX   rY   r   r\   main_input_namerq   r   rq  r  r   r   r=   r[   
BoolTensorr   r   r]   r   r   r   r'   r'   rs   r(   r	  |  s0   
 
r	  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                       s   e Zd ZU eed< d fdd	Zdd Zdd Zee																			dd
e
ej de
ej de
ej de
ej de
ej de
ej de
e de
e de
e deeej ef fddZ  ZS )ClapTextModelr   Tc                    sD   t  | || _t|| _t|| _|rt|nd| _| 	  dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rp   rq   r   r  r  r  encoderr  poolerr  )rk   r   add_pooling_layerrs   r'   r(   rq     s   

zClapTextModel.__init__c                 C   s   | j jS ro   r  r  rj   r'   r'   r(   r    s   z"ClapTextModel.get_input_embeddingsc                 C   s   || j _d S ro   r  rk   r   r'   r'   r(   set_input_embeddings     z"ClapTextModel.set_input_embeddingsNrA   r   r  r  r   r  r   r  r  rH   c
                 C   s  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|
\}}|d urU|jn|j}|d u retj	||f|d}|d u rt
| jdr| jjd d d |f }|||}|}n	tj|
tj|d}| ||
}| || j j}| j||||d}| j|||||dd	}|d
 }| jd ur| |nd }t|||j|jdS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer-   z5You have to specify either input_ids or inputs_embedsrI   r  rv   )rA   r  r  r  T)r   r   r   r  r  r   r  )r   r   r  r  r   %warn_if_padding_and_no_attention_maskr   rJ   r=   onesr  r  r  r  r   r@   get_extended_attention_maskget_head_maskr  r  r  r   r!   rV   )rk   rA   r   r  r  r   r  r   r  r  r  r#   r  rJ   r  r  extended_attention_maskembedding_outputencoder_outputssequence_outputr  r'   r'   r(   r     s\   
zClapTextModel.forward)T	NNNNNNNNN)rW   rX   rY   r   r\   rq   r  r  r   r   r   r=   r   r   r   r]   r   r   r   r'   r'   rs   r(   r    sJ   
 	
r  c                       s$  e Zd ZU eed< def fddZe e		ddej	de
ej	 de
ej	 dejfd	d
Ze e		ddej	de
ej	 de
ej	 dejfddZee									dde
ej de
ej de
ej de
ej	 de
ej de
e de
e de
e de
e deeef fddZ  ZS )r  r   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}t	
tt|j| _t	
tt|j| _|j| _t|| _t|| _t|| _t|| _|   d S )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )rp   rq   r   text_configr   	TypeErrortypeaudio_configr   r   r   r=   r2  r   r  r   r  r  r  r  
text_modelr  text_projectionr	  audio_modelaudio_projectionr  )rk   r   r&  r)  rs   r'   r(   rq   .  s.   



zClapModel.__init__NrA   r   r  rH   c                 C   s.   | j |||d}| |j}tj|dd}|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)rA   r   r  r-   r9   )r*  r+  r  F	normalize)rk   rA   r   r  text_outputstext_featuresr'   r'   r(   get_text_featuresN  s   zClapModel.get_text_featuresr  r  c                 C   s,   | j ||d}| |j}tj|dd}|S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))

        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     audio_features = model.get_audio_features(**inputs)
        ```)r  r  r-   r9   )r,  r-  r  r.  r/  )rk   r  r  r   audio_outputsaudio_featuresr'   r'   r(   get_audio_featuresp  s   zClapModel.get_audio_featuresreturn_lossr   r  r  c
              	   C   s@  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	| j||||dd}
| j|||||dd}|	s9|
d n|
j}| |}|	sG|d n|j}| |}||j	dddd }||j	dddd }| j
 }| j }t|| | }t|| | }d}|rt|}t| }|| d	 }t|||||||
d
S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```NTr  rA   r   r  r   r  r  r   r*   r-   )r  r:   keepdimg       @)ra   rb   rc   rT   r_   rd   re   )r   r   r  r  r,  r*  r  r-  r+  r   r  expr  r=   r   trP   r`   )rk   rA   r  r  r   r  r6  r   r  r  r3  r0  r_   rT   logit_scale_textlogit_scale_audiorc   rb   ra   caption_loss
audio_lossr'   r'   r(   r     sV   *	



zClapModel.forward)NNr$  )rW   rX   rY   r   r\   rq   r   r   r=   r   r   r[   r2  r5  r   
LongTensorr  r   r   r]   r`   r   r   r'   r'   rs   r(   r  *  sz   
   %	

r  c                       s   e Zd ZU eed< def fddZdejfddZdd Z	e
e												dd
eej deej deej dee dee dee deeef fddZ  ZS )ClapTextModelWithProjectionr   c                    ,   t  | t|| _t|| _|   d S ro   )rp   rq   r  r*  r  r+  r  r  rs   r'   r(   rq        

z$ClapTextModelWithProjection.__init__rH   c                 C   r  ro   r*  r  r  rj   r'   r'   r(   r    r  z0ClapTextModelWithProjection.get_input_embeddingsc                 C   s   || j j_d S ro   rC  r  r'   r'   r(   r    s   z0ClapTextModelWithProjection.set_input_embeddingsNrA   r   r  r   r  r  c           
      C   s\   |dur|n| j j}| j|||||dd}|s|d n|j}| |}	t|	|j|j|jdS )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```NTr7  r   )rT   rU   r!   rV   )	r   r  r*  r  r+  rS   rU   r!   rV   )
rk   rA   r   r  r   r  r  r0  r  rT   r'   r'   r(   r   
  s"   	
z#ClapTextModelWithProjection.forward)NNNNNN)rW   rX   rY   r   r\   rq   r   rq  r  r  r   r   r   r=   r   r   r   r]   rS   r   r   r'   r'   rs   r(   r@    s8   
 
r@  c                       s   e Zd ZU eed< dZdef fddZdejfddZ	e
e					ddeej d	eej d
ee dee dee deeef fddZ  ZS )ClapAudioModelWithProjectionr   r  c                    rA  ro   )rp   rq   r	  r,  r  r-  r  r  rs   r'   r(   rq   ?  rB  z%ClapAudioModelWithProjection.__init__rH   c                 C   s   | j jjjS ro   )r,  r
  r{  r   rj   r'   r'   r(   r  F  r  z1ClapAudioModelWithProjection.get_input_embeddingsNr  r   r  r  c           	      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||||dd}|s.|d n|j}| |}t||j|j	|j
dS )av  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```NTr  r   )r_   rU   rV   r!   )r   r  r   r  r,  r  r-  r^   rU   rV   r!   )	rk   r  r  r   r  r  r3  r  r_   r'   r'   r(   r   I  s&   
z$ClapAudioModelWithProjection.forwardr  )rW   rX   rY   r   r\   r  rq   r   rq  r  r   r   r   r=   r[   r  r   r   r]   r^   r   r   r'   r'   rs   r(   rD  :  s2   
 
rD  )r  r  r  r@  r	  rD  )r   )ru   N)VrZ   r   r   dataclassesr   typingr   r   r   r   r=   torch.nn.functionalr   rM   r.  activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   r   utilsr   r   r   r   r   r   configuration_clapr   r   r   
get_loggerrW   loggerr)   r7   r8   rF   r   rP   rS   r^   r`   rq  rn   r   r   r   r   r  r  r  r!  rT  rh  rr  r  r  floatr  r  r  r  r  r  r  r  r  r  r	  r  r  r@  rD  __all__r'   r'   r'   r(   <module>   s    

$(c`'~=6 Fa
;.)2;e O@G