o
    idO                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlm  m	Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ dd ZG d	d
 d
ejZdS )    N)compute_mask_indices)	EMAModule)GradMultiply)ConvFeatureExtractionModelTransformerEncoder)make_pad_maskc                 C   s    ||  }d||  }|||  S N    )startend	curr_steptotal_stepsrpct_remainingr
   r
   [/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/data2vec/data2vec_encoder.pyget_annealed_rate   s   r   c                k       s  e Zd Z															
	
	
	
									
		
																									
		dededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0ed1ed2ed3ed4ed5ed6ed7ed8ed9ed:ed;ed<ed=ed>ed?ed@edAedBedCedDedEedFedGedHedIedJedKedLedMedNefj fdOdPZdQdR ZdSdT Z			dfdUdVZ
dWejfdXdYZ							dgdZd[Zed\d] Zdhd^d_Zdid`daZdbefdcddZ  ZS )jData2VecEncoderN[(512,2,2)] + [(512,2,2)]transformerF         gelu皙?              ??
   staticr   r	   T      順    +?H.?{Gz?   
input_sizeextractor_modeconv_feature_layers
layer_typelayer_norm_firstencoder_layersencoder_embed_dimencoder_ffn_embed_dimencoder_attention_headsactivation_fndropoutattention_dropoutactivation_dropoutencoder_layerdropdropout_inputdropout_featuresfeature_grad_mult	mask_probmask_lengthmask_selection
mask_otherno_mask_overlapmask_min_spacerequire_same_masksmask_dropoutmask_channel_lengthmask_channel_probmask_channel_beforemask_channel_selectionmask_channel_otherno_mask_channel_overlapmask_channel_min_spaceconv_posconv_pos_groupspos_conv_depthmax_positionsaverage_top_k_layerslayer_norm_target_layerinstance_norm_target_layerinstance_norm_targetslayer_norm_targetsbatch_norm_target_layergroup_norm_target_layer	ema_decayema_end_decayema_anneal_end_stepema_transformer_onlyema_layers_onlymin_target_varmin_pred_var	loss_beta
loss_scalerequired_seq_len_multiplec6           7         sx  t    || _t|}6|6d d | _t|6d||d| _|| _|| _|| _	|| _
|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _ || _!|| _"| | _#|!| _$|"| _%|#| _&|$| _'t()t*+| j
, | _-t.di d| jd| j
d|5d| j&d	| j$d
| j%d| jd| j	d| jd| jd| jd| jd| jd| jd| jd| j'| _/t(0| j| j
| _1t(2| j| _t(2| j| _t*j(3| j| _4t(0| j
| j
| _5|%| _6|&| _7|'| _8|(| _9|)| _:|*| _;|+| _<|,| _=|-| _>|.| _?|/| _@|0| _A|1| _B|2| _Cd | _D|3| _E|4| _F|5| _Gd| _HtIJdK| jL d S )Nr   r   )conv_layersr2   modein_dr2   r.   r\   rJ   rH   rI   r+   r-   r/   r0   r3   r4   r1   r,   r5   rK   zData2VecEncoder settings: {}r
   )Msuper__init__r*   evalextractor_embedr   feature_extractorr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   nn	ParametertorchFloatTensoruniform_mask_embr   encoderLinearpost_extract_projDropout	LayerNorm
layer_norm
final_projrL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   emarZ   r[   r\   num_updateslogginginfoformat__dict__)7selfr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   feature_enc_layers	__class__r
   r   rb      s   
C		
zData2VecEncoder.__init__c                 C   s\   t  }| jrd| _| jj D ]\}}|d|  qt| jr#| jn| | jd|d| _	d S )NTz	pos_conv.)rS   ema_fp32	skip_keys)
setrW   rV   rl   pos_convnamed_parametersaddr   rS   rs   )ry   r~   k_r
   r
   r   make_ema_teacher   s   z Data2VecEncoder.make_ema_teacherc                 C   s   | j d u r| jd urtd |   n:| jrN| j d urN| j| jkr;|| jkr+| j}n
t	| j| j|| j}| j 
| | j  dk rN| j | jrK| jn|  || _d S )NzMaking EMA Teacherr	   )rs   rr   ru   rv   r   trainingrS   rT   rU   r   	set_decay	get_decaysteprV   rl   rt   )ry   rt   decayr
   r
   r   set_num_updates   s"   



zData2VecEncoder.set_num_updatesc                 C   sF  |j \}}}| jdkr7| jr7t||fd | j| j| j| j| j| jd}t	
||jdd|d}d||< | jdkrh|d u rbt||f|| j| j| j| jd| j| j| j| jd}t	
||j}| j||< nd }| jdkr| js|d u rt||fd | j| j| j| j| j| jd}t	
||jdd|d}d||< ||fS )Nr   )
no_overlap	min_spacer	   r]   )	min_masksr   r   r?   r@   )shaperB   rC   r   rA   rD   rE   rF   rG   rh   
from_numpytodevice	unsqueezeexpandr9   r:   r;   r<   r=   r>   r?   r@   rk   )ry   xpadding_maskmask_indicesmask_channel_indicesBTCr
   r
   r   
apply_mask   sh    

zData2VecEncoder.apply_maskinput_lengthsc                 C   sL   dd }t | j}tt|D ]}|||| d || d }q|tjS )zH
        Computes the output length of the convolutional layers
        c                 S   s   t | | t j| d S r   )rh   floorr   float32)input_lengthkernel_sizestrider
   r
   r   _conv_out_length;  s   zJData2VecEncoder._get_feat_extract_output_lengths.<locals>._conv_out_lengthr	   r'   )rc   r*   rangelenr   rh   long)ry   r   r   conv_cfg_listir
   r
   r    _get_feat_extract_output_lengths6  s   
z0Data2VecEncoder._get_feat_extract_output_lengthsc	                 C   s6  |d urt |d|j}	nd }	|}
| jdkr)| |
}
| jdkr(t|
| j}
nt  | |
}
W d    n1 s=w   Y  |
	dd}
| 
|
}
|	}|	d urd|	  d}| |}tj|
jd d |
j|
jd}	d|	tj|	jd |	jd|d f< d|	dgddg  }	nd }	| jd ur| |
}
d }| jr|
 }| |
}
|r| j|
|	||d	\}}n|
}d }| j||	|d
\}}|rd|	  d}||d fS i |	|d}t  | jj  | jr| jjj||	| j| j  d\}}||	|d}n
| jjj||dd}dd |d D }d}| j!s"| j"r+dd |D }d}| j"r6dd |D }| j!rAdd |D }|rKdd |D }| j#rVdd |D }| j$radd |D }t|t%| }| j&rzt'
|( |jdd  }| j)rt'*|( 	dd	dd}|s|	dd}|| }W d    n	1 sw   Y  || }| +|}|,d}| j-dkrt'j.|( |( ddjdd}nt'j/|( |( d| j-djdd}| j0d ur| j0}ndt12| }| | |d d< d |vr|3 |d < t  | 4||d!< | 4|( |d"< W d    n	1 s'w   Y  | j5d#kr[|d! | j6k r[t78d$|d! 9  d%| j6 d& t:d$|d! 9  d%| j6 d&| j5d#kr|d" | j;k rt78d'|d" 9  d%| j; d& t:d'|d" 9  d%| j; d&| jd ur| j< d( |d)< |S )*N)lengthsr   r   r	   r'   r]   )dtyper   )r   )r   r   )r   layer)lossesr   r   )r   	min_layer)r   r   layer_resultsF)sourcer   maskc                 S   s   g | ]}|d  qS )r'   r
   ).0lr
   r
   r   
<listcomp>  s    z+Data2VecEncoder.forward.<locals>.<listcomp>r   c                 S   s   g | ]	}| d ddqS )r	   r'   r   )permuter   tlr
   r
   r   r     s    Tc                 S   s"   g | ]}t j| d d ddqS )NT)running_meanrunning_varr   )F
batch_normfloatr   r
   r
   r   r     s    c                 S   s   g | ]	}t | qS r
   )r   instance_normr   r   r
   r
   r   r     s    c                 S   s   g | ]}| d dqS )r	   r'   )	transposer   r
   r
   r   r     s    c                 S   &   g | ]}t | |jd d qS )Nr   rq   r   r   r   r
   r
   r   r         c                 S   r   )r]   Nr   r   r
   r
   r   r     r   none)	reductiondim)r   betar   
regressionsample_size
target_varpred_vari  ztarget var is z < z	, exitingzpred var is i  rS   )=r   r   r   r8   re   r   applyrh   no_gradr   rq   r   sumr   zerosr   r   arangeflipcumsumboolrn   rV   cloner6   r   rl   rs   modelrc   extract_featuresr-   rL   rN   rQ   rR   rM   r   rP   r   r   rO   r   rr   sizerZ   mse_losssmooth_l1_lossr[   mathsqrtnumelcompute_varrt   rX   ru   erroritem	ExceptionrY   r   )ry   xs_padilensr   features_onlyr   r   r   padding_countr   featuresorig_padding_maskr   output_lengthspre_encoder_featuresr   r   encoder_out_lensresultytarget_layer_resultspermutedszlossscaler
   r
   r   forwardG  s  






$








C

"

"zData2VecEncoder.forwardc                 C   s   |  d| d} t rMt| d }| jdd}| d jdd}t| t| t| ||d  |d ||d    }t	|d 
 S t	| jddd 
 S )Nr]   r   r   r'   r	   gư>)viewr   distis_initializedrh   tensorcudar   
all_reducer   meanvar)r   zczszssr   r
   r
   r   r     s   


 zData2VecEncoder.compute_varc                 C   s   | j |||d|d}|S )NT)r   r   r   )r   )ry   r   r   r   r   resr
   r
   r   r      s   z Data2VecEncoder.extract_featuresc                    s@   d | _ d | _ d urt fddt| jjD | j_d S d S )Nc                 3   s     | ]\}}| kr|V  qd S Nr
   )r   r   r   
last_layerr
   r   	<genexpr>.  s    z=Data2VecEncoder.remove_pretraining_modules.<locals>.<genexpr>)rr   rs   rf   
ModuleList	enumeraterl   layers)ry   r   r
   r   r   remove_pretraining_modules*  s   
z*Data2VecEncoder.remove_pretraining_modulesreturnc                 C   s   | j S r   )r.   )ry   r
   r
   r   output_size2  s   zData2VecEncoder.output_size)5NNr   r   Fr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Fr	   Tr   r   r   Fr   r   Fr	   r    r!   r	   r"   r#   FFFFFFr$   r%   r"   TTr   r&   r   Nr'   )NN)NFTNNNN)FNr   )__name__
__module____qualname__intstrr   r   rb   r   r   r   rh   
LongTensorr   r   staticmethodr   r   r  r  __classcell__r
   r
   r{   r   r      sl   	
 "#$%&'(*+,-/0123456789:;<>?A 2
C
 I



r   )ru   r   rh   torch.distributeddistributedr   torch.nnrf   torch.nn.functional
functionalr   !funasr.models.data2vec.data_utilsr   !funasr.models.data2vec.ema_moduler   $funasr.models.data2vec.grad_multiplyr   funasr.models.data2vec.wav2vec2r   r   *funasr.models.transformer.utils.nets_utilsr   r   Moduler   r
   r
   r
   r   <module>   s   