o
    ߥif8                     @   st   d dl Z d dlm  mZ ddlmZmZmZ dZ	dddZ
				
dddZdd Z									dddZdS )    N   )GaborSTRFConvMelScaleModulationDomainLossModule:0yE>psmiamc                 C   s  t | r| j| j}}n	| d | d }}t |r#|j|j}}n	|d |d }}|dkrSt |d |d  }t |d |d  }	|	|t  }
t |
ddS |dkrr|d |d  }|| ||  |t  }t |ddS |dkr|d |d  }|| ||  |t  }t |d |d  }t |d |d  }	|	|t  }
||
 }t |ddS |d	kr|d |d  }|| ||  |t  }|| ||  |t  }t || |}t || |}||fS d
S )zL
        stft: (batch, ..., 2) or complex(batch, ...)
        y = x + n
    .r   .r   iam   r   r   psmr   crmN)torch
is_complexrealimagsqrtEPSclamp)
mixed_spec
clean_spec	mask_typeclipyryixrxiymagxmagr
   ypowr   r   mrmi r"   \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/aec/network/loss.pycompute_mask   s>   

r$     nTc                 C   sJ   t j| d d | d d  dd}||k}t |dk||k}d||< |S )z
        energy based vad should be accurate enough
        spec: (batch, bins, frames, 2)
        returns (batch, frames)
    r   r   r	   r   dimr         ?)r   sumlogical_and)specthdhighthdlowint16energyvadidxr"   r"   r#   
energy_vad7   s
   "	r3   c                 C   sh   t dd }tdddd}|| t| }| D ]}d|_qtdd| d	 d
 d	 }||fS )Nz"./network/gabor_strf_parameters.pt
state_dict   <   )supnsupknkernFP   i>  r   r   )n_melssample_raten_stft)
r   loadr   load_state_dictr   eval
parametersrequires_gradr   cuda)n_fftgabor_strf_parametersgabor_modulation_kernelsmodulation_loss_moduleparamstft2melr"   r"   r#   modulation_loss_initG   s$   
rJ   psm_lossmseF  @  c	                    sx  d urt d  tjddd+fdd	
fddfdd	d
d 	
fdd}	d,
fdd	d,
fdd	}
d,
fdd	}	 	d- 
fdd	}d, 
fdd	}d,
fdd	fdd}fdd}	
fdd }| d!krS | d"kr|S | d#kr|S | d$krS | d%kr|
S | d&kr|S | d'kr|S | d(kr|S | d)kr|	S t d* d S ).NzUse loss weight: F)periodicc              	          t j|  | jd|dS )NF)windowcenterreturn_complex)r   stfttodevice)xrS   
hop_lengthrD   rQ   winlenr"   r#   rT   g   s   
z mask_loss_function.<locals>.stftc              	      rP   )NF)rQ   rR   length)r   istftrU   rV   )rW   slenrX   r"   r#   r\   r   s   
z!mask_loss_function.<locals>.istftc              	      s  t  ! t | }t|D ]\}}d|||dddf< qW d   n1 s(w   Y  || }| | } du r<d}n|  } dkrTdt |t | | d  }n+ dkret |t | |  }ndt |t | | d d| t | |    }|t | }|S )	z" [Batch, Time, Frequency]
        r   Nr   rL   r)   r   maeg?)r   no_grad	ones_like	enumerater*   powabs)targetsmasksnframesmask_for_lossr2   numalphaloss)	loss_typeweightr"   r#   	mask_loss|   s(   

 z%mask_loss_function.<locals>.mask_lossc                 S   s$  t  # t | d }t|D ]\}}d|||dddf< qW d   n1 s*w   Y  |d | }|d | }| d | }| d | }	t |d d |d d  | }
t | d d | d d  | }t t || dt ||	 d }t t |
| d}|| t | }|S )z% [Batch, Time, Frequency, 2]
        r   r   Nr	   r   )r   r_   r`   ra   r   r*   rb   )rd   r,   rf   rg   r2   rh   r   r   r   r   r   r   loss1loss2rj   r"   r"   r#   spectrum_loss   s(   
"
&z)mask_loss_function.<locals>.spectrum_lossc                    s    |  g dd } | g dd }t # t|d }t|D ]\}}d|||d d d f< q&W d    n1 s@w   Y  |d d |d d  d |d  }	|d d |d d  d }
|	| }	|
| }
tt|	|
 dt| }|S )	Nr   r   r         r   r   r   r	   g333333?g333333?)permuter   r_   r`   ra   r*   rb   )mixedcleanre   rf   yspecxspecrg   r2   rh   emagr   rj   )rT   r"   r#   sa_loss_dlen   s   
$ z(mask_loss_function.<locals>.sa_loss_dlenc                    s   | }|}t ||}|ddd} |||}|d urqt|}	t ' t|d d d d df }
t|D ]\}}d|
||d f< q:W d    n1 sQw   Y  |d d d d df |
 }|	|
 }	t||	}|| S |S )Nr   r   r   )	r$   rt   r3   r   r_   r`   ra   Fbinary_cross_entropy)ru   rv   re   rf   subtaskr   r   rd   rj   
vadtargetsrg   r2   rh   loss_vad)rm   r   rT   r"   r#   psm_vad_loss_dlen   s$   
z-mask_loss_function.<locals>.psm_vad_loss_dlenc                    s&  | d}|d}t |}t |}t  ! t |}	t|D ]\}
}d|	|
d d |d f< q"W d    n1 s<w   Y  ||	 }||	 |g d }t t t|d ddd }t t t|d ddd }t	|| }|ddd}dt
||| }| ||||}|| }|S )NTr   r   r   r   r   r   r   g?)r   rc   r_   r`   ra   rt   log	transposerI   r$   rG   )ru   rv   re   rf   r}   r   r   enhanced_mag	clean_magrg   r2   rh   clean_log_melenhanced_log_melri   rj   ro   )r   r   rT   r"   r#   modulation_loss   s4   





z+mask_loss_function.<locals>.modulation_lossc                    s   | d } |d }| }t  ! t |}t|D ]\}}d|||d d d f< qW d    n1 s4w   Y  || }	||	g dd }
 |
|jd }t||}|S )Nrs   r   r   rr   r   )r   r_   r`   ra   rt   	unsqueezeshapewav2vec_loss_module)ru   rv   re   rf   r}   r   rg   r2   rh   	masks_estestimate	est_cleanrj   )r\   rT   r"   r#   wav2vec_loss   s   


z(mask_loss_function.<locals>.wav2vec_lossTc                    s  | }t  ! t |}t|D ]\}}	d|||	d d d f< qW d    n1 s,w   Y  || }
||
g dd }||jd }t|jd |jd }|d d d |f }|d d d |f }|r~|t j|ddd }|t j|ddd }t j	|| ddd}t j	|d ddd  }|| | }|| }t j	|d ddt j	|d dd   }d	t 
|   }| }|S )
Nr   r   rr   r   T)r(   keepdimr   r'   )r   r_   r`   ra   rt   r   r   minmeanr*   log10)ru   rv   re   rf   r}   	zero_meanr   rg   r2   rh   r   r   r   flendots_clean_energyscaled_cleane_noisesisdrrj   )r   r\   rT   r"   r#   sisdr_loss_dlen   s8   

z+mask_loss_function.<locals>.sisdr_loss_dlenc                    s  | }|}t  ! t |}t|D ]\}}	d|||	d d d f< qW d    n1 s0w   Y  || }
||
g dd }|d |d  |d |d   }|d |d  |d |d   }t j|d|dgdd}|d d |d d    }|| |d }|| }t j|d d |d d  d	d}t j|d d |d d  d	d}t j|d	dt j|d	d   }d
t |   }|	 }|S )Nr   r   rr   r   r	   r'   r   r   r   )
r   r_   r`   ra   rt   r   catr*   r   r   )ru   rv   re   rf   r}   r   r   rg   r2   rh   r   r   dot_realdot_imagr   r   r   r   scaled_clean_energye_noise_energyr   rj   )r   rT   r"   r#   sisdr_freq_loss_dlen  sL   



z0mask_loss_function.<locals>.sisdr_freq_loss_dlenc              	      s   |  g d} | g d}|d }|d }t||dd\}}t|jd d }	t # t|d }
t|D ]\}}d|
||d d d f< q<W d    n1 sVw   Y  |dd |	f |
 }|d|	d f |
 }||
 }||
 }d u r~d	}n| }|d }|d
 }t|t	|| ||  |d  d t|t	|| ||  |d
  d  }t|t	|| d t|t	|| d  }d||  t| }|S )Nrq   rs   r   )r   r   r   r   .r   r	   r)   )
rt   r$   intr   r   r_   r`   ra   r*   rb   )ru   rv   re   rf   r}   r   r   tgt_mrtgt_miDrg   r2   rh   r    r!   ri   r   r   rn   ro   rj   )rT   rl   r"   r#   crm_loss_dlen@  s:   
((z)mask_loss_function.<locals>.crm_loss_dlenc                    s    | d |d ||S )Nr   r"   )ru   rv   re   rf   )r   r"   r#   crm_miso_loss_dlen`  s   z.mask_loss_function.<locals>.crm_miso_loss_dlenc           	   	      s   | j d }|j d | } | d |d |dd |f |}td|D ]!} | d|f |d|f |d|| || | f |}|| }q"|| S )Nr   r   r   .r   )r   range)	ru   rv   re   rf   chsr   rj   chrn   )r   r"   r#   mimo_loss_dlenc  s   

z*mask_loss_function.<locals>.mimo_loss_dlenc                    sf   | g d}|d }|jd d }tj|dd |d f |d|d d f gdd} |||}|S )Nrq   rs   r   .r   r'   )rt   r   r   r   )ru   rv   r,   rf   r   r   spec_estrj   )rp   rT   r"   r#   spec_loss_dlenn  s   &z*mask_loss_function.<locals>.spec_loss_dlenr   r   r   r   r   r   r   r   rz   zerror loss func)F)N)NT)printr   hamming_window)	loss_funcrk   r   use_mod_lossuse_wav2vec_lossrD   rY   r   rl   rz   r   r   r   r   r   r   r   r"   )r   r   rY   r\   rk   rm   r   rD   r   rp   rT   rl   rQ   rZ   r#   mask_loss_functionX   sP   

%! 
r   )r   r   )r%   r&   T)	rK   rL   r   FFrM   rN   r   N)r   torch.nn.functionalnn
functionalr{   r   r   r   r   r   r$   r3   rJ   r   r"   r"   r"   r#   <module>   s(   
,
