o
    ´©iY#  ã                   @   sj   d dl Zd dlZdd„ Zdejfdd„Zddd„Zdd	d
„Zddd„Zdd„ Z		ddd„Z
	ddd„ZdS )é    Nc                 C   sT   |  d¡rd} n|  d¡rd} nd| d  ¡ > }t|d ƒd } d| d |  }|S )NÚlogmel23é   Úlogmelé(   é   é   )Ú
startswithÚ
bit_lengthÚint)Ú
frame_sizeÚcontext_sizeÚtransform_typeÚfft_sizeÚ	input_dim© r   úT/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/eend/utils/feature.pyÚget_input_dim
   s   

r   c                 C   sÐ  t  | ¡} |s	nZ|dkrt  t  | d¡¡} nK|dkrEd| jd d  }d}d}tj |||¡}t  | d |j	¡} t  
t  | d¡¡} n|dkrqd| jd d  }d	}d
}tj |||¡}t  | d |j	¡} t  
t  | d¡¡} nò|dkr¨d| jd d  }d	}d
}tj |||¡}t  | d |j	¡} t  
t  | d¡¡} t j| dd}| | } n»|dkrd| jd d  }d	}d
}tj |||¡}t  | d |j	¡} t  
t  | d¡¡} t j| dd}t  |¡t  |¡ d }	tdƒD ]}
t  |||	k ¡t  |||	k  ¡ d }	qët j| ||	kdd…f dd}| | } nM|dkr]d| jd d  }d	}d
}tj |||¡}t  | d |j	¡} t  
t  | d¡¡} t j| dd}| | } t  t j| ddd¡}| | } ntd| ƒ‚|  |¡S )a  Transform STFT feature

    Args:
        Y: STFT
            (n_frames, n_bins)-shaped np.complex array
        transform_type:
            None, "log"
        dtype: output data type
            np.float32 is expected
    Returns:
        Y (numpy.array): transformed feature
    Úlogg»½×Ùß|Û=r   r   r   i€>  r   r   i@  r   Úlogmel23_mnr   )ÚaxisÚlogmel23_swng       @é
   NÚlogmel23_mvnzUnknown transform_type: %s)ÚnpÚabsr   ÚmaximumÚshapeÚlibrosaÚfiltersÚmelÚdotÚTÚlog10ÚmeanÚsumÚmaxÚminÚrangeÚstdÚ
ValueErrorÚastype)ÚYr   ÚdtypeÚn_fftÚsrÚn_melsÚ	mel_basisr#   ÚpowersÚthÚir(   r   r   r   Ú	transform   sl   


*



r4   r   c                 C   s$   | dd|… }|dd|… }||fS )zFrame subsamplingNr   )r+   r!   ÚsubsamplingÚY_ssÚT_ssr   r   r   Ú	subsampled   s   r8   c                 C   sd   t  | ||fdgd¡}t jjjt  |¡| jd | jd d| d  f| j| jd  | jfdd}|S )ac  Frame splicing

    Args:
        Y: feature
            (n_frames, n_featdim)-shaped numpy array
        context_size:
            number of frames concatenated on left-side
            if context_size = 5, 11 frames are concatenated.

    Returns:
        Y_spliced: spliced feature
            (n_frames, n_featdim * (2 * context_size + 1))-shaped
    )r   r   Úconstantr   r   r   F)Ú	writeable)r   ÚpadÚlibÚstride_tricksÚ
as_stridedÚascontiguousarrayr   Úitemsize)r+   r   ÚY_padÚ	Y_splicedr   r   r   Úsplicek   s   ürC   é   é   c                 C   sP   d|d   ¡ > }t| ƒ| dkrtj| |||djdd… S tj| |||djS )aR  Compute STFT features

    Args:
        data: audio signal
            (n_samples,)-shaped np.float32 array
        frame_size: number of samples in a frame (must be a power of two)
        frame_shift: number of samples between frames

    Returns:
        stft: STFT frames
            (n_frames, n_bins)-shaped np.complex64 array
    r   r   )r-   Ú
win_lengthÚ
hop_lengthNéÿÿÿÿ)r	   Úlenr   Ústftr!   )Údatar   Úframe_shiftr   r   r   r   rJ   ƒ   s   ÿrJ   c                 C   s(   dt | | ƒ }| | dkr|d }|S )Nr   r   )r
   )Údata_lenÚsizeÚshiftÚn_framesr   r   r   Ú_count_framesœ   s   rQ   c                    sL  ˆ j ˆ j d |k }t ‡ fdd„|D ƒ¡ ¡ }|du r t|ƒ}|dur(|| nd}	ˆ  ||| |	¡\}
}tt|
ƒ||ƒ}tj||ftjd}|du rM|}|D ]T}| 	ˆ j
|d  ¡}t |d | | ¡ t¡}t |d | | ¡ t¡}d }}||kr‡||k r‡|| }||k r“||kr“|| }|dus›|dur£d	|||…|f< qO|S )
a  Get frame-aligned labels of given recording
    Args:
        kaldi_obj (KaldiData)
        rec (str): recording id
        start (int): start frame index
        end (int): end frame index
            None means the last frame of recording
        frame_size (int): number of frames in a frame
        frame_shift (int): number of shift samples
        n_speakers (int): number of speakers
            if None, the value is given from data
    Returns:
        T: label
            (n_frames, n_speakers)-shaped np.int32 array
    Úrecc                    ó   g | ]	}ˆ j |d   ‘qS ©Úutt©Úutt2spk©Ú.0Úseg©Ú	kaldi_objr   r   Ú
<listcomp>·   ó    z$get_frame_labels.<locals>.<listcomp>N©r,   rU   ÚstÚetr   )Úsegmentsr   ÚuniqueÚtolistrI   Úload_wavrQ   ÚzerosÚint32ÚindexrW   Úrintr*   r
   )r\   rR   ÚstartÚendr   rL   Ú
n_speakersÚfiltered_segmentsÚspeakersÚesrK   ÚraterP   r!   rZ   Úspeaker_indexÚstart_frameÚ	end_frameÚ	rel_startÚrel_endr   r[   r   Úget_frame_labels¤   s.   €rv   Fc                    s–  ˆ   ||| || ¡\}}	t|||ƒ}
ˆ j| }t ‡ fdd„|D ƒ¡ ¡ }|du r.t|ƒ}tj|
jd |ftj	d}|rSt
ˆ j ¡ ƒ}tj|
jd t|ƒftj	d}|D ]j}| ˆ j|d  ¡}|rm| ˆ j|d  ¡}t |d |	 | ¡ t¡}t |d |	 | ¡ t¡}d }}||kr™||k r™|| }||k r¥||kr¥|| }|dus­|dur¿d	|||…|f< |r¿d	|||…|f< qU|rÇ|
||fS |
|fS )
a  Extracts STFT and corresponding labels

    Extracts STFT and corresponding diarization labels for
    given recording id and start/end times

    Args:
        kaldi_obj (KaldiData)
        rec (str): recording id
        start (int): start frame index
        end (int): end frame index
        frame_size (int): number of samples in a frame
        frame_shift (int): number of shift samples
        n_speakers (int): number of speakers
            if None, the value is given from data
    Returns:
        Y: STFT
            (n_frames, n_bins)-shaped np.complex64 array,
        T: label
            (n_frmaes, n_speakers)-shaped np.int32 array.
    c                    rS   rT   rV   rX   r[   r   r   r]   ê   r^   z#get_labeledSTFT.<locals>.<listcomp>Nr   r_   rU   r`   ra   r   )re   rJ   rb   r   rc   rd   rI   rf   r   rg   ÚsortedÚspk2uttÚkeysrh   rW   ri   r*   r
   )r\   rR   rj   rk   r   rL   rl   Úuse_speaker_idrK   rp   r+   rm   rn   r!   Úall_speakersÚSrZ   rq   Úall_speaker_indexrr   rs   rt   ru   r   r[   r   Úget_labeledSTFTÏ   s:   
€
r~   )r   )r   )rD   rE   )r   NrD   rE   N)NF)Únumpyr   r   r   Úfloat32r4   r8   rC   rJ   rQ   rv   r~   r   r   r   r   Ú<module>   s   
J

	
ÿ,ÿ