o
    wi~                     @   s  d dl mZ d dlmZmZ d dlZd dlmZ d dl	Z
d dlZd dlmZmZ d dlmZmZmZ d dlmZ d dlmZ dZzd dlZW n eyQ   d	ZY nw zd d
lmZ W n eym   d dlmZ dd ZY nw G dd deZdd Zdd Z dd Z!dd Z"		ddeej# deej# dej#fddZ$	ddej#dej#dee% d ee& deej#ej#ej#f f
d!d"Z'dd#ej#d$ej#dee% dej#fd%d&Z(edd'dd)d*Z)edd'd+d, Z*eddd-dd.d/Z+dd2d3Z,e	1	4	5	1	6	7dd8d9Z-e	:			(		1	4	5	1	6	7dd;d<Z.	:			(		1	4	5	1	6	7dd=d>Z/dd@dAZ0	?					 	B	ddCdDZ1ddEdFZ2ddGdHZ3dIdJ Z4ddKdLZ5dMdN Z6dOdP Z7e	:	1	Q	R	ddSdTZ8dUdV Z9	W		(	ddXej:fdYdZZ;dd[e%de%fd\d]Z<dd_d`Z=ddadbZ>ddcddZ?dedf Z@dgdh ZAdidj ZBdkejCfdldmZDejjE	n	ddoej#dpej#dqej#drej#dse%dteej# deej#ej#ej#ej#ej#f fdudvZF	(	wddxdyZGedzd{d|eHdeHfd}d~ZIdS )    )Enum)OptionalTupleN)jitprange)DATA_STR2DATA_CLASSMAIN_DATA_TYPESWithLens)logging)
deprecatedTF)rank_zero_onlywrapsc                    s   t   fdd}d S )Nc                     s   t d  d td d S )Nz	Function zX requires lighting to be installed, but it was not found. Please install lightning first   )r
   errorexit)argskwargsfn e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/parts/utils/helpers.py
wrapped_fnF   s   
z"rank_zero_only.<locals>.wrapped_fnr   )r   r   r   r   r   r   E   s   r   c                   @   s   e Zd ZdZdZdZdZdS )OperationModez'Training or Inference (Evaluation) moder   r      N)__name__
__module____qualname____doc__training
validationinferr   r   r   r   r   N   s
    r   c                 C   sL   | j d ur| j S | jd ur| jjd ur| jjS td| j td|  )Nz.Could not find batch_size from batch_sampler: z1Could not find batch_size from train_dataloader: )
batch_sizebatch_samplermicro_batch_size
ValueError)train_dataloaderr   r   r   get_batch_sizeV   s   

r'   c                 C   s   | j | j S N)num_devices	num_nodes)trainerr   r   r   get_num_workersb   s   r,   c              
   C   s   | j d }t D | j  }t| }t|D ]*}t||dd|| d|| f }tj	|| j
d||dd|| d|| f< qW d   |S 1 sPw   Y  |S )a  Convert soft attention matrix to hard attention matrix.

    Args:
        attn (torch.Tensor): B x 1 x max_mel_len x max_text_len. Soft attention matrix.
        in_len (torch.Tensor): B. Lengths of texts.
        out_len (torch.Tensor): B. Lengths of spectrograms.

    Output:
        attn_out (torch.Tensor): B x 1 x max_mel_len x max_text_len. Hard attention matrix, final dim max_text_len should sum to 1.
    r   Ndevice)shapetorchno_graddatacpunumpy
zeros_likerangemastensorr.   )attnin_lenout_lenb_sizeattn_cpuattn_outind	hard_attnr   r   r   binarize_attentionf   s   


$.
rA   c                 C   sn   t  " t | j  }t||  |  dd}W d   n1 s)w   Y  t || j	S )zFor training purposes only. Binarizes attention with MAS.
       These will no longer receive a gradient.

    Args:
        attn: B x 1 x max_mel_len x max_text_len
    r   )widthN)
r0   r1   logr2   r3   r4   b_mas
from_numpytor.   )r9   in_lensout_lenslog_attn_cpur>   r   r   r   binarize_attention_parallel{   s
   
"rJ   lengthsxreturnc                 C   st   | du r|dus
J t j|jd t j|jdS |du r!t | }n|jd }t jd|| j| jd}|| dk }|S )aE  Constructs binary mask from a 1D torch tensor of input lengths

    Args:
        lengths: Optional[torch.tensor] (torch.tensor): 1D tensor with lengths
        x: Optional[torch.tensor] = tensor to be used on, last dimension is for mask
    Returns:
        mask (torch.tensor): num_sequences x max_length binary tensor
    Ndtyper.   r   r.   rP   r   )	r0   onesr/   boolr.   maxarangerP   	unsqueeze)rK   rL   max_lenidsmaskr   r   r   get_mask_from_lengths   s   
rZ   contextlensdim
descendingc                 C   s*   t j||d\}}t | ||} | ||fS )a  Sorts elements in context by the dim lengths specified in lens
    Args:
        context:  source tensor, sorted by lens
        lens: lengths of elements of context along the dimension dim
        dim: Optional[int] : dimension to sort by
    Returns:
        context: tensor sorted by lens along dimension dim
        lens_sorted: lens tensor, sorted
        ids_sorted: reorder ids to be used to restore original order

    )r^   )r0   sortindex_select)r[   r\   r]   r^   lens_sorted
ids_sortedr   r   r   sort_tensor   s   
rc   orderedindicesc                 C   s   t | ||dS )af  Reverses the result of sort_tensor function:
       o, _, ids = sort_tensor(x,l)
       assert unsort_tensor(o,ids) == x
    Args:
        ordered: context tensor, sorted by lengths
        indices: torch.tensor: 1D tensor with 're-order' indices returned by sort_tensor
    Returns:
        ordered tensor in original order (before calling sort_tensor)
    r   )r0   r`   argsort)rd   re   r]   r   r   r   unsort_tensor   s   
rg   )nopythonr   c           	         sh  t | }t | } t j | ddd f< t | | dd d f dd d f< t j| t jd}td| jd D ]A t| jd D ]7}t td|| |d }t 	 fdd|D }t 
|}|  |f ||   |f< || | |f< q@q7| jd d }t| jd d ddD ] d| |f< | |f }qd|d|f< |d sJ |d sJ |S )Nr   r   rP   c                    s   g | ]
} d  |f qS r   r   ).0prev_idxilog_pr   r   
<listcomp>       zmas.<locals>.<listcomp>rN   )npr5   rC   infint64r6   r/   rU   rT   arrayargmaxsumall)	attn_maprB   optprev_indjprev_jprev_logr?   curr_text_idxr   rm   r   r7      s,   



	r7   c           	      C   s(  | j tj }|  }||dddf< td|jd D ]%}|}t|jd D ]}||d |f }|||f  t||7  < |}q'qt|}|j d}|jd d }t|jd d ddD ],}||||f< ||d |d f ||d |f kr|d8 }|dkr||d||f<  nq_||d|f< |S )zmas with hardcoded width=1r   r   NrN   )	rP   typerr   rs   copyr6   r/   rT   r5   )	log_attn_mapneg_infro   rn   	prev_log1r|   	prev_log2rz   oner   r   r   
mas_width1   s.   
$r   )rh   parallelc              	   C   sr   |dksJ t | }t| jd D ]$}t| |dd || d || f }|||dd || d || f< q|S )Nr   r   )rr   r5   r   r/   r   )b_log_attn_maprG   rH   rB   r>   boutr   r   r   rD      s   
$"rD   2      c                 C   s   t dt j t jj| j  }| | }t|}t |	 s*t
d t dgS t|D ]}ttj||d\}}| | }t|}q.|S )zR
    Griffin-Lim algorithm to convert magnitude spectrograms to audio signals
    y               @z+audio was not finite, skipping audio savingr   )n_fft)rr   exppirandomrandr/   librosaistftisfiniterx   r
   warningru   r6   magphasestft)
magnitudesn_itersr   phasecomplex_specsignal_r   r   r   griffin_lim  s   

r   333333?"V  P   @  c
                 C   sp   t jj||||	d}
|j  j}t|}t	||
| }t
|j| }| j||tt| ||d d S )Nsrr   n_melsfmaxsample_rate)r   filtersmelr2   r3   r4   Trr   r   dotr   	add_audiorT   abs)swriterspectnamestepgriffin_lim_mag_scalegriffin_lim_powerr   r   r   r   
filterbanklog_melr   	magnitudeaudior   r   r   log_audio_to_tb  s   
$r   trainc                 C   s  |\}}}}}}|r|| dkr| j | dt|d j  j|dd | j | dt|d j  |dd | j | dt|d j  |dd | j | dt|d j  t	|d j  |dd |rt
jj|	|
||d}|d j  j}t|}t||| }t|j| }| jd	| d
|tt| ||	d |d j  j}t|}t||| }t|j| }| jd	| d|tt| ||	d d S d S d S d S )Nr   
_alignmentHWCdataformats_mel_target_mel_predicted_gater   zaudio/
_predictedr   _target)	add_imageplot_alignment_to_numpyr2   r3   r4   r   plot_spectrogram_to_numpyplot_gate_outputs_to_numpyr0   sigmoidr   r   r   rr   r   r   r   r   rT   r   )r   tensorsr   tag
log_imageslog_images_freqr   r   r   r   r   r   r   r   spec_targetmel_postnetgategate_target
alignmentsr   r   r   r   r   r   r   r   tacotron2_log_to_tb_func/  sV   

(
,r   c                 C   s  |\}}}}}}t sd S |r|| dkrg }g }g }|tjt|d j  j| ddg7 }|tjt|d j  | ddtjt|d j  | ddg7 }|tjt	|d j  t
|d j  | ddg7 }| |||d |rg }tjj|	|
||d}|d j  j}t|}t||| }t|j| }|d j  j}t|}t||| }t|j| }|tj|tt| | d	|	d
tj|tt| | d|	d
g7 }| d|i d S d S d S d S )Nr   r   )captionr   r   r   )specsr   gatesr   _wav_target)r   r   _wav_predictedaudios)
HAVE_WANDBwandbImager   r2   r3   r4   r   r   r   r0   r   rC   r   r   r   rr   r   r   r   AudiorT   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
audio_pred
audio_truer   r   r   tacotron2_log_to_wandb_funcl  sv   



 r    c                 C   s   |rt jdd\}}nt jdd\}}|j| ddd||d}|| |j||d d	}	|d ur6|	d
| 7 }	t |	 t d t   |d krh|t	
t| || |jt	
t|dt| d |j  t|}
t   |
S )N)   
   figsize)      autolowernoneaspectorigininterpolationvminvmaxaxDecoder timestep

Encoder timestep        xminxmax)pltsubplotsimshow	set_titlecolorbarxlabelylabeltight_layout
set_yticksrr   rU   lenset_yticklabelshlinesrT   
get_xtickscanvasdrawsave_figure_to_numpyclose)	alignmenttitleinfophoneme_seqr   r   figr   imr   r2   r   r   r   r     s(   



"
r   r   c	                 C   s  t j| dd d} tjdd\}	}
|
j| ddd||d}|
| |	j||
d	 d
}|d ur3|d| 7 }t| td |d ur|dkrf|
	t 
t| |
| |
jt 
t|dt|
 d nw|dkr|
 }g }|D ]}|dk s|| jd krqr|| qr||7 }|
	| nM|dkr||d  }|
	t 
t| |
| |
jt 
dt|d dd| jd d dd |r|
 }g }|D ]}|||d  d q|
| t  |	j  t|	}t  |S )Nr   a_mina_max)   r   r   r   r   r   r   r   r   r   r   r   r   r   r         ?      ?black)r   r   colorsz.0f)rr   clipr   r   r   r   r   r   r   r  rU   r  r  r  rT   r  
get_yticksr/   appendset_xticklabelsr  r  r  r	  r
  )r  r  r  r  r   r   phoneme_verphone_offseth_offsetr  r   r  r   yticks
new_ytickstickphonesxticks
new_xticksr2   r   r   r   %plot_alignment_to_numpy_for_speechllm  sP   



$
.

r&  c                 C   sf   t jdd\}}t |  |d urt | t d t d t   |j  t	|}t 
  |S )N      r   FramesPitch)r   r   plotylimr   r   r  r  r  r	  r
  )pitch
ylim_ranger  r   r2   r   r   r   plot_pitch_to_numpy  s   




r0  c                 C   s   t jdd\}}t j| dd t j|dd |d urt | t d t d t   t   |j	  t
|}t   |S )Nr'  r   zGround truth)label	Predictedr*  r+  )r   r   r,  r-  r   r   legendr  r  r  r	  r
  )pitch_gt
pitch_predr/  r  r   r2   r   r   r   plot_multipitch_to_numpy  s   



r6  c                 C   sv   |  tj} tjdd\}}|j| dddd}tj||d td td	 t	  |j
  t|}t  |S )
Nr'  r   r   r   r   r   r   r   r   r*  Channels)astyperr   float32r   r   r   r   r   r   r  r  r  r	  r
  )spectrogramr  r   r  r2   r   r   r   r   /  s   


r   c                 C   s|   t jdd\}}|j| dddd}t j||d t | t | t   |r/t j|dd	 |j	  t
|} t   | S )
Nr'  r   r   r   r   r7  r   png)format)r   r   r   r   r   r   r  savefigr  r  r	  r
  )r2   x_axisy_axisoutput_filepathr  r   r  r   r   r   create_plot>  s   


rB  c              	   C   s   t jdd\}}|jtt| | dddddd |jtt||dd	d
ddd t d t d t   |j	  t
|}t   |S )Nr'  r   r  green+r   target)alphacolormarkersr1  red.	predictedz$Frames (Green target, Red predicted)z
Gate State)r   r   scatterr6   r  r   r   r  r  r  r	  r
  )gate_targetsgate_outputsr  r   r2   r   r   r   r   O  s4   
	




r   c                 C   s   t | jj }|S r(   )rr   ru   r  rendererbuffer_rgba)r  	img_arrayr   r   r   r	  n  s   r	     hannc                 C   s   |\}}	}
}|d }|
d j   d d d |f }
| j| dt|
|dd |d urwtjtjjt	
|	d    |||d\}}t	|  | }t	t	j|dd d}| j| dt|d d d |f |dd d S d S )	Nr   r   r   r   )r   
hop_lengthwindowgh㈵>r  r   )r2   r3   r4   r   r   r   corer   r   rr   
nan_to_numdetachmatmulsqueezerC   r  )r   r   r   r   r   rU  rV  mel_fbr   r   r   
mel_lengthmagmel_predlog_mel_predr   r   r   waveglow_log_to_tb_funcs  s6   "
ra  c                 C   s0   t j }| D ]}t jj|}|| q|S r(   )r0   nn
ModuleListutilsremove_weight_normr  )	conv_listnew_conv_listold_convr   r   r   remove  s
   
ri  r  dur_lensc                 C   sr  |j }|  | }|d   }|jdd}|dkrD|tj|d |dd | }	|jtj|j	d tjd|d g|	dd	 |jdd}|
 }
tjtjjj|d
dddddddddf }|j||jd}t|
|jddddf }|ddddddf |k|ddddddf |k@ }||}t||}|dur|ddd|f }t||}||fS )aL  A function that takes predicted durations per encoded token, and repeats enc_out according to the duration.
    NOTE: durations.shape[1] == enc_out.shape[1]

    Args:
        durations (torch.tensor): A tensor of shape (batch x enc_length) that represents how many times to repeat each
            token in enc_out.
        enc_out (torch.tensor): A tensor of shape (batch x enc_length x enc_hidden) that represents the encoded tokens.
        pace (float): The pace of speaker. Higher values result in faster speaking pace. Defaults to 1.0.        max_mel_len (int): The maximum length above which the output will be removed. If sum(durations, dim=1) >
            max_mel_len, the values after max_mel_len will be removed. Defaults to None, which has no max length.
        group_size (int): replicate the last element specified by durations[i, in_lens[i] - 1] until the
            full length of the sequence is the next nearest multiple of group_size
        in_lens (torch.tensor): input sequence length specifying valid values in the durations input tensor (only needed if group_size >1)
    r  r   )r]   floor)rounding_moder   ri   T)re   values
accumulate)r   r   r   r   r   )valueNrO   rN   )rP   floatrk  longrw   r0   div
index_put_rU   r/   rT   cumsumrb  
functionalpadrF   r.   rZ  	clamp_max)	durationsenc_outpacemel_max_len
group_sizerj  rP   repsdec_lensto_padrW   reps_cumsumrange_multenc_repr   r   r   regulate_len  s*    0 <
r  
split_sizec                 C   s   |dk rt | j| }| j| | dkr;|| j| |  }dgt | j d }|||d d < |  tjj| |} | j}|d | | j| | |f ||d d   }| j| S )Nr   r   r   )r  r/   reverser0   rb  ru  rv  reshape)r8   r  r]   r  padding	cur_shape	new_shaper   r   r   
split_view  s   .
r  r   c              	   C   s   t | ddddd|f }t| dD ]2}|| }|| }| | }|| dkr=t jj|d|d | d f}|dd||f ||< q|S )zr
    Time-wise slicing (patching) of bathches for audio/spectrogram
    [B x C x T] -> [B x C x segment_size]
    Nr   r   r   )r0   r5   r6   sizerb  ru  rv  )rL   ids_strsegment_sizeretrn   idx_stridx_endx_ir   r   r   slice_segments  s    "r  c           	      C   sl   |   \}}}|du r|}|| d }|j| jd}t|gj| jd| jtjd}t| ||}||fS )zi
    Chooses random indices and slices segments from batch
    [B x C x T] -> [B x C x segment_size]
    Nr   r-   ri   )r  rF   r.   r0   r   rq  r  )	rL   	x_lengthsr  r   dtids_str_maxr  r  r   r   r   rand_slice_segments  s   $r  c                 C   s   t | tjr	| g} ttdd | } t|}|d urt|}d}| D ]}|jj|}||	 | 7 }|d urA|jjj
| |d q"|d|  }|S )Nc                 S   s
   | j d uS r(   )grad)pr   r   r   <lambda>  s   
 z"clip_grad_value_.<locals>.<lambda>r   )minrT   r  )
isinstancer0   Tensorlistfilterrp  r  r2   normitemclamp_)
parameters
clip_value	norm_type
total_normr  
param_normr   r   r   clip_grad_value_  s   r  c                 C   s   dd | d d d D } | S )Nc                 S   s   g | ]	}|D ]}|qqS r   r   )rk   sublistr  r   r   r   rp     s    z%convert_pad_shape.<locals>.<listcomp>rN   r   )	pad_shaper   r   r   convert_pad_shape  s   r  c           	   	   C   s   |j \}}}}t| d}||| }t|t|ddd|j}||||}|tj	j
|tddgddgddggddddf  }|ddd| }|S )z:
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    rN   r   r   Nr   r)  )r/   r0   rt  viewrZ   r  r  rF   rP   rb  ru  rv  r  rV   	transpose)	durationrY   r   r   t_yt_xcum_durationcum_duration_flatpathr   r   r   generate_path  s   ":r  c                 C   sd   i }d}t  D ]'\}}|tv s||v r/| | ||< |d }t|tr/| | ||d < |d }q|S )Nr   r   _lens)r   itemsr   
issubclassr	   )
batch_datasup_data_types_set
batch_dictbatch_indexr   datatyper   r   r   process_batch+  s   
r  r.   c                    sZ   t | ttfr fdd| D S t | tr  fdd|  D S t | tjr+|  S | S )a  
    Use .to(device) on all tensors within nested lists, tuples, values ofdicts
    Returns a new structure with tensors moved to target device, leaving other data intact.

    The intended use is to move collections of tensors to a device while:
        - avoiding calling specific movers like .cpu() or .cuda()
        - avoiding stuff like .to(torch.device("cuda:{some_variable}"))
    c                    s   g | ]}t | qS r   to_device_recursive)rk   elemr-   r   r   rp   B  s    z'to_device_recursive.<locals>.<listcomp>c                    s   i | ]
\}}|t | qS r   r  )rk   keyro  r-   r   r   
<dictcomp>D  rq   z'to_device_recursive.<locals>.<dictcomp>)r  r  tupledictr  r0   r  rF   )er.   r   r-   r   r  8  s   	

r  rN   textr.  rz  batch_lengthspadding_idxvolumec                 C   s|  |j tjd}t|dd  |d d  }d}|jd d }tj||tj| jd| }	tj||tj| jd}
tj||tj| jdd }tj||tj| jdd }tj|tj| jd}|d }||jd k r|| }|| }|| }|||< | || |	|d |f< ||| |
|d |f< ||| ||d |f< |d ur||| ||d |f< |}|d7 }||jd k sf|	|
|||fS )Nri   r   rN   r   rO   r  )	rF   r0   rt   rT   r/   zerosr.   rR   r:  )r  r.  rz  r  r  r  rW   indexnum_batchestextspitchespacesvolumesr\   
last_index	seq_startseq_endcur_seq_lenr   r   r   batch_from_raggedK  s0   
r     c                 C   s$  | d r	|| fn||f}t jg | d |R |t jd}t j||t jdd }t jt j||t jdd d dd}|||d	}| d rt j|d
 |t jd}	|d }
d|	d< td
|D ]/}|
||  ||  }t j|d |d|t jd}||	|d
   |	|< |
|	 
  d 8 }
q]|
|	d  |	d< d}d
}|t|	k r||	| |	|d
   7 }|d
7 }|t|	k s||d ksJ d| d|d  d|	 nt j|d ||f|t jd}	||	d< |	|d< | d rt jt j||t jdd d
 dd}||d< d| v rt jd| d |f|t jd|d< |S )zc
    Generates input examples for tracing etc.
    Returns:
        A tuple of input examples.
    enable_ragged_batches	emb_rangerQ   r  g?r  g?)r  )r  r.  rz  r   r   r   rj   rN   zsum: z, sz: z
, lengths:r  enable_volumeg{Gz?r  num_speakersspeaker)r0   randintrt   randnr:  clampr  int32r6   rY  r3   r4   r  )export_configr.   	max_batchmax_dimszinpr.  rz  inputsr  left_over_sizern   	equal_lenlengthrw   r  r  r   r   r   sample_tts_inputp  sB   "$,$

r  zBut it will not be removed until a further notice. G2P object root directory `nemo_text_processing.g2p` has been replaced with `nemo.collections.tts.g2p`. Please use the latter instead as of NeMo 1.18.0.)explanation
g2p_targetc                 C   s   |  dd}|S )Nznemo_text_processing.g2pznemo.collections.tts.g2p)replace)r  g2p_target_newr   r   r   g2p_backward_compatible_support  s   r  )NN)r   T)r   rj   )r   r   )r   r   r   r   r   r   )
r   Fr   Tr   r   r   r   r   r   )r   NNNN)r   NNNNr   r   Tr(   )r   r   rS  rT  N)r  Nr   N)r   )Nr   )r   )rN   N)r   r  )Jenumr   typingr   r   r   matplotlib.pylabpylabr   r4   rr   r0   numbar   r   )nemo.collections.tts.torch.tts_data_typesr   r   r	   
nemo.utilsr
   nemo.utils.decoratorsr   r   r   ModuleNotFoundErrorlightning.pytorch.utilitiesr   	functoolsr   r   r'   r,   rA   rJ   r  rZ   intrS   rc   rg   r7   r   rD   r   r   r   r   r   r&  r0  r6  r   rB  r   r	  ra  ri  r8   r  r  r  r  r  r  r  r  r.   r  scriptr  r  strr  r   r   r   r   <module>   s.  ,

&



@

O

9

&
1


'
1