o
    wi                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlZ	ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z  e!e"Z#eeddG dd deZ$eeddG dd deZ%e
j&j'dd Z(					dFddZ)dd  Z*G d!d" d"e
jj+Z,G d#d$ d$ej+Z-G d%d& d&ej+Z.G d'd( d(ej+Z/G d)d* d*ej+Z0G d+d, d,ej+Z1G d-d. d.ej+Z2G d/d0 d0ej+Z3G d1d2 d2ej+Z4G d3d4 d4ej+Z5G d5d6 d6ej+Z6G d7d8 d8ej+Z7G d9d: d:ej+Z8G d;d< d<eZ9G d=d> d>ej+Z:G d?d@ d@ej+Z;eG dAdB dBeZ<edCdG dDdE dEe<Z=dEdBgZ>dS )GzPyTorch VITS model.    N)	dataclass)AnyOptionalUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringlogging   )
VitsConfigz`
    Describes the outputs for the VITS model, with potential hidden states and attentions.
    )custom_introc                   @   sx   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dS )VitsModelOutputa"  
    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The final audio waveform predicted by the model.
    sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
        The length in samples of each element in the `waveform` batch.
    spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
        The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
        GAN decoder model to obtain the final audio waveform.
    Nwaveformsequence_lengthsspectrogramhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler   r    r#   r#   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/vits/modeling_vits.pyr   (   s   
 
r   zm
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.
    c                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )VitsTextEncoderOutputa  
    prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted mean values of the prior distribution for the latent text variables.
    prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted log-variance values of the prior distribution for the latent text variables.
    Nlast_hidden_stateprior_meansprior_log_variancesr   r   )r   r   r   r   r&   r   r   r    r!   r'   r(   r   r"   r   r#   r#   r#   r$   r%   @   s   
 r%   c                 C   sT   | | }t |d d d |d d f }t |d d |d d d f }|| }|S N)r   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactsr#   r#   r$   fused_add_tanh_sigmoid_multiplyU   s
     r3   F      @MbP?c	                 C   s   | | k| |k@ }	|	 }
t | }t | }ttd| d }tjj|dd}||d< ||d< | |
 ||
< d||
< t| |	 ||	ddf ||	ddf ||	ddf |||||d	\||	< ||	< ||fS )	a	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r   
zeros_likenplogexpr   
functionalr6   _rational_quadratic_spline)r;   r<   r=   r>   r?   r@   rA   rB   rC   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstantr#   r#   r$   (_unconstrained_rational_quadratic_spline^   s,   .

rO   c	           *      C   s  |}	| }
t | |
k st | |	krtd|jd }|| dkr,td| d| || dkr<td| d| tjj|dd}|d||  |  }t j|dd}tjj	|d	d
dd}|	|
 | |
 }|
|d< |	|d< |dddf |dddf  }|tj
| }tjj|dd}|d||  |  }t j|dd}tjj	|d	d
dd}|	|
 | |
 }|
|d< |	|d< |dddf |dddf  }|r|n|}|d  d7  < t j| d |kddd }|d }|d|d }|d|d }|d|d }|| }|d|d }|d|d }|dddf d|d }|d|d }|| d|  }|s| | | }|d|  }|||d ||   }|||  }|||  } |d||d d| |  |d| d   }!t |!dt |  }"| |"fS | | }#|#| }$|||  |$ }%|| |$ }&| |# }'|&dd|% |'  }(|(dk std|( d|' |& t |(  })|)| | } |)d|)  }|||  }|d||)d d| |  |d|) d   }!t |!dt |  }"| |" fS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    z-Input to a transform is not within its domainr9         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rN   r:   )r6   modevaluer7   r8   .Ngư>).N      r   zinvalid discriminant )r   minmax
ValueErrorshaper   rH   softmaxcumsumr6   softplussumgatherpowrF   allRuntimeErrorsqrt)*r;   r<   r=   r>   r?   r@   rA   rB   rC   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrL   derivative_numeratorrM   intermediate2intermediate3abcdiscriminantrootr#   r#   r$   rI      s   ,
  



rI   c                       s8   e Zd Zdedef fddZd
ddZdd	 Z  ZS )VitsWaveNetconfig
num_layersc                    sB  t    |j| _|| _tj | _tj | _t	|j
| _ttjjdr,tjjj}ntjj}|jdkrJtj|jd|j | d}||dd| _t|D ]P}|j| }|j| | d }tjj|jd|j |j||d}||dd}| j| ||d k rd|j }	n|j}	tj|j|	d}
||
dd}
| j|
 qNd S )Nweight_normr   rU   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__r#   r$   r   1  s>   


zVitsWaveNet.__init__Nc                 C   s  t |}t | jg}|d ur| |}t| jD ]p}| j| |}|d urA|d | j }|d d ||d| j  d d f }	nt |}	t||	|d }
| 	|
}
| j
| |
}|| jd k r|d d d | jd d f }|| | }||d d | jd d d f  }q|| }q|| S )NrU   r   r   )r   rD   	IntTensorr   r   r   r   r   r3   r   r   )r   r;   padding_maskglobal_conditioningrL   num_channels_tensorr   r   cond_offsetglobal_statesr2   res_skip_actsres_actsr#   r#   r$   forwardZ  s&   

&

"
zVitsWaveNet.forwardc                 C   sR   | j dkrtjj| j | jD ]	}tjj| q| jD ]	}tjj| qd S )Nr   )r   r   r   r   remove_weight_normr   r   r   r   layerr#   r#   r$   r   w  s   


zVitsWaveNet.remove_weight_normr)   )	r   r   r   r   intr   r   r   __classcell__r#   r#   r   r$   r   0  s    
)r   c                       s,   e Zd Zdef fddZdddZ  ZS )VitsPosteriorEncoderr   c                    sR   t    |j| _t|j|jd| _t	||j
d| _t|j| jd d| _d S )Nr   r   rU   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   r#   r$   r     s
   
zVitsPosteriorEncoder.__init__Nc                 C   sf   |  || }| |||}| || }tj|| jdd\}}|t|t|  | }|||fS )Nr   rQ   )r   r   r   r   splitr   
randn_likerG   )r   r;   r   r   statsmean
log_stddevsampledr#   r#   r$   r     s   
zVitsPosteriorEncoder.forwardr)   r   r   r   r   r   r   r   r#   r#   r   r$   r         r   c                       s@   e Zd Zd fdd	ZdddZd	d
 Zdd Zdd Z  ZS )HifiGanResidualBlockr   r   r      皙?c                    sb   t    |_t fddttD _t fddttD _d S )Nc                    s2   g | ]}t j  d | | dqS r   )strider   r   r   r   get_padding).0r   channelsr   r   r   r#   r$   
<listcomp>  s    	z1HifiGanResidualBlock.__init__.<locals>.<listcomp>c                    s*   g | ]}t j  d d d dqS r   r   r   _)r   r   r   r#   r$   r     s    	
)	r   r   leaky_relu_sloper   r   r   lenconvs1convs2)r   r   r   r   r   r   r   r$   r     s   

	
	
zHifiGanResidualBlock.__init__r   c                 C   s   || | d S )NrU   r#   )r   r   r   r#   r#   r$   r     s   z HifiGanResidualBlock.get_paddingc                 C   sL   t jj}tt jjdrt jjj}| jD ]}|| q| jD ]}|| qd S Nr   )r   r   r   r   r   r   r   r   r   r   r#   r#   r$   apply_weight_norm     




z&HifiGanResidualBlock.apply_weight_normc                 C   s4   | j D ]}tj| q| jD ]}tj| qd S r)   )r   r   r   r   r   r   r#   r#   r$   r     s
   

z'HifiGanResidualBlock.remove_weight_normc                 C   sX   t | j| jD ]"\}}|}tj|| j}||}tj|| j}||}|| }q|S r)   )zipr   r   r   rH   
leaky_relur   )r   r   conv1conv2residualr#   r#   r$   r     s   
zHifiGanResidualBlock.forward)r   r   r   r   )	r   r   r   r   r   r   r   r   r   r#   r#   r   r$   r     s    

r   c                       sV   e Zd Zdef fddZdd Zdd Z	dd	ejd
e	ej dejfddZ
  ZS )VitsHifiGanr   c              
      sF  t    || _t|j| _t|j| _tj	|j
|jdddd| _t | _tt|j|jD ]$\}\}}| jtj|jd|  |jd|d   |||| d d q/t | _tt| jD ]#}|jd|d   }t|j|jD ]\}}| jt||||j qrq`tj	|dddddd| _|jdkrt	|j|jd| _d S d S )	N   r   r   )r   r   r   rU   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   r#   r$   r     s@   





zVitsHifiGan.__init__c                 C   sL   t jj}tt jjdrt jjj}| jD ]}|| q| jD ]}|  qd S r   )r   r   r   r   r   r   r   r   r   r#   r#   r$   r     r   zVitsHifiGan.apply_weight_normc                 C   s0   | j D ]}tj| q| jD ]}|  qd S r)   )r   r   r   r   r   r   r#   r#   r$   r      s
   


zVitsHifiGan.remove_weight_normNr   r   returnc                 C   s   |  |}|dur|| | }t| jD ]8}tj|| jj}| j	| |}| j
|| j  |}td| jD ]}|| j
|| j |  |7 }q7|| j }qtj|}| |}t|}|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        Nr   )r   r   r   r   r   rH   r   r   r   r   r   r   r   r   r*   )r   r   r   r   r   	res_statejr   r#   r#   r$   r     s   


zVitsHifiGan.forwardr)   )r   r   r   r   r   r   r   r   r    r   r   r   r#   r#   r   r$   r     s    $
r   c                       ,   e Zd Zdef fddZdddZ  ZS )	VitsResidualCouplingLayerr   c                    sR   t    |jd | _t| j|jd| _t||j	d| _
t|j| jd| _d S )NrU   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r   r   r   r#   r$   r   *  s
   
z"VitsResidualCouplingLayer.__init__NFc                 C   s   t j|| jgd dd\}}| || }| |||}| || }t |}	|sJ||t |	 |  }t j||gdd}
t 	|	ddg}|
|fS || t |	  | }t j||gdd}
|
d fS )NrU   r   rQ   )
r   r   r   r   r   r   rD   rG   catr^   )r   r;   r   r   r?   
first_halfsecond_halfr   r   r   rL   log_determinantr#   r#   r$   r   2  s   
z!VitsResidualCouplingLayer.forwardNFr   r#   r#   r   r$   r   )  r   r   c                       r   )	VitsResidualCouplingBlockr   c                    s8   t    t | _t|jD ]
}| jt| qd S r)   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r   )r   r   r   r   r#   r$   r   E  s
   

z"VitsResidualCouplingBlock.__init__NFc                 C   sh   |s| j D ]}||||\}}t|dg}q|S t| j D ]}t|dg}||||dd\}}q|S )Nr   Tr?   )r  r   flipreversed)r   r;   r   r   r?   flowr   r#   r#   r$   r   K  s   
z!VitsResidualCouplingBlock.forwardr   r   r#   r#   r   r$   r   D      r   c                       s.   e Zd Zddef fddZd	ddZ  ZS )
VitsDilatedDepthSeparableConvr:   r   c                    s   t    |j}|j}|j| _t|| _t	 | _
t	 | _t	 | _t	 | _t| jD ]:}|| }|| | d }| j
tj||||||d | jt||d | jt| | jt| q.d S )NrU   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)r   r   dropout_rater   r   r   r   r   r   r#   r$   r   X  s4   





z&VitsDilatedDepthSeparableConv.__init__Nc                 C   s   |d ur|| }t | jD ]E}| j| || }| j| |dddd}tj|}| j| |}| j	| |dddd}tj|}| 
|}|| }q|| S Nr   r9   )r   r   r  r  	transposer   rH   gelur  r  r   )r   r;   r   r   r   r   r#   r#   r$   r   t  s   

z%VitsDilatedDepthSeparableConv.forward)r:   r)   r   r#   r#   r   r$   r  W  s    r  c                       r   )	VitsConvFlowr   c                    sr   t    |j| _|jd | _|j| _|j| _	t
| j| jd| _t|| _t
| j| j| jd d  d| _d S )NrU   r   r   )r   r   r   filter_channelsdepth_separable_channelsr   duration_predictor_flow_binsrf   duration_predictor_tail_boundr@   r   r   r   r  conv_ddsr   r   r   r#   r$   r     s   

&zVitsConvFlow.__init__NFc                 C   s  t j|| jgd dd\}}| |}| |||}| || }|j\}}	}
|||	d|
dddd}|dd | j	f t
| j }|d| j	d| j	 f t
| j }|dd| j	 d f }t|||||| jd\}}t j||gdd| }|st || ddg}||fS |d fS )	NrU   r   rQ   r9   r   r   .)r?   r@   )r   r   r   r   r  r   rZ   reshapepermuterf   mathrc   r  rO   r@   r   r^   )r   r;   r   r   r?   r   r   r   
batch_sizer   lengthr<   r=   r>   rM   rL   r   r#   r#   r$   r     s,   
$
	zVitsConvFlow.forwardr   r   r#   r#   r   r$   r    s    r  c                       r   )	VitsElementwiseAffiner   c                    sB   t    |j| _tt| jd| _tt| jd| _	d S Nr   )
r   r   r  r   r   	Parameterr   zeros	translate	log_scaler   r   r#   r$   r     s   
zVitsElementwiseAffine.__init__NFc                 C   sd   |s | j t| j|  }|| }t| j| ddg}||fS || j  t| j  | }|d fS Nr   rU   )r$  r   rG   r%  r^   )r   r;   r   r   r?   rL   r   r#   r#   r$   r     s   zVitsElementwiseAffine.forwardr   r   r#   r#   r   r$   r     r  r   c                       s&   e Zd Z fddZdddZ  ZS )	VitsStochasticDurationPredictorc                    s  t    |j}|j}t||d| _t||d| _t||j	d| _
|dkr/t||d| _t | _| jt| t|jD ]
}| jt| qAtd|d| _t||d| _t||j	d| _t | _| jt| t|jD ]
}| jt| qvd S )Nr   )r  r   )r   r   r   r   r   r   r   r   r  duration_predictor_dropoutr  r   r   r  r   r   r   duration_predictor_num_flowsr  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimr  r   r   r#   r$   r     s4   


z(VitsStochasticDurationPredictor.__init__NFrP   c                 C   s  t |}| |}|d urt |}|| | }| ||}| || }|s	| |}| ||}| || }t 	|
dd|
dj|j|jd| }d}	|}
| jD ]}||
||| d\}
}t |
dg}
|	|7 }	qYt j|
ddgdd\}}|	t tj|tj|  | ddg7 }	t dtdtj |d   | ddg|	 }|t | | }t t |d| }t | ddg}t j||gdd}| jD ]}||||d\}}t |dg}||7 }qt d	tdtj |d   | ddg| }|| S tt| j}|d d
 |d g }t 	|
dd|
dj|j|jd| }|D ]}t |dg}||||dd\}}q3t j|ddgdd\}}|S )Nr   rU   )devicedtype)r   r   rQ         gh㈵>g      ?r9   T)r   r?   )r   detachr   r   r  r   r*  r,  r+  randnsizetor/  r0  r-  r  r   r^   r   rH   
logsigmoidr  rF   pir+   	clamp_minr   r  listr  )r   r;   r   r   	durationsr?   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr  r   r   r   logqlog_determinant_sumlatentsnllr  r   log_durationr#   r#   r$   r     sh   



&



"*

0&z'VitsStochasticDurationPredictor.forward)NNFrP   r   r   r   r   r   r   r#   r#   r   r$   r'    s     r'  c                       s&   e Zd Z fddZdddZ  ZS )VitsDurationPredictorc                    s   t    |j}|j}t|j| _tj|j	|||d d| _
tj||jd| _tj||||d d| _tj||jd| _t|dd| _|jdkrVt|j|j	d| _d S d S )NrU   )r   epsr   r   )r   r   r
  "duration_predictor_filter_channelsr   r   r(  r   r   r   conv_1r  layer_norm_epsnorm_1conv_2norm_2projr   r   )r   r   r   r  r   r#   r$   r   '  s   

zVitsDurationPredictor.__init__Nc                 C   s   t |}|d urt |}|| | }| || }t |}| |dddd}| |}| || }t |}| 	|dddd}| |}| 
|| }|| S r  )r   r3  r   rJ  relurL  r  r   rM  rN  rO  )r   r;   r   r   r#   r#   r$   r   6  s   





zVitsDurationPredictor.forwardr)   rE  r#   r#   r   r$   rF  &  s    rF  c                       s   e Zd ZdZdef fddZdejdedefdd	Z		
	
	
	ddejde
ej de
ej de
ej dedeeje
ej f fddZdd Zdd Zdd Z  ZS )VitsAttentionz?Multi-headed attention with relative positional representation.r   c                    s.  t    |j| _|j| _|j| _|j| _| j| j | _	| j	d | _
| j	| j | jkr8td| j d| j dtj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _| jrttd| jd d | j	| j
 | _ttd| jd d | j	| j
 | _d S d S )Nr1  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   rU   )r   r   r   r.  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingrY   r   Linearuse_biask_projv_projq_projout_projr"  r   r4  	emb_rel_k	emb_rel_vr   r   r#   r$   r   N  s*   

(,zVitsAttention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S r&  )viewrS  rV  r  
contiguous)r   r`  ra  rb  r#   r#   r$   _shapeg  s    zVitsAttention._shapeNFr   key_value_statesattention_masklayer_head_maskoutput_attentionsr   c                 C   s  |  \}}}| || j }	| | |d|}
| | |d|}|| j d| jf}| |	||j| }	|
j| }
|j| }|
 d}t	
|	|
dd}|  || j ||fkrmtd|| j ||f d|   | jdur| | j|}t	|	|dd}| |}||7 }|dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd	}|dur|  | jfkrtd
| jf d|   |dddd||| j|| }||| j ||}|r||| j||}||| j ||}nd}tjj|| j| jd}t	
||}|  || j || jfkrFtd|| j|| jf d|   | jdurb| | j|}| |}t	||}||7 }||| j|| j}|dd}|||| j}| |}||fS )z#Input shape: Batch x Time x Channelr9   r   rU   z$Attention weights should be of size z	, but is Nr2  z!Attention mask should be of size rQ   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )r5  r\  rW  re  rZ  r[  rS  rV  rc  r   bmmr  rY   rU  _get_relative_embeddingsr^  matmul'_relative_position_to_absolute_positionr   rH   r[   r   rk  r_  '_absolute_position_to_relative_positionr  r.  r]  )r   r   rf  rg  rh  ri  rb  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightsr#   r#   r$   r   j  sx   




"

zVitsAttention.forwardc              	   C   sn   t || jd  d}|dkrtj|dd||ddg}t | jd | d}|d|  d }|d d ||f S )Nr   r   rU   )rX   rU  r   rH   r6   )r   relative_embeddingsr  
pad_lengthslice_start_positionslice_end_positionr#   r#   r$   rm    s   z&VitsAttention._get_relative_embeddingsc                 C   s   |  \}}}tj|g d}|||d | g}tj|d|d ddg}|||d d| d g}|d d d ||d d f }|S )N)r   r   r   r   r   r   rU   r   r   r5  r   rH   r6   rc  r   xbatch_headsr  r   x_flatx_finalr#   r#   r$   ro    s   z5VitsAttention._relative_position_to_absolute_positionc              	   C   s   |  \}}}tj|d|d ddddg}|||d| d  g}tj||dddg}|||d| gd d d d dd f }|S )Nr   r   rU   r  r  r#   r#   r$   rp    s   *z5VitsAttention._absolute_position_to_relative_position)NNNF)r   r   r   r   r   r   r   Tensorr   re  r   boolr"   r   rm  ro  rp  r   r#   r#   r   r$   rQ  K  s0    
b	rQ  c                       s$   e Zd Z fddZdd Z  ZS )VitsFeedForwardc                    s   t    t|j|j|j| _t|j|j|j| _t	|j
| _t|jtr/t|j | _n|j| _|jdkrO|jd d }|jd }||ddddg| _d S d | _d S )Nr   rU   r   )r   r   r   r   r   ffn_dimffn_kernel_sizerJ  rM  r   activation_dropoutr   
isinstance
hidden_actstrr   act_fnr   )r   r   pad_left	pad_rightr   r#   r$   r     s   



zVitsFeedForward.__init__c                 C   s   | ddd}| ddd}|| }| jd urtj|| j}| |}| |}| |}|| }| jd ur?tj|| j}| |}|| }| ddd}|S )Nr   rU   r   )	r  r   r   rH   r6   rJ  r  r   rM  )r   r   r   r#   r#   r$   r     s   





zVitsFeedForward.forwardrE  r#   r#   r   r$   r    s    r  c                	       sL   e Zd Zdef fddZ		ddejdejdeej d	e	fd
dZ
  ZS )VitsEncoderLayerr   c                    sX   t    t|| _t|j| _tj|j	|j
d| _t|| _tj|j	|j
d| _d S )NrG  )r   r   rQ  	attentionr   r   hidden_dropoutr   r  r   rK  
layer_normr  feed_forwardfinal_layer_normr   r   r#   r$   r     s   


zVitsEncoderLayer.__init__NFr   r   rg  ri  c                 C   sp   |}| j |||d\}}| |}| || }|}| ||}| |}| || }|f}|r6||f7 }|S )N)r   rg  ri  )r  r   r  r  r  )r   r   r   rg  ri  r   rw  rL   r#   r#   r$   r   $  s    



zVitsEncoderLayer.forwardr   )r   r   r   r   r   r   r  r    r   r  r   r   r#   r#   r   r$   r    s    r  c                       sp   e Zd Zdef fddZ				ddejdejdeej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )VitsEncoderr   c                    sB   t     | _t fddt jD | _d| _ j	| _	d S )Nc                    s   g | ]}t  qS r#   )r  r   r   r#   r$   r   F  s    z(VitsEncoder.__init__.<locals>.<listcomp>F)
r   r   r   r   r   r   num_hidden_layerslayersgradient_checkpointing	layerdropr   r   r  r$   r   C  s
   
 zVitsEncoder.__init__Nr   r   rg  ri  output_hidden_statesreturn_dictr   c                 C   s   |rdnd }|r
dnd }|d urt ||j}|| }t p t| }	| jD ]5}
|r-||f }tjdd}| jo;|| j	k }|r@|	rL|
||||d}|d }|rPd}|rY||d f }q$|| }|re||f }|sst
dd |||fD S t|||dS )	Nr#   r   r   )rg  r   ri  )NNc                 s   s    | ]	}|d ur|V  qd S r)   r#   )r   vr#   r#   r$   	<genexpr>}  s    z&VitsEncoder.forward.<locals>.<genexpr>)r&   r   r   )r   r0  r	   r
   r  rE   randomuniformrk  r  r"   r   )r   r   r   rg  ri  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputsr#   r#   r$   r   J  sD   	


zVitsEncoder.forward)NNNN)r   r   r   r   r   r   r    r   r  r  r   r"   r   r   r   r#   r#   r   r$   r  B  s*    
r  c                       s   e Zd ZdZdef fddZdd Zdd Z							
ddej	dej
deej	 dee dee dee deeej	 ef fddZ  ZS )VitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    r   c                    sN   t    || _t|j|j|j| _t	|| _
tj|j|jd dd| _d S )NrU   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   r   r#   r$   r     s
   

zVitsTextEncoder.__init__c                 C      | j S r)   r  r   r#   r#   r$   get_input_embeddings     z$VitsTextEncoder.get_input_embeddingsc                 C   s
   || _ d S r)   r  )r   rT   r#   r#   r$   set_input_embeddings  s   
z$VitsTextEncoder.set_input_embeddingsNT	input_idsr   rg  ri  r  r  r   c                 C   s   |  |t| jj }| j||||||d}|s|d n|j}	| |	dddd| }
t	j
|
| jjdd\}}|sJ|	||f|dd   }|S t|	|||j|jdS )N)r   r   rg  ri  r  r  r   r   rU   rQ   )r&   r'   r(   r   r   )r  r  rc   r   r   r  r&   r  r  r   r   r   r%   r   r   )r   r  r   rg  ri  r  r  r   encoder_outputsr&   r   r'   r(   rL   r#   r#   r$   r     s,   		zVitsTextEncoder.forward)NNNT)r   r   r   r   r   r   r  r  r   r  r    r   r  r   r"   r%   r   r   r#   r#   r   r$   r    s0    r  c                   @   s$   e Zd ZeZdZdZdZdd ZdS )VitsPreTrainedModelvitsr  Tc                 C   s  t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
r5|jj	  |jjd dS t |tjrdtj|j |jdurbt|j|j|jd   }tjj|j| |d dS dS t |tjr|jjjd| jjd |jdur|jj|j 	  dS dS dS )zInitialize the weightsr:   )r   stdNrP   r   )r~   r   )r  r   rX  r   datanormal_r   initializer_ranger   zero_r  fill_r   initkaiming_normal_r  rc   r	  r   r   uniform_r  padding_idx)r   modulekr#   r#   r$   _init_weights  s(   


z!VitsPreTrainedModel._init_weightsN)	r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r#   r#   r#   r$   r    s    r  z@
    The complete VITS model, for text-to-speech synthesis.
    c                       s   e Zd Zdef fddZdd Ze							ddeej	 deej	 d	ee
 d
ee dee dee deej deee ef fddZ  ZS )	VitsModelr   c                    s   t  | || _t|| _t|| _t|| _|j	r!t
|| _nt|| _|jdkr4t|j|j| _t|| _|j| _|j| _|j| _|   d S r!  )r   r   r   r  text_encoderr   r  r   decoder"use_stochastic_duration_predictionr'  duration_predictorrF  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_rater<  noise_scale_duration	post_initr   r   r#   r$   r     s   





zVitsModel.__init__c                 C   r  r)   )r  r  r#   r#   r$   get_encoder  r  zVitsModel.get_encoderNr  rg  
speaker_idri  r  r  labelsr   c           $      C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur&td| jjjj}|dur9|	d
|}	nt|	d
|}	| j jdkr~|dur~d|  krZ| j jk sgn td| j jd  dt|trutjd|| jd	}| |	d}
nd}
| j||	||||d
}|s|d n|j}|dd}|	dd}	|s|d n|j}|s|d n|j}| j jr| j||	|
d| jd}n| ||	|
}d| j }tt||	 | }tt|ddgd  }tj!|" |j|jd}|	d|	dk }|	d
|	j}t	|	dt	|d }|j#\}}}}t$|d%|| d}tj!||j|jd}|	d|k }|
|j%|||}|t&j'(|g dddddf  }|	ddd| }t)|*d|dd}t)|*d|dd}|t+|t| | j,  }| j-|||
dd}|| } | .| |
}!|!*d}!|t/0| j j1 }"|s|!|"| f|dd  }#|#S t2|!|"| |j3|j4dS )a  
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nz&Training of VITS is not supported yet.r9   r   r   z Set `speaker_id` in the range 0-.r   )r5  
fill_valuer/  )r  r   rg  ri  r  r  rU   T)r?   r<  rP   )r0  r/  )r   r   r   r   r   r   r   r  )r   r   r   r   r   )5r   ri  r  use_return_dictNotImplementedErrorr  r  r   r0  	unsqueezer6  r   	ones_liker  rY   r  r   fullr/  r  r&   r  r'   r(   r  r  r  r  ceilrG   r9  r^   longarangerX   rZ   r\   rc  r   rH   r6   rn  squeezer   r<  r  r  rE   prodr   r   r   r   )$r   r  rg  r  ri  r  r  r  
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r'   r(   rD  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskr  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsrB  r   r   r   rL   r#   r#   r$   r     s   %

&
zVitsModel.forward)NNNNNNN)r   r   r   r   r   r  r   r   r   r  r   r  r    r   r"   r   r   r   r   r#   r#   r   r$   r    s8    	r  )Fr4   r5   r5   r5   )?r   r  dataclassesr   typingr   r   r   numpyrE   r   torch.utils.checkpointr   activationsr   integrations.deepspeedr	   integrations.fsdpr
   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r%   jitscriptr3   rO   rI   Moduler   r   r   r   r   r   r  r  r   r'  rF  rQ  r  r  r  r  r  r  __all__r#   r#   r#   r$   <module>   s~   


J 	P>X.+d% '*'D8 !