o
    ei                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddl m!Z! e"e#Z$eeddG dd deZ%eeddG dd deZ&ej'j(dd Z)					dGddZ*d d! Z+G d"d# d#ej	j,Z-G d$d% d%e	j,Z.G d&d' d'e	j,Z/G d(d) d)e	j,Z0G d*d+ d+e	j,Z1G d,d- d-e	j,Z2G d.d/ d/e	j,Z3G d0d1 d1e	j,Z4G d2d3 d3e	j,Z5G d4d5 d5e	j,Z6G d6d7 d7e	j,Z7G d8d9 d9e	j,Z8G d:d; d;e	j,Z9G d<d= d=eZ:G d>d? d?e	j,Z;G d@dA dAe	j,Z<eG dBdC dCeZ=edDdG dEdF dFe=Z>dFdCgZ?dS )HzPyTorch VITS model.    N)	dataclass)Any)nn   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringloggingtorch_compilable_check   )
VitsConfigz`
    Describes the outputs for the VITS model, with potential hidden states and attentions.
    )custom_introc                   @   sx   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dZe
ej dB ed< dS )VitsModelOutputa"  
    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The final audio waveform predicted by the model.
    sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
        The length in samples of each element in the `waveform` batch.
    spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
        The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
        GAN decoder model to obtain the final audio waveform.
    Nwaveformsequence_lengthsspectrogramhidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler   r    r#   r#   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vits/modeling_vits.pyr   '   s   
 
r   zm
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.
    c                   @   st   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
eej dB ed< dZeej dB ed< dS )VitsTextEncoderOutputa  
    prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted mean values of the prior distribution for the latent text variables.
    prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The predicted log-variance values of the prior distribution for the latent text variables.
    Nlast_hidden_stateprior_meansprior_log_variancesr   r   )r   r   r   r   r&   r   r    r!   r'   r(   r   r"   r   r#   r#   r#   r$   r%   ?   s   
 r%   c                 C   sT   | | }t |d d d |d d f }t |d d |d d d f }|| }|S N)r   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactsr#   r#   r$   fused_add_tanh_sigmoid_multiplyT   s
     r3   F      @MbP?c	                 C   s   | | k| |k@ }	|	 }
t | }t | }ttd| d }tjj|dd}||d< ||d< | |
 ||
< d||
< t| |	 ||	ddf ||	ddf ||	ddf |||||d	\||	< ||	< ||fS )	a	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r   
zeros_likenplogexpr   
functionalr6   _rational_quadratic_spline)r;   r<   r=   r>   r?   r@   rA   rB   rC   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstantr#   r#   r$   (_unconstrained_rational_quadratic_spline]   s,   .

rO   c	           *      C   s  |}	| }
t |  |
k|  |	k@ d|
 d|	 d |jd }|| dkr0td| d| || dkr@td| d| tjj|dd	}|d
||  |  }tj	|dd	}tjj
|dddd}|	|
 | |
 }|
|d< |	|d< |dd
df |dddf  }|tj| }tjj|dd	}|d
||  |  }tj	|dd	}tjj
|dddd}|	|
 | |
 }|
|d< |	|d< |dd
df |dddf  }|r|n|}|d  d7  < tj| d |kdd	d
 }|d }|d|d }|d|d }|d|d }|| }|d|d }|d|d }|dd
df d|d }|d|d }|| d|  }|s| | | }|d
|  }|||d ||   }|||  }|||  } |d||d d| |  |d
| d   }!t|!dt|  }"| |"fS | | }#|#| }$|||  |$ }%|| |$ }&| |# }'|&dd|% |'  }(t t|(dkd|(  d|' |& t|(  })|)| | } |)d
|)  }|||  }|d||)d d| |  |d
|) d   }!t|!dt|  }"| |" fS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    zInputs are outside the range [z, ]r9         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rN   r:   )r6   modevaluer7   r8   .Ngư>).N      r   z!Discriminant has negative values )r   minmaxshape
ValueErrorr   rH   softmaxr   cumsumr6   softplussumgatherpowrF   allsqrt)*r;   r<   r=   r>   r?   r@   rA   rB   rC   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrL   derivative_numeratorrM   intermediate2intermediate3abcdiscriminantrootr#   r#   r$   rI      s   ,
  



rI   c                       s8   e Zd Zdedef fddZd
ddZdd	 Z  ZS )VitsWaveNetconfig
num_layersc                    sB  t    |j| _|| _tj | _tj | _t	|j
| _ttjjdr,tjjj}ntjj}|jdkrJtj|jd|j | d}||dd| _t|D ]P}|j| }|j| | d }tjj|jd|j |j||d}||dd}| j| ||d k rd|j }	n|j}	tj|j|	d}
||
dd}
| j|
 qNd S )Nweight_normr   rV   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__r#   r$   r   2  s>   


zVitsWaveNet.__init__Nc                 C   s  t |}t | jg}|d ur| |}t| jD ]p}| j| |}|d urA|d | j }|d d ||d| j  d d f }	nt |}	t||	|d }
| 	|
}
| j
| |
}|| jd k r|d d d | jd d f }|| | }||d d | jd d d f  }q|| }q|| S )NrV   r   r   )r   rD   	IntTensorr   r   r   r   r   r3   r   r   )r   r;   padding_maskglobal_conditioningrL   num_channels_tensorr   r   cond_offsetglobal_statesr2   res_skip_actsres_actsr#   r#   r$   forward[  s&   

&

"
zVitsWaveNet.forwardc                 C   sR   | j dkrtjj| j | jD ]	}tjj| q| jD ]	}tjj| qd S )Nr   )r   r   r   r   remove_weight_normr   r   r   r   layerr#   r#   r$   r   x  s   


zVitsWaveNet.remove_weight_normr)   )	r   r   r   r   intr   r   r   __classcell__r#   r#   r   r$   r   1  s    
)r   c                       s,   e Zd Zdef fddZdddZ  ZS )VitsPosteriorEncoderr   c                    sR   t    |j| _t|j|jd| _t	||j
d| _t|j| jd d| _d S )Nr   r   rV   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   r#   r$   r     s
   
zVitsPosteriorEncoder.__init__Nc                 C   sf   |  || }| |||}| || }tj|| jdd\}}|t|t|  | }|||fS )Nr   rR   )r   r   r   r   splitr   
randn_likerG   )r   r;   r   r   statsmean
log_stddevsampledr#   r#   r$   r     s   
zVitsPosteriorEncoder.forwardr)   r   r   r   r   r   r   r   r#   r#   r   r$   r         r   c                       s@   e Zd Zd fdd	ZdddZd	d
 Zdd Zdd Z  ZS )HifiGanResidualBlockr   r   r      皙?c                    sb   t    |_t fddttD _t fddttD _d S )Nc                    s2   g | ]}t j  d | | dqS r   )strider   r   r   r   get_padding).0r   channelsr   r   r   r#   r$   
<listcomp>  s    	z1HifiGanResidualBlock.__init__.<locals>.<listcomp>c                    s*   g | ]}t j  d d d dqS r   r   r   _)r   r   r   r#   r$   r     s    	
)	r   r   leaky_relu_sloper   r   r   lenconvs1convs2)r   r   r   r   r   r   r   r$   r     s   

	
	
zHifiGanResidualBlock.__init__r   c                 C   s   || | d S )NrV   r#   )r   r   r   r#   r#   r$   r     s   z HifiGanResidualBlock.get_paddingc                 C   sL   t jj}tt jjdrt jjj}| jD ]}|| q| jD ]}|| qd S Nr   )r   r   r   r   r   r   r   r   r   r   r#   r#   r$   apply_weight_norm     




z&HifiGanResidualBlock.apply_weight_normc                 C   s4   | j D ]}tj| q| jD ]}tj| qd S r)   )r   r   r   r   r   r   r#   r#   r$   r     s
   

z'HifiGanResidualBlock.remove_weight_normc                 C   sX   t | j| jD ]"\}}|}tj|| j}||}tj|| j}||}|| }q|S r)   )zipr   r   r   rH   
leaky_relur   )r   r   conv1conv2residualr#   r#   r$   r     s   
zHifiGanResidualBlock.forward)r   r   r   r   )	r   r   r   r   r   r   r   r   r   r#   r#   r   r$   r     s    

r   c                       sV   e Zd Zdef fddZdd Zdd Z	dd	ejd
ejdB dejfddZ	  Z
S )VitsHifiGanr   c              
      sF  t    || _t|j| _t|j| _tj	|j
|jdddd| _t | _tt|j|jD ]$\}\}}| jtj|jd|  |jd|d   |||| d d q/t | _tt| jD ]#}|jd|d   }t|j|jD ]\}}| jt||||j qrq`tj	|dddddd| _|jdkrt	|j|jd| _d S d S )	N   r   r   )r   r   r   rV   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   r#   r$   r     s@   





zVitsHifiGan.__init__c                 C   sL   t jj}tt jjdrt jjj}| jD ]}|| q| jD ]}|  qd S r   )r   r   r   r   r   r   r   r   r   r#   r#   r$   r     r   zVitsHifiGan.apply_weight_normc                 C   s0   | j D ]}tj| q| jD ]}|  qd S r)   )r   r   r   r   r   r   r#   r#   r$   r     s
   


zVitsHifiGan.remove_weight_normNr   r   returnc                 C   s   |  |}|dur|| | }t| jD ]8}tj|| jj}| j	| |}| j
|| j  |}td| jD ]}|| j
|| j |  |7 }q7|| j }qtj|}| |}t|}|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        Nr   )r   r   r   r   r   rH   r   r   r   r   r   r   r   r   r*   )r   r   r   r   r   	res_statejr   r#   r#   r$   r     s   


zVitsHifiGan.forwardr)   )r   r   r   r   r   r   r   r   r    r   r   r#   r#   r   r$   r     s    $
r   c                       ,   e Zd Zdef fddZdddZ  ZS )	VitsResidualCouplingLayerr   c                    sR   t    |jd | _t| j|jd| _t||j	d| _
t|j| jd| _d S )NrV   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r   r   r   r#   r$   r   +  s
   
z"VitsResidualCouplingLayer.__init__NFc                 C   s   t j|| jgd dd\}}| || }| |||}| || }t |}	|sJ||t |	 |  }t j||gdd}
t 	|	ddg}|
|fS || t |	  | }t j||gdd}
|
d fS )NrV   r   rR   )
r   r   r   r   r   r   rD   rG   catr_   )r   r;   r   r   r?   
first_halfsecond_halfr   r   r   rL   log_determinantr#   r#   r$   r   3  s   
z!VitsResidualCouplingLayer.forwardNFr   r#   r#   r   r$   r   *  r   r   c                       r   )	VitsResidualCouplingBlockr   c                    s8   t    t | _t|jD ]
}| jt| qd S r)   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r   )r   r   r   r   r#   r$   r   F  s
   

z"VitsResidualCouplingBlock.__init__NFc                 C   sh   |s| j D ]}||||\}}t|dg}q|S t| j D ]}t|dg}||||dd\}}q|S )Nr   Tr?   )r  r   flipreversed)r   r;   r   r   r?   flowr   r#   r#   r$   r   L  s   
z!VitsResidualCouplingBlock.forwardr   r   r#   r#   r   r$   r   E      r   c                       s.   e Zd Zddef fddZd	ddZ  ZS )
VitsDilatedDepthSeparableConvr:   r   c                    s   t    |j}|j}|j| _t|| _t	 | _
t	 | _t	 | _t	 | _t| jD ]:}|| }|| | d }| j
tj||||||d | jt||d | jt| | jt| q.d S )NrV   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)r   r   dropout_rater   r   r   r   r   r   r#   r$   r   Y  s4   





z&VitsDilatedDepthSeparableConv.__init__Nc                 C   s   |d ur|| }t | jD ]E}| j| || }| j| |dddd}tj|}| j| |}| j	| |dddd}tj|}| 
|}|| }q|| S Nr   r9   )r   r   r  r  	transposer   rH   gelur  r  r   )r   r;   r   r   r   r   r#   r#   r$   r   u  s   

z%VitsDilatedDepthSeparableConv.forward)r:   r)   r   r#   r#   r   r$   r  X  s    r  c                       r   )	VitsConvFlowr   c                    sr   t    |j| _|jd | _|j| _|j| _	t
| j| jd| _t|| _t
| j| j| jd d  d| _d S )NrV   r   r   )r   r   r   filter_channelsdepth_separable_channelsr   duration_predictor_flow_binsrf   duration_predictor_tail_boundr@   r   r   r   r  conv_ddsr   r   r   r#   r$   r     s   

&zVitsConvFlow.__init__NFc                 C   s  t j|| jgd dd\}}| |}| |||}| || }|j\}}	}
|||	d|
dddd}|dd | j	f t
| j }|d| j	d| j	 f t
| j }|dd| j	 d f }t|||||| jd\}}t j||gdd| }|st || ddg}||fS |d fS )	NrV   r   rR   r9   r   r   .)r?   r@   )r   r   r   r   r  r   rZ   reshapepermuterf   mathrc   r  rO   r@   r   r_   )r   r;   r   r   r?   r   r   r   
batch_sizer   lengthr<   r=   r>   rM   rL   r   r#   r#   r$   r     s,   
$
	zVitsConvFlow.forwardr   r   r#   r#   r   r$   r    s    r  c                       r   )	VitsElementwiseAffiner   c                    sB   t    |j| _tt| jd| _tt| jd| _	d S Nr   )
r   r   r  r   r   	Parameterr   zeros	translate	log_scaler   r   r#   r$   r     s   
zVitsElementwiseAffine.__init__NFc                 C   sd   |s | j t| j|  }|| }t| j| ddg}||fS || j  t| j  | }|d fS Nr   rV   )r$  r   rG   r%  r_   )r   r;   r   r   r?   rL   r   r#   r#   r$   r     s   zVitsElementwiseAffine.forwardr   r   r#   r#   r   r$   r     r  r   c                       s&   e Zd Z fddZdddZ  ZS )	VitsStochasticDurationPredictorc                    s  t    |j}|j}t||d| _t||d| _t||j	d| _
|dkr/t||d| _t | _| jt| t|jD ]
}| jt| qAtd|d| _t||d| _t||j	d| _t | _| jt| t|jD ]
}| jt| qvd S )Nr   )r  r   )r   r   r   r   r   r   r   r   r  duration_predictor_dropoutr  r   r   r  r   r   r   duration_predictor_num_flowsr  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimr  r   r   r#   r$   r     s4   


z(VitsStochasticDurationPredictor.__init__NFrQ   c                 C   s  t |}| |}|d urt |}|| | }| ||}| || }|s	| |}| ||}| || }t 	|
dd|
dj|j|jd| }d}	|}
| jD ]}||
||| d\}
}t |
dg}
|	|7 }	qYt j|
ddgdd\}}|	t tj|tj|  | ddg7 }	t dtdtj |d   | ddg|	 }|t | | }t t |d| }t | ddg}t j||gdd}| jD ]}||||d\}}t |dg}||7 }qt d	tdtj |d   | ddg| }|| S tt| j}|d d
 |d g }t 	|
dd|
dj|j|jd| }|D ]}t |dg}||||dd\}}q3t j|ddgdd\}}|S )Nr   rV   )devicedtype)r   r   rR         gh㈵>g      ?r9   T)r   r?   )r   detachr   r   r  r   r*  r,  r+  randnsizetor/  r0  r-  r  r   r_   r   rH   
logsigmoidr  rF   pir+   	clamp_minr   r  listr  )r   r;   r   r   	durationsr?   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr  r   r   r   logqlog_determinant_sumlatentsnllr  r   log_durationr#   r#   r$   r     sh   



&



"*

0&z'VitsStochasticDurationPredictor.forward)NNFrQ   r   r   r   r   r   r   r#   r#   r   r$   r'    s     r'  c                       s&   e Zd Z fddZdddZ  ZS )VitsDurationPredictorc                    s   t    |j}|j}t|j| _tj|j	|||d d| _
tj||jd| _tj||||d d| _tj||jd| _t|dd| _|jdkrVt|j|j	d| _d S d S )NrV   )r   epsr   r   )r   r   r
  "duration_predictor_filter_channelsr   r   r(  r   r   r   conv_1r  layer_norm_epsnorm_1conv_2norm_2projr   r   )r   r   r   r  r   r#   r$   r   (  s   

zVitsDurationPredictor.__init__Nc                 C   s   t |}|d urt |}|| | }| || }t |}| |dddd}| |}| || }t |}| 	|dddd}| |}| 
|| }|| S r  )r   r3  r   rJ  relurL  r  r   rM  rN  rO  )r   r;   r   r   r#   r#   r$   r   7  s   





zVitsDurationPredictor.forwardr)   rE  r#   r#   r   r$   rF  '  s    rF  c                       s   e Zd ZdZdef fddZdejdedefdd	Z		
	
	ddejdejd
B dejd
B de
deejejd
B f f
ddZdd Zdd Zdd Z  ZS )VitsAttentionz?Multi-headed attention with relative positional representation.r   c                    s.  t    |j| _|j| _|j| _|j| _| j| j | _	| j	d | _
| j	| j | jkr8td| j d| j dtj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _| jrttd| jd d | j	| j
 | _ttd| jd d | j	| j
 | _d S d S )Nr1  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   rV   )r   r   r   r.  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingr[   r   Linearuse_biask_projv_projq_projout_projr"  r   r4  	emb_rel_k	emb_rel_vr   r   r#   r$   r   O  s*   

(,zVitsAttention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S r&  )viewrS  rV  r  
contiguous)r   r`  ra  rb  r#   r#   r$   _shapeh  s    zVitsAttention._shapeNFr   key_value_statesattention_maskoutput_attentionsr   c                 C   s  |  \}}}| || j }| | |d|}	| | |d|}
|| j d| jf}| |||j| }|	j| }	|
j| }
|	 d}t	
||	dd}|  || j ||fkrmtd|| j ||f d|   | jdur| | j|}t	||dd}| |}||7 }|dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd	}|r||| j||}||| j ||}nd}tjj|| j| jd
}t	
||
}|  || j || jfkrtd|| j|| jf d|   | jdur,| | j|}| |}t	||}||7 }||| j|| j}|dd}|||| j}| |}||fS )z#Input shape: Batch x Time x Channelr9   r   rV   z$Attention weights should be of size z	, but is Nr2  z!Attention mask should be of size rR   )ptrainingz `attn_output` should be of size )r5  r\  rW  re  rZ  r[  rS  rV  rc  r   bmmr  r[   rU  _get_relative_embeddingsr^  matmul'_relative_position_to_absolute_positionr   rH   r\   r   rj  r_  '_absolute_position_to_relative_positionr  r.  r]  )r   r   rf  rg  rh  rb  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightsr#   r#   r$   r   k  sf   






zVitsAttention.forwardc              	   C   sn   t || jd  d}|dkrtj|dd||ddg}t | jd | d}|d|  d }|d d ||f S )Nr   r   rV   )rY   rU  r   rH   r6   )r   relative_embeddingsr  
pad_lengthslice_start_positionslice_end_positionr#   r#   r$   rl    s   z&VitsAttention._get_relative_embeddingsc                 C   s   |  \}}}tj|g d}|||d | g}tj|d|d ddg}|||d d| d g}|d d d ||d d f }|S )N)r   r   r   r   r   r   rV   r   r   r5  r   rH   r6   rc  r   xbatch_headsr  r   x_flatx_finalr#   r#   r$   rn    s   z5VitsAttention._relative_position_to_absolute_positionc              	   C   s   |  \}}}tj|d|d ddddg}|||d| d  g}tj||dddg}|||d| gd d d d dd f }|S )Nr   r   rV   r  r  r#   r#   r$   ro    s   *z5VitsAttention._absolute_position_to_relative_position)NNF)r   r   r   r   r   r   r   Tensorr   re  boolr"   r   rl  rn  ro  r   r#   r#   r   r$   rQ  L  s*    
X	rQ  c                       s$   e Zd Z fddZdd Z  ZS )VitsFeedForwardc                    s   t    t|j|j|j| _t|j|j|j| _t	|j
| _t|jtr/t|j | _n|j| _|jdkrO|jd d }|jd }||ddddg| _d S d | _d S )Nr   rV   r   )r   r   r   r   r   ffn_dimffn_kernel_sizerJ  rM  r   activation_dropoutr   
isinstance
hidden_actstrr   act_fnr   )r   r   pad_left	pad_rightr   r#   r$   r     s   



zVitsFeedForward.__init__c                 C   s   | ddd}| ddd}|| }| jd urtj|| j}| |}| |}| |}|| }| jd ur?tj|| j}| |}|| }| ddd}|S )Nr   rV   r   )	r  r   r   rH   r6   rJ  r  r   rM  )r   r   r   r#   r#   r$   r     s   





zVitsFeedForward.forwardrE  r#   r#   r   r$   r    s    r  c                	       sL   e Zd Zdef fddZ		ddejdejdejdB d	efd
dZ	  Z
S )VitsEncoderLayerr   c                    sX   t    t|| _t|j| _tj|j	|j
d| _t|| _tj|j	|j
d| _d S )NrG  )r   r   rQ  	attentionr   r   hidden_dropoutr   r  r   rK  
layer_normr  feed_forwardfinal_layer_normr   r   r#   r$   r     s   


zVitsEncoderLayer.__init__NFr   r   rg  rh  c                 C   sp   |}| j |||d\}}| |}| || }|}| ||}| |}| || }|f}|r6||f7 }|S )N)r   rg  rh  )r  r   r  r  r  )r   r   r   rg  rh  r   rv  rL   r#   r#   r$   r     s    



zVitsEncoderLayer.forwardr   )r   r   r   r   r   r   r  r    r  r   r   r#   r#   r   r$   r    s    r  c                       sl   e Zd Zdef fddZ				ddejdejdejdB dedB d	edB d
edB de	e
B fddZ  ZS )VitsEncoderr   c                    sB   t     | _t fddt jD | _d| _ j	| _	d S )Nc                    s   g | ]}t  qS r#   )r  r   r   r#   r$   r   =  s    z(VitsEncoder.__init__.<locals>.<listcomp>F)
r   r   r   r   r   r   num_hidden_layerslayersgradient_checkpointing	layerdropr   r   r  r$   r   :  s
   
 zVitsEncoder.__init__Nr   r   rg  rh  output_hidden_statesreturn_dictr   c                 C   s   |rdnd }|r
dnd }t | j||d}|| }t pt| }	| jD ]5}
|r+||f }tjdd}| jo9|| j	k }|r>|	rJ|
||||d}|d }|rNd}|rW||d f }q"|| }|rc||f }|sqt
dd |||fD S t|||d	S )
Nr#   )r   inputs_embedsrg  r   r   )rg  r   rh  )NNc                 s   s    | ]	}|d ur|V  qd S r)   r#   )r   vr#   r#   r$   	<genexpr>u  s    z&VitsEncoder.forward.<locals>.<genexpr>)r&   r   r   )r
   r   r   r	   r  rE   randomuniformrj  r  r"   r   )r   r   r   rg  rh  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputsr#   r#   r$   r   A  sJ   	


zVitsEncoder.forward)NNNN)r   r   r   r   r   r   r    r  r  r"   r   r   r   r#   r#   r   r$   r  9  s*    r  c                       sv   e Zd ZdZdef fddZ				ddejdejd	ejdB d
e	dB de	dB de	dB de
ej eB fddZ  ZS )VitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    r   c                    sN   t    || _t|j|j|j| _t	|| _
tj|j|jd dd| _d S )NrV   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   r   r#   r$   r     s
   

zVitsTextEncoder.__init__NT	input_idsr   rg  rh  r  r  r   c                 C   s   |  |t| jj }| j||||||d}|s|d n|j}	| |	dddd| }
t	j
|
| jjdd\}}|sJ|	||f|dd   }|S t|	|||j|jdS )N)r   r   rg  rh  r  r  r   r   rV   rR   )r&   r'   r(   r   r   )r  r  rc   r   r   r  r&   r  r  r   r   r   r%   r   r   )r   r  r   rg  rh  r  r  r   encoder_outputsr&   r   r'   r(   rL   r#   r#   r$   r     s,   		zVitsTextEncoder.forward)NNNT)r   r   r   r   r   r   r   r  r    r  r"   r%   r   r   r#   r#   r   r$   r  ~  s,    r  c                   @   s:   e Zd ZU eed< dZdZdZe	 de
jfddZdS )	VitsPreTrainedModelr   vitsr  Tmodulec                 C   s  | j j}t|tjr"tj|jd|d |jdur t	|j dS dS t|tj
r6t	|j t|j dS t|tjtjfrft|j |jdurdt|j|j|jd   }tj|j| |d dS dS t|tjrtj|jd|d |jdurt|jddst	|j|j  dS dS dS t|tr| j jr| j j| j j }tj|j|d d	 tj|j|d d	 dS dS t|trt	|j t	|j  dS dS )
zInitialize the weightsr:   )r   stdNr   )r~   r   _is_hf_initializedFr1  )r  )!r   initializer_ranger  r   rX  initnormal_r   r   zeros_r  ones_r   r   kaiming_normal_r  rc   r	  r   r   uniform_r  padding_idxgetattrrQ  rU  r   rR  r^  r_  r   r$  r%  )r   r  r  krV  r#   r#   r$   _init_weights  s<   



z!VitsPreTrainedModel._init_weightsN)r   r   r   r   r!   base_model_prefixmain_input_namesupports_gradient_checkpointingr   no_gradr   Moduler  r#   r#   r#   r$   r    s   
 r  z@
    The complete VITS model, for text-to-speech synthesis.
    c                       s   e Zd Zdef fddZe							ddejdB dejdB dedB de	dB d	e	dB d
e	dB dej
dB dee eB fddZ  ZS )	VitsModelr   c                    s   t  | || _t|| _t|| _t|| _|j	r!t
|| _nt|| _|jdkr4t|j|j| _t|| _|j| _|j| _|j| _|   d S r!  )r   r   r   r  text_encoderr   r  r   decoder"use_stochastic_duration_predictionr'  duration_predictorrF  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_rater<  noise_scale_duration	post_initr   r   r#   r$   r     s   





zVitsModel.__init__Nr  rg  
speaker_idrh  r  r  labelsr   c           %      K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur&td| jjjj}	|dur9|	d
|	}
nt|	d
|	}
| j jdkr~|dur~d|  krZ| j jk sgn td| j jd  dt|trutjd|| jd	}| |	d}nd}| j||
||||d
}|s|d n|j}|dd}|
dd}
|s|d n|j}|s|d n|j}| j jr| j||
|d| jd}n| ||
|}d| j }tt||
 | }tt|ddgd  }tj!|" |j|jd}|	d|	dk }|	d
|
j}t	|
dt	|d }|j#\}}}}t$|d%|| d}tj!||j|jd}|	d|k }|
|j%|||}|t&j'(|g dddddf  }|	ddd| }t)|*d|dd}t)|*d|dd}|t+|t| | j,  }| j-|||dd} | | }!| .|!|}"|"*d}"|t/0| j j1 }#|s|"|#|!f|dd  }$|$S t2|"|#|!|j3|j4dS )a  
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nz&Training of VITS is not supported yet.r9   r   r   z Set `speaker_id` in the range 0-.r   )r5  
fill_valuer/  )r  r   rg  rh  r  r  rV   T)r?   r<  rQ   )r0  r/  )r   r   r   r   r   r   r   r  )r   r   r   r   r   )5r   rh  r  use_return_dictNotImplementedErrorr  r  r   r0  	unsqueezer6  r   	ones_liker  r[   r  r   fullr/  r  r&   r  r'   r(   r  r  r  r  ceilrG   r9  r_   longarangerY   rZ   r]   rc  r   rH   r6   rm  squeezer   r<  r  r  rE   prodr   r   r   r   )%r   r  rg  r  rh  r  r  r  kwargs
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r'   r(   rD  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskr  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsrB  r   r   r   rL   r#   r#   r$   r     s   &

&
zVitsModel.forward)NNNNNNN)r   r   r   r   r   r   r   r  r   r  r    r"   r   r   r   r   r#   r#   r   r$   r    s6    

r  )Fr4   r5   r5   r5   )@r   r  dataclassesr   typingr   numpyrE   r   r    r   r  activationsr   integrations.deepspeedr   integrations.fsdpr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r%   jitscriptr3   rO   rI   r  r   r   r   r   r   r   r  r  r   r'  rF  rQ  r  r  r  r  r  r  __all__r#   r#   r#   r$   <module>   s~   


J P>X.+d% *'E2% 