o
    ۷i                     @   sr   d dl Z d dlZd dlmZ d dlm  mZ ddlmZm	Z	 ddl
mZ G dd dejZG dd deeZdS )	    N   )ConfigMixinregister_to_config)
ModelMixinc                       s`   e Zd Z					ddededed	eed
f dedef fddZdej	dej	fddZ
  ZS )ResBlockr      r   r      皙?samechannelskernel_sizestride	dilations.leaky_relu_negative_slopepadding_modec                    sb   t    || _|| _t fdd|D | _t fddtt|D | _	d S )Nc              
      s"   g | ]}t j  |d qS )r   dilationpaddingnnConv1d).0r   r   r   r   r    V/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/ltx2/vocoder.py
<listcomp>       z%ResBlock.__init__.<locals>.<listcomp>c              
      s"   g | ]}t j  d dqS )r   r   r   )r   _r   r   r   r   !   r   )
super__init__r   negative_sloper   
ModuleListconvs1rangelenconvs2)selfr   r   r   r   r   r   	__class__r   r   r       s   
	

zResBlock.__init__xreturnc                 C   sT   t | j| jD ] \}}tj|| jd}||}tj|| jd}||}|| }q|S )Nr!   )zipr#   r&   F
leaky_relur!   )r'   r*   conv1conv2xtr   r   r   forward'   s   
zResBlock.forward)r   r   r   r
   r   )__name__
__module____qualname__inttuplefloatstrr    torchTensorr3   __classcell__r   r   r(   r   r      s(    
r   c                       s   e Zd ZdZedddg dg dg dg dg dg dgd	d
f	dedededee dee dee deee  dedef fddZdde	j
dede	j
fddZ  ZS )LTX2Vocoderz\
    LTX 2.0 vocoder for converting generated mel spectrograms back to audio waveforms.
       i      )            rD   )   r	   r@   r@   r@   )r         r   r
   i]  in_channelshidden_channelsout_channelsupsample_kernel_sizesupsample_factorsresnet_kernel_sizesresnet_dilationsr   output_sampling_ratec
              
      sN  t    t|| _t|| _|| _t|| _|| _	| jt|kr0t
d| j dt| d| jt|krGt
dt| j dt| dtj||dddd| _t | _t | _|}
tt||D ]4\}\}}|
d	 }| jtj|
||||| d	 d
 t||D ]\}}| jt||||d q|}
qetj||dddd
| _d S )Nza`upsample_kernel_sizes` and `upsample_factors` should be lists of the same length but are length z and z, respectively.z_`resnet_kernel_sizes` and `resnet_dilations` should be lists of the same length but are length rF   r   r   )r   r   r   r@   )r   r   )r   r   )r   r    r%   num_upsample_layersresnets_per_upsamplerJ   mathprodtotal_upsample_factorr!   
ValueErrorr   r   conv_inr"   
upsamplersresnets	enumerater-   appendConvTranspose1dr   conv_out)r'   rH   rI   rJ   rK   rL   rM   rN   r   rO   input_channelsir   r   output_channelsr   r(   r   r   r    6   s`   






zLTX2Vocoder.__init__Fhidden_states	time_lastr+   c                    s   |s  dd  dd   tjD ]6}tj jd j|   |j	 }|d j	 }t
j fddt||D dd}t
j|dd qtj d	d   t
   S )
a  
        Forward pass of the vocoder.

        Args:
            hidden_states (`torch.Tensor`):
                Input Mel spectrogram tensor of shape `(batch_size, num_channels, time, num_mel_bins)` if `time_last`
                is `False` (the default) or shape `(batch_size, num_channels, num_mel_bins, time)` if `time_last` is
                `True`.
            time_last (`bool`, *optional*, defaults to `False`):
                Whether the last dimension of the input is the time/frame dimension or the Mel bins dimension.

        Returns:
            `torch.Tensor`:
                Audio waveform tensor of shape (batch_size, out_channels, audio_length)
        r@   r   r   r,   c                    s   g | ]	}j |  qS r   )rX   )r   jr`   r'   r   r   r      s    z'LTX2Vocoder.forward.<locals>.<listcomp>r   )dimg{Gz?)	transposeflattenrV   r$   rP   r.   r/   r!   rW   rQ   r;   stackmeanr\   tanh)r'   r`   ra   r^   startendresnet_outputsr   rc   r   r3   t   s   

$

zLTX2Vocoder.forward)F)r4   r5   r6   __doc__r   r7   listr9   r    r;   r<   boolr3   r=   r   r   r(   r   r>   1   s@    
	
$=r>   )rR   r;   torch.nnr   torch.nn.functional
functionalr.   configuration_utilsr   r   models.modeling_utilsr   Moduler   r>   r   r   r   r   <module>   s    &