o
    Gi?x                     @   s(  d dl Z d dlmZ d dlm  mZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZ d
ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dZG dd dejZG dd dejZG dd deeeZdS )    N   )ConfigMixinregister_to_config)apply_forward_hook   )AutoencoderKLOutput)
ModelMixin   )AutoencoderMixinDecoderOutputDiagonalGaussianDistribution   c                       s   e Zd ZdZ					ddededeeeef B ded	eeeef B d
edededdf fddZde	j
de	j
fddZ  ZS )LTX2AudioCausalConv2dzQ
    A causal 2D convolution that pads asymmetrically along the causal axis.
    r	   Theightin_channelsout_channelskernel_sizestridedilationgroupsbiascausality_axisreturnNc	              
      s  t    || _t|tr||fn|}t|tr||fn|}|d d |d  }	|d d |d  }
| jdkrJ|
d |
|
d  |	d |	|	d  f}n+| jdv r\|
d|	d |	|	d  f}n| jdkrn|
d |
|
d  |	df}ntd| || _tj||||d|||d| _	d S )	Nr   r	   noner   >   widthwidth-compatibilityr   Invalid causality_axis: )r   paddingr   r   r   )
super__init__r   
isinstanceint
ValueErrorr   nnConv2dconv)selfr   r   r   r   r   r   r   r   pad_hpad_wr   	__class__ k/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.pyr   $   s0   

&

zLTX2AudioCausalConv2d.__init__xc                 C   s   t || j}| |S N)Fpadr   r%   r&   r-   r+   r+   r,   forwardM   s   
zLTX2AudioCausalConv2d.forward)r	   r	   r	   Tr   )__name__
__module____qualname____doc__r!   tupleboolstrr   torchTensorr2   __classcell__r+   r+   r)   r,   r      s6    		
)r   c                       sF   e Zd ZdZddededdf fdd	Zd
ejdejfddZ	  Z
S )LTX2AudioPixelNormz;
    Per-pixel (per-location) RMS normalization layer.
    r	   :0yE>dimepsr   Nc                    s   t    || _|| _d S r.   )r   r   r?   r@   )r&   r?   r@   r)   r+   r,   r   W   s   

zLTX2AudioPixelNorm.__init__r-   c                 C   s.   t j|d | jdd}t || j }|| S )Nr   T)r?   keepdim)r:   meanr?   sqrtr@   )r&   r-   mean_sqrmsr+   r+   r,   r2   \   s   zLTX2AudioPixelNorm.forward)r	   r>   )r3   r4   r5   r6   r!   floatr   r:   r;   r2   r<   r+   r+   r)   r,   r=   R   s    r=   c                       sD   e Zd Z	ddededdf fddZdejdejfd	d
Z  Z	S )LTX2AudioAttnBlockgroupr   	norm_typer   Nc                    s   t    || _|dkrtjd|ddd| _n|dkr#tddd| _ntd	| tj||ddd
d| _	tj||ddd
d| _
tj||ddd
d| _tj||ddd
d| _d S )NrH       ư>T
num_groupsnum_channelsr@   affinepixelr	   r?   r@   Invalid normalization type: r   r   r   r   )r   r   r   r#   	GroupNormnormr=   r"   r$   qkvproj_out)r&   r   rI   r)   r+   r,   r   c   s   
zLTX2AudioAttnBlock.__init__r-   c                 C   s   |  |}| |}| |}| |}|j\}}}}	|||||	 ddd }|||||	  }t	||t
|d  }
tjjj|
dd}
|||||	 }|
ddd }
t	||
||||	}| |}|| S )Nr   r   r	   g      )r?   )rU   rV   rW   rX   shapereshapepermute
contiguousr:   bmmr!   r#   
functionalsoftmaxrY   )r&   r-   h_rV   rW   rX   batchchannelsr   r   attnr+   r+   r,   r2   v   s   



 
zLTX2AudioAttnBlock.forward)rH   )
r3   r4   r5   r!   r9   r   r:   r;   r2   r<   r+   r+   r)   r,   rG   b   s    rG   c                       sr   e Zd Z						ddededB d	ed
ededededdf fddZddej	dej	dB dej	fddZ
  ZS )LTX2AudioResnetBlockNF           rH   r   r   r   conv_shortcutdropouttemb_channelsrI   r   r   c                    s  t    || _| jd ur| jdkr|dkrtd|| _|d u r#|n|}|| _|| _|dkr:tjd|ddd| _	n|dkrFt
d	dd
| _	ntd| t | _|d urat||dd	|d| _ntj||dd	d	d| _|dkrwt||| _|dkrtjd|ddd| _n|dkrt
d	dd
| _ntd| t|| _|d urt||dd	|d| _ntj||dd	d	d| _| j| jkr| jr|d urt||dd	|d| _d S tj||dd	d	d| _d S |d urt||d	d	|d| _d S tj||d	d	dd| _d S d S )Nr   rH   z3Causal ResnetBlock with GroupNorm is not supported.rJ   rK   TrL   rP   r	   rQ   rR   r   r   r   r   rS   r   )r   r   r   r"   r   r   use_conv_shortcutr#   rT   norm1r=   SiLUnon_linearityr   conv1r$   Linear	temb_projnorm2Dropoutri   conv2rh   nin_shortcut)r&   r   r   rh   ri   rj   rI   r   r)   r+   r,   r      sZ   








zLTX2AudioResnetBlock.__init__r-   tembc                 C   s   |  |}| |}| |}|d ur'|| | |d d d d d d f  }| |}| |}| |}| |}| j| jkrN| j	rI| 
|n| |}|| S r.   )rm   ro   rp   rr   rs   ri   ru   r   r   rl   rh   rv   )r&   r-   rw   hr+   r+   r,   r2      s   


(



zLTX2AudioResnetBlock.forward)NFrf   rg   rH   r   r.   )r3   r4   r5   r!   r8   rF   r9   r   r:   r;   r2   r<   r+   r+   r)   r,   re      s2    	*@re   c                	       J   e Zd ZddedededB ddf fddZd	ejdejfd
dZ	  Z
S )LTX2AudioDownsampler   r   	with_convr   Nr   c                    s<   t    || _|| _| jrtjj||dddd| _d S d S )Nr   r   r   rS   )r   r   r{   r   r:   r#   r$   r%   r&   r   r{   r   r)   r+   r,   r      s   
zLTX2AudioDownsample.__init__r-   c                 C   s   | j r<| jdkrd}n!| jdkrd}n| jdkrd}n| jdkr#d}n	td	| j d
tj||ddd}| |}|S tj|ddd}|S )Nr   )r   r	   r   r	   r   )r   r   r   r	   r   )r   r	   r   r   r   )r	   r   r   r	   zInvalid `causality_axis` zL; supported values are `none`, `width`, `height`, and `width-compatibility`.constantr   )modevaluer   )r   r   )r{   r   r"   r/   r0   r%   
avg_pool2d)r&   r-   r0   r+   r+   r,   r2      s"   




zLTX2AudioDownsample.forwardr   r3   r4   r5   r!   r8   r9   r   r:   r;   r2   r<   r+   r+   r)   r,   rz      s    $rz   c                	       ry   )LTX2AudioUpsampler   r   r{   r   Nr   c                    sZ   t    || _|| _| jr+|d urt||dd|d| _d S tj||dddd| _d S d S )Nr   r	   rk   rS   )r   r   r{   r   r   r%   r#   r$   r|   r)   r+   r,   r     s   

zLTX2AudioUpsample.__init__r-   c                 C   s   t jjj|ddd}| jr]| |}| jd u s| jdkr	 |S | jdkr6|d d d d dd d d f }|S | jdkrM|d d d d d d dd f }|S | jdkrU	 |S td	| j |S )
Ng       @nearest)scale_factorr~   r   r   r	   r   r   r   )r:   r#   r_   interpolater{   r%   r   r"   r1   r+   r+   r,   r2     s"   


 
 
zLTX2AudioUpsample.forwardr   r   r+   r+   r)   r,   r      s    $r   c                   @   s   e Zd ZdZ				ddededed	ed
ef
ddZdejdejfddZ	dejdededejfddZ
edeeeef fddZdS )LTX2AudioAudioPatchifierz3
    Patchifier for spectrogram/audio latents.
    >     r   T
patch_sizesample_rate
hop_lengthaudio_latent_downsample_factor	is_causalc                 C   s(   || _ || _|| _|| _d||f| _d S )Nr	   )r   r   r   r   _patch_size)r&   r   r   r   r   r   r+   r+   r,   r   $  s
   z!LTX2AudioAudioPatchifier.__init__audio_latentsr   c                 C   s,   |j \}}}}|dddd|||| S Nr   r   r	   r   )rZ   r\   r[   )r&   r   rb   rc   timefreqr+   r+   r,   patchify2  s   z!LTX2AudioAudioPatchifier.patchifyrc   mel_binsc                 C   s(   |j \}}}|||||ddddS r   )rZ   viewr\   )r&   r   rc   r   rb   r   _r+   r+   r,   
unpatchify6  s   z#LTX2AudioAudioPatchifier.unpatchifyc                 C   s   | j S r.   )r   r&   r+   r+   r,   r   :  s   z#LTX2AudioAudioPatchifier.patch_sizeN)r   r   r   T)r3   r4   r5   r6   r!   r8   r   r:   r;   r   r   propertyr7   r   r+   r+   r+   r,   r     s*    
r   c                #       s   e Zd Z												
						d(dedededeedf dB dedededeedf dededB dededededed edB d!ef" fd"d#Zd$e	j
d%e	j
fd&d'Z  ZS ))LTX2AudioEncoder   r	   r   N      r	   r   r   rH   r   rf   Fr   r   T@   base_channelsoutput_channelsnum_res_blocksattn_resolutions.r   
resolutionlatent_channelsch_multrI   r   ri   mid_block_add_attentionr   mel_hop_lengthr   r   double_zc                    s  t    || _|| _|| _|| _|| _d| _t|| _	|| _
|| _|| _|| _d| _d| _|	| _|| _|| _|| _|
| _|}|}d|||f| _| jd urYt||dd| jd| _ntj||dddd| _t | _|}| j}t| j	D ]`}t }t |_t |_| j| j|  }t| j
D ](}|j t!||| j|| j| jd |}| jr|| jv r|j t"|| jd q|| j	d krt#|d	| jd
|_$|d }| j | qst | _%t!||| j|| j| jd| j%_&|rt"|| jd| j%_'nt( | j%_'t!||| j|| j| jd| j%_)|}|rd| n|}| jdkr%tj*d|dd	d| _+n| jdkr3t,ddd| _+nt-d| j t. | _/| jd urSt||dd| jd| _0d S tj||dddd| _0d S )Nr   Fr	   r   rk   rS   r   r   rj   ri   rI   r   rI   Tr   r   rH   rJ   rK   rL   rP   rQ   rR   )1r   r   r   r   r   r   r   temb_chlennum_resolutionsr   r   r   out_chgive_pre_endtanh_outrI   r   channel_multipliersr   r   z_shaper   conv_inr#   r$   
ModuleListdownrangeModuleblockrd   appendre   rG   rz   
downsamplemidblock_1attn_1Identityblock_2rT   norm_outr=   r"   rn   ro   conv_out)r&   r   r   r   r   r   r   r   r   rI   r   ri   r   r   r   r   r   r   base_block_channelsbase_resolutionblock_incurr_reslevelstage	block_outr   final_block_channels
z_channelsr)   r+   r,   r   @  s   











	
zLTX2AudioEncoder.__init__hidden_statesr   c                 C   s   |  |}t| jD ]2}| j| }t|jD ]\}}||d d}|jr*|j| |}q|| jd kr<t|dr<||}q
| j	j
|d d}| j	|}| j	j|d d}| |}| |}| |}|S )Nrw   r	   r   )r   r   r   r   	enumerater   rd   hasattrr   r   r   r   r   r   ro   r   )r&   r   r   r   	block_idxr   r+   r+   r,   r2     s$   





zLTX2AudioEncoder.forward)r   r	   r   Nr   r   r   r   rH   r   rf   Fr   r   Tr   T)r3   r4   r5   r!   r7   r9   rF   r8   r   r:   r;   r2   r<   r+   r+   r)   r,   r   ?  sl    
	
|r   c                #       s   e Zd ZdZ											
						d(dedededeedf dB dedededeedf dededB dedededed ed!edB d"df" fd#d$Z	d%e
jd"e
jfd&d'Z  ZS ))LTX2AudioDecoderz
    Symmetric decoder that reconstructs audio spectrograms from latent features.

    The decoder mirrors the encoder structure with configurable channel multipliers, attention resolutions, and causal
    convolutions.
    r   r	   r   Nr   r   r   rH   r   rf   Fr   r   Tr   r   r   r   r   .r   r   r   r   rI   r   ri   r   r   r   r   r   r   c                    s  t    || _|| _|| _|| _tdt|||d| _|| _	d| _
t|| _|| _|| _|| _|| _d| _d| _|	| _|| _|| _|| _|
| _|| jd  }|d| jd   }d|||f| _| jd urot||dd| jd| _ntj||dddd	| _t | _t | _ t!||| j
|| j| jd
| j _"|rt#|| jd| j _$nt% | j _$t!||| j
|| j| jd
| j _&t' | _(|}| jd| jd   }t)t*| jD ]c}t }t' |_+t' |_,| j	| j|  }t*| jd D ]*}|j+-t!||| j
|| j| jd
 |}| jr|| jv r|j,-t#|| jd q|dkr(t.|d| jd|_/|d9 }| j(0d| q|}| jdkrCtj1d|ddd| _2n| jdkrQt3ddd| _2nt4d| j | jd urlt||dd| jd| _5d S tj||dddd	| _5d S )Nr	   )r   r   r   r   r   r   Fr   r   rk   rS   r   r   Tr   rH   rJ   rK   rL   rP   rQ   rR   )6r   r   r   r   r   r   r   LATENT_DOWNSAMPLE_FACTOR
patchifierr   r   r   r   r   r   r   r   r   r   rI   r   r   r   r   r   r   r   r#   r$   rn   ro   r   r   re   r   rG   r   r   r   r   upreversedr   r   rd   r   r   upsampleinsertrT   r   r=   r"   r   )r&   r   r   r   r   r   r   r   r   rI   r   ri   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r)   r+   r,   r     s   








	



zLTX2AudioDecoder.__init__samplec                 C   s  |j \}}}}|t }| jd urt|td  d}| j}| jd ur$| jn|}| |}| jj|d d}| j	|}| jj
|d d}tt| jD ]/}	| j|	 }
t|
jD ]\}}||d d}|
jrh|
j| |}qT|	dkrwt|
drw|
|}qH| jr}|S | |}| |}| |}| jrt|n|}|j \}}}}|}|}|d d d |d t||d t||f }||j d  }||j d  }|dks|dkrdt|ddt|df}t||}|d d d |d |d |f }|S )Nr	   r   r   r   r   r   )rZ   r   r   maxr   r   r   r   r   r   r   r   r   r   r   r   r   rd   r   r   r   r   ro   r   r   r:   tanhminr/   r0   )r&   r   r   framesr   target_framestarget_channelstarget_mel_binshidden_featuresr   r   r   r   hiddendecoded_outputcurrent_timecurrent_freqtarget_timetarget_freqtime_padding_neededfreq_padding_neededr   r+   r+   r,   r2   ^  sV   






& zLTX2AudioDecoder.forward)r   r	   r   Nr   r   r   r   rH   r   rf   Fr   r   Tr   )r3   r4   r5   r6   r!   r7   r9   rF   r8   r   r:   r;   r2   r<   r+   r+   r)   r,   r     sv    	
	
 r   c                &       sP  e Zd ZdZdZe											
							d5dededeedf dedeedf dB dedededededB de	de
dedede
d edB d!e
d"df$ fd#d$Zd%ejd"ejfd&d'Zed6d%ejd(e
fd)d*Zd+ejd"ejfd,d-Zed6d+ejd(e
d"eejB fd.d/Z			d7d0ejd1e
d(e
d2ejdB d"eejB f
d3d4Z  ZS )8AutoencoderKLLTX2AudiozP
    LTX2 audio VAE for encoding and decoding audio latent representations.
    Fr   r   r   Nr   r   rP   r   rf   r   r   Tr   r   r   r   .r   r   r   r   r   rI   r   ri   r   r   r   r   r   r   r   c                    sj  t    h d}|
|vrtd|
d| |rt|n|}tdi d|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|d|d|d|| _tdi d|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|d|d|| _t	|f}t
|f}| jd|dd | jd|dd t| _t| _d| _d S )N>   r   r   r   r   zcausality_axis=z! is not valid. Supported values: r   r   r   r   r   r   r   r   rI   r   ri   r   r   r   r   r   r   latents_meanT)
persistentlatents_stdFr+   )r   r   r"   setr   encoderr   decoderr:   oneszerosregister_bufferr   temporal_compression_ratiomel_compression_ratiouse_slicing)r&   r   r   r   r   r   r   r   r   rI   r   ri   r   r   r   r   r   r   supported_causality_axesattn_resolution_setr   r   r)   r+   r,   r     s   
	
	

zAutoencoderKLLTX2Audio.__init__r-   c                 C   
   |  |S r.   )r   r1   r+   r+   r,   _encode     
zAutoencoderKLLTX2Audio._encodereturn_dictc                    s^    j r|jd dkr fdd|dD }t|}n |}t|}|s*|fS t|dS )Nr   r	   c                       g | ]}  |qS r+   )r   ).0x_slicer   r+   r,   
<listcomp>      z1AutoencoderKLLTX2Audio.encode.<locals>.<listcomp>)latent_dist)r   rZ   splitr:   catr   r   r   )r&   r-   r   encoded_slicesrx   	posteriorr+   r   r,   encode  s   

zAutoencoderKLLTX2Audio.encodezc                 C   r   r.   )r   )r&   r
  r+   r+   r,   _decode  r   zAutoencoderKLLTX2Audio._decodec                    sV    j r|jd dkr fdd|dD }t|}n |}|s&|fS t|dS )Nr   r	   c                    r   r+   )r  )r   z_slicer   r+   r,   r  
  r  z1AutoencoderKLLTX2Audio.decode.<locals>.<listcomp>)r   )r   rZ   r  r:   r  r  r   )r&   r
  r   decoded_slicesdecodedr+   r   r,   decode  s   

zAutoencoderKLLTX2Audio.decoder   sample_posterior	generatorc                 C   s@   |  |j}|r|j|d}n| }| |}|s|jfS |S )N)r  )r	  r  r   r~   r  )r&   r   r  r   r  r  r
  decr+   r+   r,   r2     s   
zAutoencoderKLLTX2Audio.forward)r   r   r   r   Nr   r   r   rP   r   rf   Fr   r   Tr   T)T)FTN)r3   r4   r5   r6    _supports_gradient_checkpointingr   r!   r7   r9   rF   r8   r   r:   r;   r   r   r	  r  r   r  	Generatorr2   r<   r+   r+   r)   r,   r     s    
	
P"r   )r:   torch.nnr#   torch.nn.functionalr_   r/   configuration_utilsr   r   utils.accelerate_utilsr   modeling_outputsr   modeling_utilsr   vaer
   r   r   r   r   r   r=   rG   re   rz   r   r   r   r   r   r+   r+   r+   r,   <module>   s*   3(T"   H