import inspect
from dataclasses import dataclass
from typing import Any

import torch
import torch.nn as nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import BaseOutput, apply_lora_scale, is_torch_version, logging
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward
from ..attention_dispatch import dispatch_attention_fn
from ..cache_utils import CacheMixin
from ..embeddings import PixArtAlphaCombinedTimestepSizeEmbeddings, PixArtAlphaTextProjection
from ..modeling_utils import ModelMixin
from ..normalization import RMSNorm


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def apply_interleaved_rotary_emb(x: torch.Tensor, freqs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
    # Interleaved RoPE: adjacent feature pairs (2i, 2i + 1) are rotated together
    # by the same angle.
    cos, sin = freqs
    x_real, x_imag = x.unflatten(-1, (-1, 2)).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(-2)
    out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
    return out


def apply_split_rotary_emb(x: torch.Tensor, freqs: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
    # Split ("rotate-half") RoPE: the feature vector is cut into two contiguous
    # halves and each (first, second) pair is rotated by the same angle.
    cos, sin = freqs
    x_dtype = x.dtype

    needs_reshape = False
    if x.ndim == 4 and cos.ndim == 3:
        # x arrives as (batch, heads, seq, head_dim) while the cos/sin tables are
        # per token over the full attention width; fold the heads back into the
        # feature axis so the tables line up.
        b, h, t, _last = x.shape
        x = x.swapaxes(1, 2).reshape(b, t, -1)
        needs_reshape = True

    r = x.shape[-1]
    if r % 2 != 0:
        raise ValueError(f"Expected x.shape[-1] to be even for split rotary, got {r}.")

    split_x = x.float().reshape(*x.shape[:-1], 2, -1)
    first_x = split_x[..., 0, :]
    second_x = split_x[..., 1, :]

    cos_u = cos.unsqueeze(-2)
    sin_u = sin.unsqueeze(-2)

    # first' = first * cos - second * sin; second' = second * cos + first * sin
    out = split_x * cos_u + torch.stack([-second_x, first_x], dim=-2) * sin_u
    out = out.reshape(*out.shape[:-2], r)

    if needs_reshape:
        out = out.reshape(b, t, h, _last).swapaxes(1, 2)
    out = out.to(dtype=x_dtype)
    return out


@dataclass
class AudioVisualModelOutput(BaseOutput):
    r"""
    Holds the output of an audiovisual model which produces both visual (e.g. video) and audio outputs.

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
            The hidden states output conditioned on the `encoder_hidden_states` input, representing the visual output
            of the model. This is typically a video (spatiotemporal) output.
        audio_sample (`torch.Tensor` of shape `(batch_size, TODO)`):
            The audio output of the audiovisual model.
    """

    sample: torch.Tensor
    audio_sample: torch.Tensor


class LTX2AdaLayerNormSingle(nn.Module):
    r"""
    Norm layer adaptive layer norm single (adaLN-single).

    As proposed in PixArt-Alpha (see: https://huggingface.co/papers/2310.00426; Section 2.3) and adapted by the LTX-2.0
    model. In particular, the number of modulation parameters to be calculated is now configurable.

    Parameters:
        embedding_dim (`int`): The size of each embedding vector.
        num_mod_params (`int`, *optional*, defaults to `6`):
            The number of modulation parameters which will be calculated in the first return argument. The default of 6
            is standard, but sometimes we may want to have a different (usually smaller) number of modulation
            parameters.
        use_additional_conditions (`bool`, *optional*, defaults to `False`):
            Whether to use additional conditions for normalization or not.
       Fembedding_dimnum_mod_paramsuse_additional_conditionsc                    sJ   t    || _t||d |d| _t | _tj|| j| dd| _	d S )Nr   )size_emb_dimrX   Tbias)
super__init__rW   r   embnnSiLUsiluLinearlinear)selfrV   rW   rX   	__class__r-   r.   r]   y   s   


zLTX2AdaLayerNormSingle.__init__Ntimestepadded_cond_kwargs
batch_sizehidden_dtyper   c                 C   s>   |pd d d}| j |fi |||d}| | ||fS )N)
resolutionaspect_ratiori   rj   )r^   rc   ra   )rd   rg   rh   ri   rj   embedded_timestepr-   r-   r.   forward   s   zLTX2AdaLayerNormSingle.forward)rU   F)NNN)rO   rP   rQ   rR   intboolr]   r!   Tensordictstrr&   tuplero   __classcell__r-   r-   re   r.   rT   h   s"    rT   c                   @   s   e Zd ZdZdZdZdd Z				ddddejdejdB d	ejdB d
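# Sketch of how the adaLN-single wrapper is typically driven (illustrative; the
# dimensions below are arbitrary, not model defaults). A single linear pass over
# the SiLU-activated timestep embedding yields all modulation parameters at once:
#
#     ada_ln = LTX2AdaLayerNormSingle(embedding_dim=512, num_mod_params=6)
#     timestep = torch.randint(0, 1000, (2,)).float()
#     mod_params, embedded_t = ada_ln(timestep, batch_size=2, hidden_dtype=torch.float32)
#     # mod_params: (2, 6 * 512), later reshaped into shift/scale/gate chunks
#     # embedded_t: (2, 512)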


class LTX2AudioVideoAttnProcessor:
    r"""
    Processor for implementing attention (SDPA is used by default if you're using PyTorch 2.0) for the LTX-2.0 model.
    Compared to the LTX-1.0 model, we allow the RoPE embeddings for the queries and keys to be separate so that we can
    support audio-to-video (a2v) and video-to-audio (v2a) cross attention.
    """

    _attention_backend = None
    _parallel_config = None

    def __init__(self):
        if is_torch_version("<", "2.0"):
            raise ValueError(
                "LTX attention processors require a minimum PyTorch version of 2.0. Please upgrade your PyTorch "
                "installation."
            )

    def __call__(
        self,
        attn: "LTX2Attention",
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        query_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        key_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
    ) -> torch.Tensor:
        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        query = attn.to_q(hidden_states)
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.norm_q(query)
        key = attn.norm_k(key)

        # RoPE is applied over the full attention width, before the heads are
        # split out. Queries and keys may use different tables so that a2v/v2a
        # cross-attention can rotate each modality with its own positions.
        if query_rotary_emb is not None:
            if attn.rope_type == "interleaved":
                query = apply_interleaved_rotary_emb(query, query_rotary_emb)
                key = apply_interleaved_rotary_emb(
                    key, key_rotary_emb if key_rotary_emb is not None else query_rotary_emb
                )
            elif attn.rope_type == "split":
                query = apply_split_rotary_emb(query, query_rotary_emb)
                key = apply_split_rotary_emb(key, key_rotary_emb if key_rotary_emb is not None else query_rotary_emb)

        query = query.unflatten(2, (attn.heads, -1))
        key = key.unflatten(2, (attn.heads, -1))
        value = value.unflatten(2, (attn.heads, -1))

        hidden_states = dispatch_attention_fn(
            query,
            key,
            value,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=False,
            backend=self._attention_backend,
            parallel_config=self._parallel_config,
        )
        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.to(query.dtype)

        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)
        return hidden_states
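# Minimal sketch of the processor in isolation (illustrative; sizes are
# assumptions): self-attention over a token stream with split-layout RoPE,
# using the `LTX2Attention` module defined below.
#
#     attn = LTX2Attention(query_dim=2048, heads=32, kv_heads=32, dim_head=64, rope_type="split")
#     tokens = torch.randn(1, 256, 2048)
#     angles = torch.rand(1, 256, 1024)          # 2048 // 2 angles per token
#     out = attn(tokens, query_rotary_emb=(angles.cos(), angles.sin()))
#     assert out.shape == tokens.shape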


class LTX2Attention(torch.nn.Module, AttentionModuleMixin):
    r"""
    Attention class for all LTX-2.0 attention layers. Compared to LTX-1.0, this supports specifying the query and key
    RoPE embeddings separately for audio-to-video (a2v) and video-to-audio (v2a) cross-attention.
       @   r   TNrms_norm_across_headsư>r   	query_dimr   kv_headsdim_headdropoutr[   cross_attention_dimout_biasqk_normnorm_epsnorm_elementwise_affiner   c                    sL  t    |	dkrtd|| _|| | _|d u r| jn|| | _|| _|d ur*|n|| _|| _|| _	|| _
|| _|| _tjj|| |
|d| _tjj|| |
|d| _tjj|| j|d| _tjj| j| j|d| _tjj| j| j|d| _tjg | _| jtjj| j| j
|d | jtj| |d u r|  }| | d S )Nr   zIOnly 'rms_norm_across_heads' is supported as a valid value for `qk_norm`.epselementwise_affinerZ   )r\   r]   NotImplementedErrorhead_dim	inner_diminner_kv_dimr   r   use_biasr   out_dimr   r   r!   r_   r   r   r   rb   r   r   r   
ModuleListr   appendDropout_default_processor_clsset_processor)rd   r   r   r   r   r   r[   r   r   r   r   r   r   	processorre   r-   r.   r]      s0   

zLTX2Attention.__init__r{   r|   r}   r~   r   r   c                    s   t t| jjj   fdd| D }t|dkr,t	
d| d| jjj d  fdd| D }| j| |||||fi |}|S )	Nc                    s   g | ]
\}}| vr|qS r-   r-   ).0krA   attn_parametersr-   r.   
<listcomp>  s    z)LTX2Attention.forward.<locals>.<listcomp>r   zattention_kwargs z are not expected by z and will be ignored.c                    s   i | ]\}}| v r||qS r-   r-   )r   r   wr   r-   r.   
<dictcomp>   s    z)LTX2Attention.forward.<locals>.<dictcomp>)setinspect	signaturer   r   
parameterskeysitemslenloggerwarningrf   rO   )rd   r{   r|   r}   r~   r   kwargsunused_kwargsr-   r   r.   ro     s   	zLTX2Attention.forward)r   r   r   r   TNTr   r   Tr   Nr   )rO   rP   rQ   rR   rw   r   _available_processorsrp   r$   rq   rt   r]   r!   rr   ru   ro   rv   r-   r-   re   r.   rz      st    	
/rz   c                '       s@  e Zd ZdZ							d.ded	ed
edededededededededededef fddZ								d/de	j
de	j
de	j
de	j
de	j
de	j
de	j
d e	j
d!e	j
d"e	j
d#ee	j
e	j
f dB d$ee	j
e	j
f dB d%ee	j
e	j
f dB d&ee	j
e	j
f dB d'e	j
dB d(e	j
dB d)e	j
dB d*e	j
dB d+e	j
f&d,d-Z  ZS )0LTX2VideoTransformerBlocka  
    Transformer block used in [LTX-2.0](https://huggingface.co/Lightricks/LTX-Video).

    Args:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        qk_norm (`str`, defaults to `"rms_norm_across_heads"`):
            The normalization layer to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to use in feed-forward.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        cross_attention_dim: int,
        audio_dim: int,
        audio_num_attention_heads: int,
        audio_attention_head_dim: int,
        audio_cross_attention_dim: int,
        qk_norm: str = "rms_norm_across_heads",
        activation_fn: str = "gelu-approximate",
        attention_bias: bool = True,
        attention_out_bias: bool = True,
        eps: float = 1e-6,
        elementwise_affine: bool = False,
        rope_type: str = "interleaved",
    ):
        super().__init__()

        # 1. Self-attention (video and audio streams)
        self.norm1 = RMSNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
        self.attn1 = LTX2Attention(
            query_dim=dim,
            heads=num_attention_heads,
            kv_heads=num_attention_heads,
            dim_head=attention_head_dim,
            bias=attention_bias,
            cross_attention_dim=None,
            out_bias=attention_out_bias,
            qk_norm=qk_norm,
            rope_type=rope_type,
        )
        self.audio_norm1 = RMSNorm(audio_dim, eps=eps, elementwise_affine=elementwise_affine)
        self.audio_attn1 = LTX2Attention(
            query_dim=audio_dim,
            heads=audio_num_attention_heads,
            kv_heads=audio_num_attention_heads,
            dim_head=audio_attention_head_dim,
            bias=attention_bias,
            cross_attention_dim=None,
            out_bias=attention_out_bias,
            qk_norm=qk_norm,
            rope_type=rope_type,
        )

        # 2. Text cross-attention (video and audio streams)
        self.norm2 = RMSNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
        self.attn2 = LTX2Attention(
            query_dim=dim,
            heads=num_attention_heads,
            kv_heads=num_attention_heads,
            dim_head=attention_head_dim,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim,
            out_bias=attention_out_bias,
            qk_norm=qk_norm,
            rope_type=rope_type,
        )
        self.audio_norm2 = RMSNorm(audio_dim, eps=eps, elementwise_affine=elementwise_affine)
        self.audio_attn2 = LTX2Attention(
            query_dim=audio_dim,
            heads=audio_num_attention_heads,
            kv_heads=audio_num_attention_heads,
            dim_head=audio_attention_head_dim,
            bias=attention_bias,
            cross_attention_dim=audio_cross_attention_dim,
            out_bias=attention_out_bias,
            qk_norm=qk_norm,
            rope_type=rope_type,
        )

        # 3. Audio-to-video (a2v) and video-to-audio (v2a) cross-attention
        self.audio_to_video_norm = RMSNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
        self.audio_to_video_attn = LTX2Attention(
            query_dim=dim,
            heads=num_attention_heads,
            kv_heads=num_attention_heads,
            dim_head=attention_head_dim,
            bias=attention_bias,
            cross_attention_dim=audio_dim,
            out_bias=attention_out_bias,
            qk_norm=qk_norm,
            rope_type=rope_type,
        )
        self.video_to_audio_norm = RMSNorm(audio_dim, eps=eps, elementwise_affine=elementwise_affine)
        self.video_to_audio_attn = LTX2Attention(
            query_dim=audio_dim,
            heads=audio_num_attention_heads,
            kv_heads=audio_num_attention_heads,
            dim_head=audio_attention_head_dim,
            bias=attention_bias,
            cross_attention_dim=dim,
            out_bias=attention_out_bias,
            qk_norm=qk_norm,
            rope_type=rope_type,
        )

        # 4. Feed-forward (video and audio streams)
        self.norm3 = RMSNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
        self.ff = FeedForward(dim, activation_fn=activation_fn)
        self.audio_norm3 = RMSNorm(audio_dim, eps=eps, elementwise_affine=elementwise_affine)
        self.audio_ff = FeedForward(audio_dim, activation_fn=activation_fn)

        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
        self.audio_scale_shift_table = nn.Parameter(torch.randn(6, audio_dim) / audio_dim**0.5)
        # Four scale/shift rows (a2v scale/shift, v2a scale/shift) plus one gate row.
        self.video_a2v_cross_attn_scale_shift_table = nn.Parameter(torch.randn(5, dim) / dim**0.5)
        self.audio_a2v_cross_attn_scale_shift_table = nn.Parameter(torch.randn(5, audio_dim) / audio_dim**0.5)

    def forward(
        self,
        hidden_states: torch.Tensor,
        audio_hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        audio_encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        temb_audio: torch.Tensor,
        temb_ca_scale_shift: torch.Tensor,
        temb_ca_audio_scale_shift: torch.Tensor,
        temb_ca_gate: torch.Tensor,
        temb_ca_audio_gate: torch.Tensor,
        video_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        audio_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        ca_video_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        ca_audio_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        audio_encoder_attention_mask: torch.Tensor | None = None,
        a2v_cross_attention_mask: torch.Tensor | None = None,
        v2a_cross_attention_mask: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = hidden_states.size(0)

        # 1. Self-attention with adaLN modulation
        norm_hidden_states = self.norm1(hidden_states)
        num_ada_params = self.scale_shift_table.shape[0]
        ada_values = self.scale_shift_table[None, None] + temb.reshape(batch_size, temb.size(1), num_ada_params, -1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
        norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa

        attn_hidden_states = self.attn1(
            norm_hidden_states, encoder_hidden_states=None, query_rotary_emb=video_rotary_emb
        )
        hidden_states = hidden_states + attn_hidden_states * gate_msa

        norm_audio_hidden_states = self.audio_norm1(audio_hidden_states)
        num_audio_ada_params = self.audio_scale_shift_table.shape[0]
        audio_ada_values = self.audio_scale_shift_table[None, None] + temb_audio.reshape(
            batch_size, temb_audio.size(1), num_audio_ada_params, -1
        )
        audio_shift_msa, audio_scale_msa, audio_gate_msa, audio_shift_mlp, audio_scale_mlp, audio_gate_mlp = (
            audio_ada_values.unbind(dim=2)
        )
        norm_audio_hidden_states = norm_audio_hidden_states * (1 + audio_scale_msa) + audio_shift_msa

        attn_audio_hidden_states = self.audio_attn1(
            norm_audio_hidden_states, encoder_hidden_states=None, query_rotary_emb=audio_rotary_emb
        )
        audio_hidden_states = audio_hidden_states + attn_audio_hidden_states * audio_gate_msa

        # 2. Text cross-attention (plain residual, no modulation)
        norm_hidden_states = self.norm2(hidden_states)
        attn_hidden_states = self.attn2(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            query_rotary_emb=None,
            attention_mask=encoder_attention_mask,
        )
        hidden_states = hidden_states + attn_hidden_states

        norm_audio_hidden_states = self.audio_norm2(audio_hidden_states)
        attn_audio_hidden_states = self.audio_attn2(
            norm_audio_hidden_states,
            encoder_hidden_states=audio_encoder_hidden_states,
            query_rotary_emb=None,
            attention_mask=audio_encoder_attention_mask,
        )
        audio_hidden_states = audio_hidden_states + attn_audio_hidden_states

        # 3. Audio-to-video (a2v) and video-to-audio (v2a) cross-attention. Each
        # stream combines a per-layer scale/shift/gate table with the
        # per-timestep embeddings computed at the model level.
        norm_hidden_states = self.audio_to_video_norm(hidden_states)
        norm_audio_hidden_states = self.video_to_audio_norm(audio_hidden_states)

        video_per_layer_ca_scale_shift = self.video_a2v_cross_attn_scale_shift_table[:4, None, :]
        video_per_layer_ca_gate = self.video_a2v_cross_attn_scale_shift_table[4:, None, :]
        video_ca_scale_shift_table = video_per_layer_ca_scale_shift[None] + temb_ca_scale_shift.reshape(
            batch_size, temb_ca_scale_shift.size(1), 4, -1
        ).movedim(2, 1)
        video_ca_gate = video_per_layer_ca_gate[None] + temb_ca_gate.reshape(
            batch_size, temb_ca_gate.size(1), 1, -1
        ).movedim(2, 1)
        video_a2v_ca_scale, video_a2v_ca_shift, video_v2a_ca_scale, video_v2a_ca_shift = (
            video_ca_scale_shift_table.unbind(dim=1)
        )
        a2v_gate = video_ca_gate.squeeze(1)

        audio_per_layer_ca_scale_shift = self.audio_a2v_cross_attn_scale_shift_table[:4, None, :]
        audio_per_layer_ca_gate = self.audio_a2v_cross_attn_scale_shift_table[4:, None, :]
        audio_ca_scale_shift_table = audio_per_layer_ca_scale_shift[None] + temb_ca_audio_scale_shift.reshape(
            batch_size, temb_ca_audio_scale_shift.size(1), 4, -1
        ).movedim(2, 1)
        audio_ca_gate = audio_per_layer_ca_gate[None] + temb_ca_audio_gate.reshape(
            batch_size, temb_ca_audio_gate.size(1), 1, -1
        ).movedim(2, 1)
        audio_a2v_ca_scale, audio_a2v_ca_shift, audio_v2a_ca_scale, audio_v2a_ca_shift = (
            audio_ca_scale_shift_table.unbind(dim=1)
        )
        v2a_gate = audio_ca_gate.squeeze(1)

        mod_norm_hidden_states = norm_hidden_states * (1 + video_a2v_ca_scale) + video_a2v_ca_shift
        mod_norm_audio_hidden_states = norm_audio_hidden_states * (1 + audio_a2v_ca_scale) + audio_a2v_ca_shift
        a2v_attn_hidden_states = self.audio_to_video_attn(
            mod_norm_hidden_states,
            encoder_hidden_states=mod_norm_audio_hidden_states,
            query_rotary_emb=ca_video_rotary_emb,
            key_rotary_emb=ca_audio_rotary_emb,
            attention_mask=a2v_cross_attention_mask,
        )
        hidden_states = hidden_states + a2v_gate * a2v_attn_hidden_states

        mod_norm_hidden_states = norm_hidden_states * (1 + video_v2a_ca_scale) + video_v2a_ca_shift
        mod_norm_audio_hidden_states = norm_audio_hidden_states * (1 + audio_v2a_ca_scale) + audio_v2a_ca_shift
        v2a_attn_hidden_states = self.video_to_audio_attn(
            mod_norm_audio_hidden_states,
            encoder_hidden_states=mod_norm_hidden_states,
            query_rotary_emb=ca_audio_rotary_emb,
            key_rotary_emb=ca_video_rotary_emb,
            attention_mask=v2a_cross_attention_mask,
        )
        audio_hidden_states = audio_hidden_states + v2a_gate * v2a_attn_hidden_states

        # 4. Feed-forward
        norm_hidden_states = self.norm3(hidden_states) * (1 + scale_mlp) + shift_mlp
        ff_output = self.ff(norm_hidden_states)
        hidden_states = hidden_states + ff_output * gate_mlp

        norm_audio_hidden_states = self.audio_norm3(audio_hidden_states) * (1 + audio_scale_mlp) + audio_shift_mlp
        audio_ff_output = self.audio_ff(norm_audio_hidden_states)
        audio_hidden_states = audio_hidden_states + audio_ff_output * audio_gate_mlp

        return hidden_states, audio_hidden_states


class LTX2AudioVideoRotaryPosEmbed(nn.Module):
    r"""
    Video and audio rotary positional embeddings (RoPE) for the LTX-2.0 model.

    Args:
        causal_offset (`int`, *optional*, defaults to `1`):
            Offset in the temporal axis for causal VAE modeling. This is typically 1 (for causal modeling where the VAE
            treats the very first frame differently), but could also be 0 (for non-causal modeling).
    """

    def __init__(
        self,
        dim: int,
        patch_size: int = 1,
        patch_size_t: int = 1,
        base_num_frames: int = 20,
        base_height: int = 2048,
        base_width: int = 2048,
        sampling_rate: int = 16000,
        hop_length: int = 160,
        scale_factors: tuple[int, ...] = (8, 32, 32),
        theta: float = 10000.0,
        causal_offset: int = 1,
        modality: str = "video",
        double_precision: bool = True,
        rope_type: str = "split",
    ) -> None:
        super().__init__()
        if rope_type not in ("interleaved", "split"):
            raise ValueError(f"{rope_type=} not supported. Choose between 'interleaved' and 'split'.")
        if modality not in ("video", "audio"):
            raise ValueError(f"Modality {modality} is not supported. Supported modalities are `video` and `audio`.")

        self.dim = dim
        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.base_num_frames = base_num_frames
        self.base_height = base_height
        self.base_width = base_width
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.audio_latents_per_second = float(sampling_rate) / float(hop_length) / float(scale_factors[0])
        self.scale_factors = scale_factors
        self.theta = theta
        self.causal_offset = causal_offset
        self.modality = modality
        self.double_precision = double_precision
        self.rope_type = rope_type

    def prepare_video_coords(
        self,
        batch_size: int,
        num_frames: int,
        height: int,
        width: int,
        device: torch.device,
        fps: float = 24.0,
    ) -> torch.Tensor:
        r"""
        Create per-dimension bounds [inclusive start, exclusive end) for each patch with respect to the original pixel
        space video grid (num_frames, height, width). This will ultimately have shape (batch_size, 3, num_patches, 2)
        where
            - axis 1 (size 3) enumerates (frame, height, width) dimensions (e.g. idx 0 corresponds to frames)
            - axis 3 (size 2) stores `[start, end)` indices within each dimension

        Args:
            batch_size (`int`):
                Batch size of the video latents.
            num_frames (`int`):
                Number of latent frames in the video latents.
            height (`int`):
                Latent height of the video latents.
            width (`int`):
                Latent width of the video latents.
            device (`torch.device`):
                Device on which to create the video grid.

        Returns:
            `torch.Tensor`:
                Per-dimension patch boundaries tensor of shape [batch_size, 3, num_patches, 2].
        r   startendstepr&   r   ij)indexingr   )r&   r   r   r1   r   r   N.min)r!   aranger6  float32r5  meshgridr"   tensorr&   r   r   r#   r:   repeatr<  r5   r>  clamp)rd   ri   rD  rE  rF  r   rG  grid_fgrid_hgrid_wgridr5  patch_size_delta
patch_endslatent_coordsscale_tensorbroadcast_shapepixel_coordsr-   r-   r.   prepare_video_coords  s$   #8$z1LTX2AudioVideoRotaryPosEmbed.prepare_video_coordsr   shiftc                 C   s   t j||| | jt j|d}| jd }|| }|| j | jdd}|| j | j }|| j | }	|	| j | jdd}	|	| j | j }
t j	||
gdd}|
d|dd}|
d}|S )a  
        Create per-dimension bounds [inclusive start, exclusive end) of start and end timestamps for each latent frame.
        This will ultimately have shape (batch_size, 3, num_patches, 2) where
            - axis 1 (size 1) represents the temporal dimension
            - axis 3 (size 2) stores `[start, end)` indices within each dimension

        Args:
            batch_size (`int`):
                Batch size of the audio latents.
            num_frames (`int`):
                Number of latent frames in the audio latents.
            device (`torch.device`):
                Device on which to create the audio grid.
            shift (`int`, *optional*, defaults to `0`):
                Offset on the latent indices. Different shift values correspond to different overlapping windows with
                respect to the same underlying latent grid.

        Returns:
            `torch.Tensor`:
                Per-dimension patch boundaries tensor of shape [batch_size, 1, num_patches, 2].
        rH  r   rO  r   r   r1   )r!   rQ  r6  rR  r<  r>  clipr;  r:  r"   r:   expand)rd   ri   rD  r   rb  rW  audio_scale_factorgrid_start_melgrid_start_sgrid_end_mel
grid_end_saudio_coordsr-   r-   r.   prepare_audio_coords  s   

z1LTX2AudioVideoRotaryPosEmbed.prepare_audio_coordsc                 O   s8   | j dkr| j|i |S | j dkr| j|i |S d S )Nr4  rA  )r?  ra  rk  )rd   argsr   r-   r-   r.   prepare_coords
  s
   

z+LTX2AudioVideoRotaryPosEmbed.prepare_coordscoordsc              
      s  |p j } jd } jdkr# jddd\}}|| d   d | jdkr1| j| j| jfn	| jdkr:| jft	j
 fd	d
t|D dd|}|d }| jrXt	jnt	j}t	| jt	jdd| j| ||d}	|	t	j d jt	jd}
|dd d |
 }
|
ddd}
| jdkr|
 jddd}|
 jddd}| j| dkrt	|d d d d d | j| f }t	|d d d d d | j| f }t	j||gdd}t	j||gdd}||fS | jdkrd| jd }|
jd }|| }|
 }|
 }|dkr:t	|d d d d d |f }t	|d d d d d |f }t	j||gdd}t	j||gdd}|jd }|jd }| ||| j!d}| ||| j!d}t	"|dd}t	"|dd}||fS )Nr1   r0   r   r   r   g       @r4  rA  c                    s$   g | ]} d d |f |  qS Nr-   )r   irn  max_positionsr-   r.   r   %  s   $ z8LTX2AudioVideoRotaryPosEmbed.forward.<locals>.<listcomp>r   g      ?)rI  rJ  stepsr&   r   r4   r3   r   r   r   )axis)#r   r6   r5   chunkr   r?  r7  r8  r9  r!   r"   ranger%   r@  float64rR  powr=  linspacer   pir:   	transposer#   r   r'   repeat_interleaver(   	ones_like
zeros_likecatconcatenater7   r   r8   )rd   rn  r   num_pos_dimscoords_start
coords_endrZ  num_rope_elemsfreqs_dtypepow_indicesr   	cos_freqs	sin_freqscos_paddingsin_paddingexpected_freqscurrent_freqspad_sizecos_freqsin_freqr>   r@   r-   rq  r.   ro     s^   





(
&&


  

z$LTX2AudioVideoRotaryPosEmbed.forward)r1   r1   r-  r.  r.  r/  r0  r1  r3  r1   r4  Tr   r2  )rC  )r   ro  )rO   rP   rQ   rR   rp   ru   r$   rt   rq   r]   r!   r   rr   ra  rk  rm  ro   rv   r-   r-   re   r.   r,  S  s    	

8
K
6r,  c                L       s4  e Zd ZdZdZdgZdgZeddddeddddedd	ddd
eddddedddddeddddZ	e
																																				d\d ed!ed"B d#ed$ed%ed&ed'ed(eeeef d)ed*ed+ed,ed-ed"B d.ed/ed0ed1ed2ed3ed4ed5ed6ed7ed8ed9ed:ed;ed<ed=ed>ed?ed@edAedBedCedDedEd"fJ fdFdGZedH	"	"	"	"	"	"	I	"	"	"	"	d]dJejdKejdLejdMejdNejdOejd"B dPejd"B dQejd"B dRed"B dSed"B dTed"B dUedVed"B dWejd"B dXejd"B dHeeef d"B dYedEejf$dZd[Z  ZS )^LTX2VideoTransformer3DModela  
    A Transformer model for video-like data used in [LTX](https://huggingface.co/Lightricks/LTX-Video).

    Args:
        in_channels (`int`, defaults to `128`):
            The number of channels in the input.
        out_channels (`int`, defaults to `128`):
            The number of channels in the output.
        patch_size (`int`, defaults to `1`):
            The size of the spatial patches to use in the patch embedding layer.
        patch_size_t (`int`, defaults to `1`):
            The size of the tmeporal patches to use in the patch embedding layer.
        num_attention_heads (`int`, defaults to `32`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        cross_attention_dim (`int`, defaults to `2048 `):
            The number of channels for cross attention heads.
        num_layers (`int`, defaults to `28`):
            The number of layers of Transformer blocks to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to use in feed-forward.
        qk_norm (`str`, defaults to `"rms_norm_across_heads"`):
            The normalization layer to use.
    Tnormr   r1   r   F)	split_dimexpected_dimssplit_outputr   )r{   r|   r   )r   r1   )
gather_dimr  ) ropeproj_out   r2     r1  r-  r.  r   r0   r/  r0  0   r   r   r      r3    r   in_channelsout_channelsNr5  r6  r   r   r   vae_scale_factorspos_embed_max_posr8  r9  audio_in_channelsaudio_out_channelsaudio_patch_sizeaudio_patch_size_tr   r   r   re  audio_pos_embed_max_posaudio_sampling_rateaudio_hop_length
num_layersr   r   r   r   caption_channelsr   r   
rope_thetarope_double_precisionr>  timestep_scale_multiplier$cross_attn_timestep_scale_multiplierr   r   c%           &         s  t    |p|}|p|} 	 t|	| _t|| _t|	d| _t|d| _t		ddd| _
t	ddd| _t		ddd| _t	ddd| _t		ddd| _t	ddd| _ttd		d  | _ttdd  | _t	|||	|
||||!d	| d
| _t||||||g||!d| d| _t|	|}%t|||%|
|||!d	| d| _t|||%||||!d| d| _t 	
fddt|D | _tj	ddd| _t	|| _ tjddd| _!t|| _"d| _#d S )N)in_featureshidden_sizerU   F)rW   rX   r0   r1   r   r   r4  )r   r5  r6  r7  r8  r9  r<  r=  r>  r?  r@  r   r   rA  )r   r5  r6  r7  r:  r;  r<  r=  r>  r?  r@  r   r   )r   r5  r6  r7  r8  r9  r=  r>  r?  r@  r   r   )r   r5  r6  r7  r:  r;  r=  r>  r?  r@  r   r   c                    s2   g | ]}t 	 
d qS ))r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   )r   rA   r   r   r   r   r   r   audio_inner_dimr   r   r   r   r   r   r   r   r-   r.   r   (  s(    z8LTX2VideoTransformer3DModel.__init__.<locals>.<listcomp>r   r   )$r\   r]   r_   rb   proj_inaudio_proj_inr   caption_projectionaudio_caption_projectionrT   
time_embedaudio_time_embedav_cross_attn_video_scale_shiftav_cross_attn_audio_scale_shiftav_cross_attn_video_a2v_gateav_cross_attn_audio_v2a_gater   r!   r   r   r   r,  r  
audio_ropemaxcross_attn_ropecross_attn_audio_roper   rv  transformer_blocks	LayerNormnorm_outr  audio_norm_outaudio_proj_outgradient_checkpointing)&rd   r  r  r5  r6  r   r   r   r  r  r8  r9  r  r  r  r  r   r   r   re  r  r  r  r  r   r   r   r   r  r   r   r  r  r>  r  r  r   cross_attn_pos_embed_max_posre   r  r.   r]     s   
(	
&
z$LTX2VideoTransformer3DModel.__init__attention_kwargsrC  r{   r   r|   r   rg   audio_timestepr   r   rD  rE  rF  rG  audio_num_framesvideo_coordsrj  return_dictc           *      C   s  |dur|n|}|dur |j dkr d||j d }|d}|dur8|j dkr8d||j d }|d}|d}|du rN| jj||	|
||j|d}|du r[| j	|||j}| j||jd}| j||jd}| j
|ddddddf |jd}| j|ddddddf |jd}| |}| |}| jj| jj }| j| ||jd\}}||d	|d	}||d	|d	}| j| ||jd\}}||d	|d	}||d	|d	}| j| ||jd\}}| j| | ||jd\}}||d	|jd	 }||d	|jd	 }| j| ||jd\}}| j| | ||jd\} }||d	|jd	 }| |d	| jd	 } | |}||d	|d	}| |}||d	|d	}| jD ]\}!t r| jr|  |!|||||||||| ||||||\}}qf|!di d
|d|d|d|d|d|d|d|d|d| d|d|d|d|d|d|\}}qf| j!d |dddddf  }"|"dddddf |"dddddf }#}$| "|}|d|$  |# }| #|}%| j$d |dddddf  }&|&dddddf |&dddddf }'}(| %|}|d|(  |' }| &|})|s<|%|)fS t'|%|)dS )a  
        Forward pass for LTX-2.0 audiovisual video transformer.

        Args:
            hidden_states (`torch.Tensor`):
                Input patchified video latents of shape `(batch_size, num_video_tokens, in_channels)`.
            audio_hidden_states (`torch.Tensor`):
                Input patchified audio latents of shape `(batch_size, num_audio_tokens, audio_in_channels)`.
            encoder_hidden_states (`torch.Tensor`):
                Input video text embeddings of shape `(batch_size, text_seq_len, self.config.caption_channels)`.
            audio_encoder_hidden_states (`torch.Tensor`):
                Input audio text embeddings of shape `(batch_size, text_seq_len, self.config.caption_channels)`.
            timestep (`torch.Tensor`):
                Input timestep of shape `(batch_size, num_video_tokens)`. These should already be scaled by
                `self.config.timestep_scale_multiplier`.
            audio_timestep (`torch.Tensor`, *optional*):
                Input timestep of shape `(batch_size,)` or `(batch_size, num_audio_tokens)` for audio modulation
                params. This is only used by certain pipelines such as the I2V pipeline.
            encoder_attention_mask (`torch.Tensor`, *optional*):
                Optional multiplicative text attention mask of shape `(batch_size, text_seq_len)`.
            audio_encoder_attention_mask (`torch.Tensor`, *optional*):
                Optional multiplicative text attention mask of shape `(batch_size, text_seq_len)` for audio modeling.
            num_frames (`int`, *optional*):
                The number of latent video frames. Used if calculating the video coordinates for RoPE.
            height (`int`, *optional*):
                The latent video height. Used if calculating the video coordinates for RoPE.
            width (`int`, *optional*):
                The latent video width. Used if calculating the video coordinates for RoPE.
            fps (`float`, *optional*, defaults to `24.0`):
                The desired frames per second of the generated video. Used if calculating the video coordinates for
                RoPE.
            audio_num_frames (`int`, *optional*):
                The number of latent audio frames. Used if calculating the audio coordinates for RoPE.
            video_coords (`torch.Tensor`, *optional*):
                The video coordinates to be used when calculating the rotary positional embeddings (RoPE) of shape
                `(batch_size, 3, num_video_tokens, 2)`. If not supplied, this will be calculated inside `forward`.
            audio_coords (`torch.Tensor`, *optional*):
                The audio coordinates to be used when calculating the rotary positional embeddings (RoPE) of shape
                `(batch_size, 1, num_audio_tokens, 2)`. If not supplied, this will be calculated inside `forward`.
            attention_kwargs (`dict[str, Any]`, *optional*):
                Optional dict of keyword args to be passed to the attention processor.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a dict-like structured output of type `AudioVisualModelOutput` or a tuple.

        Returns:
            `AudioVisualModelOutput` or `tuple`:
                If `return_dict` is `True`, returns a structured output of type `AudioVisualModelOutput`, otherwise a
                `tuple` is returned where the first element is the denoised video latent patch sequence and the second
                element is the denoised audio latent patch sequence.
        """
        audio_timestep = audio_timestep if audio_timestep is not None else timestep

        # Convert 2D multiplicative text masks into additive attention biases.
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
        if audio_encoder_attention_mask is not None and audio_encoder_attention_mask.ndim == 2:
            audio_encoder_attention_mask = (1 - audio_encoder_attention_mask.to(audio_hidden_states.dtype)) * -10000.0
            audio_encoder_attention_mask = audio_encoder_attention_mask.unsqueeze(1)

        batch_size = hidden_states.size(0)
        if video_coords is None:
            video_coords = self.rope.prepare_coords(
                batch_size, num_frames, height, width, device=hidden_states.device, fps=fps
            )
        if audio_coords is None:
            audio_coords = self.audio_rope.prepare_coords(
                batch_size, audio_num_frames, device=audio_hidden_states.device
            )

        video_rotary_emb = self.rope(video_coords, device=hidden_states.device)
        audio_rotary_emb = self.audio_rope(audio_coords, device=audio_hidden_states.device)
        # The cross-modal RoPE uses only the (shared) temporal axis.
        video_cross_attn_rotary_emb = self.cross_attn_rope(video_coords[:, 0:1, :, :], device=hidden_states.device)
        audio_cross_attn_rotary_emb = self.cross_attn_audio_rope(
            audio_coords[:, 0:1, :, :], device=audio_hidden_states.device
        )

        hidden_states = self.proj_in(hidden_states)
        audio_hidden_states = self.audio_proj_in(audio_hidden_states)

        timestep_cross_attn_gate_scale_factor = (
            self.config.cross_attn_timestep_scale_multiplier / self.config.timestep_scale_multiplier
        )

        temb, embedded_timestep = self.time_embed(
            timestep.flatten(), batch_size=batch_size, hidden_dtype=hidden_states.dtype
        )
        temb = temb.view(batch_size, -1, temb.size(-1))
        embedded_timestep = embedded_timestep.view(batch_size, -1, embedded_timestep.size(-1))

        temb_audio, audio_embedded_timestep = self.audio_time_embed(
            audio_timestep.flatten(), batch_size=batch_size, hidden_dtype=audio_hidden_states.dtype
        )
        temb_audio = temb_audio.view(batch_size, -1, temb_audio.size(-1))
        audio_embedded_timestep = audio_embedded_timestep.view(batch_size, -1, audio_embedded_timestep.size(-1))

        # Scale/shift and gate embeddings for the a2v/v2a cross-attention layers.
        video_cross_attn_scale_shift, _ = self.av_cross_attn_video_scale_shift(
            timestep.flatten(), batch_size=batch_size, hidden_dtype=hidden_states.dtype
        )
        video_cross_attn_a2v_gate, _ = self.av_cross_attn_video_a2v_gate(
            (timestep * timestep_cross_attn_gate_scale_factor).flatten(),
            batch_size=batch_size,
            hidden_dtype=hidden_states.dtype,
        )
        video_cross_attn_scale_shift = video_cross_attn_scale_shift.view(
            batch_size, -1, video_cross_attn_scale_shift.shape[-1]
        )
        video_cross_attn_a2v_gate = video_cross_attn_a2v_gate.view(batch_size, -1, video_cross_attn_a2v_gate.shape[-1])

        audio_cross_attn_scale_shift, _ = self.av_cross_attn_audio_scale_shift(
            audio_timestep.flatten(), batch_size=batch_size, hidden_dtype=audio_hidden_states.dtype
        )
        audio_cross_attn_v2a_gate, _ = self.av_cross_attn_audio_v2a_gate(
            (audio_timestep * timestep_cross_attn_gate_scale_factor).flatten(),
            batch_size=batch_size,
            hidden_dtype=audio_hidden_states.dtype,
        )
        audio_cross_attn_scale_shift = audio_cross_attn_scale_shift.view(
            batch_size, -1, audio_cross_attn_scale_shift.shape[-1]
        )
        audio_cross_attn_v2a_gate = audio_cross_attn_v2a_gate.view(batch_size, -1, audio_cross_attn_v2a_gate.shape[-1])

        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.size(-1))
        audio_encoder_hidden_states = self.audio_caption_projection(audio_encoder_hidden_states)
        audio_encoder_hidden_states = audio_encoder_hidden_states.view(batch_size, -1, audio_hidden_states.size(-1))

        for block in self.transformer_blocks:
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                hidden_states, audio_hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    audio_hidden_states,
                    encoder_hidden_states,
                    audio_encoder_hidden_states,
                    temb,
                    temb_audio,
                    video_cross_attn_scale_shift,
                    audio_cross_attn_scale_shift,
                    video_cross_attn_a2v_gate,
                    audio_cross_attn_v2a_gate,
                    video_rotary_emb,
                    audio_rotary_emb,
                    video_cross_attn_rotary_emb,
                    audio_cross_attn_rotary_emb,
                    encoder_attention_mask,
                    audio_encoder_attention_mask,
                )
            else:
                hidden_states, audio_hidden_states = block(
                    hidden_states=hidden_states,
                    audio_hidden_states=audio_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    audio_encoder_hidden_states=audio_encoder_hidden_states,
                    temb=temb,
                    temb_audio=temb_audio,
                    temb_ca_scale_shift=video_cross_attn_scale_shift,
                    temb_ca_audio_scale_shift=audio_cross_attn_scale_shift,
                    temb_ca_gate=video_cross_attn_a2v_gate,
                    temb_ca_audio_gate=audio_cross_attn_v2a_gate,
                    video_rotary_emb=video_rotary_emb,
                    audio_rotary_emb=audio_rotary_emb,
                    ca_video_rotary_emb=video_cross_attn_rotary_emb,
                    ca_audio_rotary_emb=audio_cross_attn_rotary_emb,
                    encoder_attention_mask=encoder_attention_mask,
                    audio_encoder_attention_mask=audio_encoder_attention_mask,
                )

        scale_shift_values = self.scale_shift_table[None, None] + embedded_timestep[:, :, None]
        shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
        hidden_states = self.norm_out(hidden_states)
        hidden_states = hidden_states * (1 + scale) + shift
        output = self.proj_out(hidden_states)

        audio_scale_shift_values = self.audio_scale_shift_table[None, None] + audio_embedded_timestep[:, :, None]
        audio_shift, audio_scale = audio_scale_shift_values[:, :, 0], audio_scale_shift_values[:, :, 1]
        audio_hidden_states = self.audio_norm_out(audio_hidden_states)
        audio_hidden_states = audio_hidden_states * (1 + audio_scale) + audio_shift
        audio_output = self.audio_proj_out(audio_hidden_states)

        if not return_dict:
            return (output, audio_output)
        return AudioVisualModelOutput(sample=output, audio_sample=audio_output)
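# --- Usage sketch -------------------------------------------------------------
# Illustrative only: a tiny randomly initialized configuration (the sizes below
# are assumptions chosen for a quick smoke test, not the released checkpoints).
#
#     model = LTX2VideoTransformer3DModel(
#         in_channels=8, out_channels=8, num_attention_heads=2, attention_head_dim=8,
#         cross_attention_dim=32, audio_in_channels=4, audio_out_channels=4,
#         audio_num_attention_heads=2, audio_attention_head_dim=8,
#         audio_cross_attention_dim=32, caption_channels=32, num_layers=1,
#     )
#     video = torch.randn(1, 2 * 4 * 4, 8)   # (batch, patched video tokens, channels)
#     audio = torch.randn(1, 8, 4)           # (batch, audio tokens, channels)
#     text = torch.randn(1, 16, 32)
#     t = torch.tensor([500.0])
#     out = model(
#         hidden_states=video, audio_hidden_states=audio, encoder_hidden_states=text,
#         audio_encoder_hidden_states=text, timestep=t, num_frames=2, height=4,
#         width=4, audio_num_frames=8,
#     )
#     out.sample.shape, out.audio_sample.shape   # (1, 32, 8), (1, 8, 4)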