o
    Gi}V                  
   @   sr  d dl Z d dlZd dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' e(e)Z*G dd dZ+G dd dZ,G dd dejj-eZ.G dd dej-Z/eG dd dej-Z0eG dd de$eeeeeZ1dd Z2dS )    N)Any   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)apply_lora_scale	deprecateis_torch_versionlogging)maybe_allow_in_graph   )ContextParallelInputContextParallelOutput)AttentionMixinAttentionModuleMixinFeedForward)dispatch_attention_fn)
CacheMixin)PixArtAlphaTextProjection)Transformer2DModelOutput)
ModelMixin)AdaLayerNormSingleRMSNormc                   @   s   e Zd Zdd ZdS )LTXVideoAttentionProcessor2_0c                 O   s   d}t dd| t|i |S )Nz~`LTXVideoAttentionProcessor2_0` is deprecated and this will be removed in a future version. Please use `LTXVideoAttnProcessor`r   z1.0.0)r	   LTXVideoAttnProcessor)clsargskwargsdeprecation_message r    a/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_ltx.py__new__)   s   z%LTXVideoAttentionProcessor2_0.__new__N)__name__
__module____qualname__r"   r    r    r    r!   r   (   s    r   c                   @   s`   e Zd ZdZdZdZdd Z			ddddejdejdB d	ejdB d
ejdB dejfddZ	dS )r   z
    Processor for implementing attention (SDPA is used by default if you're using PyTorch 2.0). This is used in the LTX
    model. It applies a normalization layer and rotary embedding on the query and key vector.
    Nc                 C   s   t ddr	tdd S )N<z2.0zlLTX attention processors require a minimum PyTorch version of 2.0. Please upgrade your PyTorch installation.)r
   
ValueError)selfr    r    r!   __init__9   s
   
zLTXVideoAttnProcessor.__init__attnLTXAttentionhidden_statesencoder_hidden_statesattention_maskimage_rotary_embreturnc              
   C   s,  |d u r|j n|j \}}}|d ur$||||}|||jd|j d }|d u r*|}||}	||}
||}||	}	||
}
|d urQt	|	|}	t	|
|}
|	
d|jdf}	|

d|jdf}
|
d|jdf}t|	|
||dd| j| jd}|dd}||	j}|jd |}|jd |}|S )	Nr           F)	attn_mask	dropout_p	is_causalbackendparallel_configr   r      )shapeprepare_attention_maskviewheadsto_qto_kto_vnorm_qnorm_kapply_rotary_emb	unflattenr   _attention_backend_parallel_configflattentodtypeto_out)r(   r*   r,   r-   r.   r/   
batch_sizesequence_length_querykeyvaluer    r    r!   __call__?   sB   	







zLTXVideoAttnProcessor.__call__NNN)
r#   r$   r%   __doc__rD   rE   r)   torchTensorrP   r    r    r    r!   r   0   s*    
r   c                       s   e Zd ZeZegZ									ddeded	ed
ededededB dede	f fddZ
			ddejdejdB dejdB dejdB dejf
ddZ  ZS )r+      @   r2   TNrms_norm_across_heads	query_dimr<   kv_headsdim_headdropoutbiascross_attention_dimout_biasqk_normc                    sN  t    |	dkrtd|| _|| | _|d u r| jn|| | _|| _|d ur*|n|| _|| _|| _	|| _
|| _d}d}tjj|| ||d| _tjj|| ||d| _tjj|| j|d| _tjj| j| j|d| _tjj| j| j|d| _tjg | _| jtjj| j| j
|d | jtj| |
d u r|  }
| |
 d S )NrW   zIOnly 'rms_norm_across_heads' is supported as a valid value for `qk_norm`.gh㈵>Tepselementwise_affine)r\   )superr)   NotImplementedErrorhead_dim	inner_diminner_kv_dimrX   r]   use_biasr[   out_dimr<   rS   nnr   r@   rA   Linearr=   r>   r?   
ModuleListrI   appendDropout_default_processor_clsset_processor)r(   rX   r<   rY   rZ   r[   r\   r]   r^   r_   	processornorm_epsnorm_elementwise_affine	__class__r    r!   r)   w   s2   

zLTXAttention.__init__r,   r-   r.   r/   r0   c                    s   t t| jjj   fdd| D }t|dkr,t	
d| d| jjj d  fdd| D }| j| ||||fi |S )	Nc                    s   g | ]
\}}| vr|qS r    r    ).0krL   attn_parametersr    r!   
<listcomp>   s    z(LTXAttention.forward.<locals>.<listcomp>r   zattention_kwargs z are not expected by z and will be ignored.c                    s   i | ]\}}| v r||qS r    r    )rv   rw   wrx   r    r!   
<dictcomp>   s    z(LTXAttention.forward.<locals>.<dictcomp>)setinspect	signaturerq   rP   
parameterskeysitemslenloggerwarningru   r#   )r(   r,   r-   r.   r/   r   unused_kwargsr    rx   r!   forward   s   zLTXAttention.forward)	rU   rU   rV   r2   TNTrW   NrQ   )r#   r$   r%   r   ro   _available_processorsintfloatboolstrr)   rS   rT   r   __classcell__r    r    rt   r!   r+   s   sZ    	
-r+   c                       s   e Zd Z						ddedededed	ed
ededdf fddZdededededeejeef dej	dejfddZ
					ddejdedB dedB dedB deejeef dB dejdB deejejf fddZ  ZS )LTXVideoRotaryPosEmbed      r8        @dimbase_num_framesbase_height
base_width
patch_sizepatch_size_tthetar0   Nc                    s8   t    || _|| _|| _|| _|| _|| _|| _d S )N)	rc   r)   r   r   r   r   r   r   r   )r(   r   r   r   r   r   r   r   rt   r    r!   r)      s   


zLTXVideoRotaryPosEmbed.__init__rJ   
num_framesheightwidthrope_interpolation_scaledevicec                 C   s6  t j|t j|d}t j|t j|d}t j|t j|d}	t j|	||dd}
t j|
dd}
|
d|dddd}
|d ur|
d d ddf |d  | j | j |
d d ddf< |
d d ddf |d  | j	 | j
 |
d d ddf< |
d d ddf |d  | j	 | j |
d d ddf< |
dd	dd}
|
S )
N)rH   r   ij)indexingr   r   r8   r   r      )rS   arangefloat32meshgridstack	unsqueezerepeatr   r   r   r   r   rF   	transpose)r(   rJ   r   r   r   r   r   grid_hgrid_wgrid_fgridr    r    r!   _prepare_video_coords   s   
888z,LTXVideoRotaryPosEmbed._prepare_video_coordsr,   video_coordsc                 C   s  | d}|d u r| j||||||jd}n%tj|d d df | j |d d df | j |d d df | j gdd}d}	| j}
| jtj	t
|	| jt
|
| j| jd |jtjd	 }|t
j d
 }||dd d  }|ddd}| jddd}| jddd}| jd dkrt|d d d d d | jd f }t|d d d d d | jd f }tj||gdd}tj||gdd}||fS )Nr   )r   r   r8   r   r1   r   g      ?   )r   rH   g       @)sizer   r   rS   r   r   r   r   r   linspacemathlogr   r   pir   r   rF   cosrepeat_interleavesin	ones_like
zeros_likecat)r(   r,   r   r   r   r   r   rJ   r   startendfreqs	cos_freqs	sin_freqscos_paddingsin_paddingr    r    r!   r      sJ   
			&&zLTXVideoRotaryPosEmbed.forward)r   r   r   r8   r8   r   )NNNNN)r#   r$   r%   r   r   r)   tuplerS   rT   r   r   r   r   r    r    rt   r!   r      sv    	
r   c                       s   e Zd ZdZ						ddeded	ed
ededededededef fddZ		dde	j
de	j
de	j
dee	j
e	j
f dB de	j
dB de	j
fddZ  ZS )LTXVideoTransformerBlocka  
    Transformer block used in [LTX](https://huggingface.co/Lightricks/LTX-Video).

    Args:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        qk_norm (`str`, defaults to `"rms_norm"`):
            The normalization layer to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to use in feed-forward.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
    rW   gelu-approximateTư>Fr   num_attention_headsattention_head_dimr]   r_   activation_fnattention_biasattention_out_biasra   rb   c              
      s   t    t||	|
d| _t|||||d ||d| _t||	|
d| _t||||||||d| _t||d| _	t
td||d  | _d S )Nr`   )rX   r<   rY   rZ   r\   r]   r^   r_   )rX   r]   r<   rY   rZ   r\   r^   r_   )r   r         ?)rc   r)   r   norm1r+   attn1norm2attn2r   ffrj   	ParameterrS   randnscale_shift_table)r(   r   r   r   r]   r_   r   r   r   ra   rb   rt   r    r!   r)   -  s2   
 z!LTXVideoTransformerBlock.__init__Nr,   r-   tembr/   encoder_attention_maskr0   c                 C   s   | d}| |}| jjd }| jd |j||| d|d }	|	jdd\}
}}}}}|d|  |
 }| j|d |d}|||  }| j	||d |d}|| }| 
|d|  | }| |}|||  }|S )	Nr   NNr8   r1   r   r   )r,   r-   r/   )r-   r/   r.   )r   r   r   r9   rG   r   reshapeunbindr   r   r   r   )r(   r,   r-   r   r/   r   rJ   norm_hidden_statesnum_ada_params
ada_values	shift_msa	scale_msagate_msa	shift_mlp	scale_mlpgate_mlpattn_hidden_states	ff_outputr    r    r!   r   X  s2   


z LTXVideoTransformerBlock.forward)rW   r   TTr   Fr   )r#   r$   r%   rR   r   r   r   r   r)   rS   rT   r   r   r   r    r    rt   r!   r     sX    	
0r   c                "       sx  e Zd ZdZdZdgZdgZeddddeddddedd	ddd
eddddedddddeddddZ	e
															d7dededededededededed ed!ed"ed#ed$ed%ed&d'f  fd(d)Zed*	'	'	'	'	'	'	d8d+ejd,ejd-ejd.ejd/ed'B d0ed'B d1ed'B d2eeeef ejB d'B d3ejd'B d*eeef d'B d4ed&ejfd5d6Z  ZS )9LTXVideoTransformer3DModela  
    A Transformer model for video-like data used in [LTX](https://huggingface.co/Lightricks/LTX-Video).

    Args:
        in_channels (`int`, defaults to `128`):
            The number of channels in the input.
        out_channels (`int`, defaults to `128`):
            The number of channels in the output.
        patch_size (`int`, defaults to `1`):
            The size of the spatial patches to use in the patch embedding layer.
        patch_size_t (`int`, defaults to `1`):
            The size of the tmeporal patches to use in the patch embedding layer.
        num_attention_heads (`int`, defaults to `32`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        cross_attention_dim (`int`, defaults to `2048 `):
            The number of channels for cross attention heads.
        num_layers (`int`, defaults to `28`):
            The number of layers of Transformer blocks to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to use in feed-forward.
        qk_norm (`str`, defaults to `"rms_norm_across_heads"`):
            The normalization layer to use.
    Tnormr   r8   r   F)	split_dimexpected_dimssplit_outputr   )r,   r-   r   )r   r8   )
gather_dimr   ) ropeproj_out       rV   r      r   rW   r      in_channelsout_channelsr   r   r   r   r]   
num_layersr   r_   rs   rr   caption_channelsr   r   r0   Nc                    s   t    |p|} t|| _ttdd  | _t	dd| _
t|d| _tddd||dd	| _t 	f
d
dt|D | _tjddd| _t|| _d| _d S )Nr   r   F)use_additional_conditions)in_featureshidden_sizer   r   r   )r   r   r   r   r   r   r   c                    s(   g | ]}t 	 d 
qS ))
r   r   r   r]   r_   r   r   r   ra   rb   )r   )rv   rL   
r   r   r   r   r]   rf   rs   rr   r   r_   r    r!   rz     s    z7LTXVideoTransformer3DModel.__init__.<locals>.<listcomp>r   r`   )rc   r)   rj   rk   proj_inr   rS   r   r   r   
time_embedr   caption_projectionr   r   rl   rangetransformer_blocks	LayerNormnorm_outr   gradient_checkpointing)r(   r   r   r   r   r   r   r]   r   r   r_   rs   rr   r   r   r   rt   r   r!   r)     s0   


z#LTXVideoTransformer3DModel.__init__attention_kwargsr,   r-   timestepr   r   r   r   r   r   return_dictc              	   C   s  |  ||||||	}|d ur"|jdkr"d||j d }|d}|d}| |}| j| ||jd\}}|	|d|d}|	|d|d}| 
|}|	|d|d}| jD ]}t rs| jrs| ||||||}q_||||||d}q_| jd |d d d d d f  }|d d d d df |d d d d df }}| |}|d|  | }| |}|s|fS t|d	S )
Nr   r8   g     r   )rJ   hidden_dtyper1   )r,   r-   r   r/   r   r   )sample)r   ndimrG   rH   r   r   r   r   rF   r;   r   r   rS   is_grad_enabledr  _gradient_checkpointing_funcr   r   r   r   )r(   r,   r-   r  r   r   r   r   r   r   r  r  r/   rJ   r   embedded_timestepblockscale_shift_valuesshiftscaleoutputr    r    r!   r     sP   





	 .


z"LTXVideoTransformer3DModel.forward)r   r   r8   r8   r   rV   r   r   r   rW   Fr   r   TT)NNNNNNT)r#   r$   r%   rR    _supports_gradient_checkpointing _skip_layerwise_casting_patterns_repeated_blocksr   r   _cp_planr   r   r   r   r   r)   r   rS   rT   
LongTensorr   dictr   r   r   r    r    rt   r!   r     s    
	
?	
r   c                 C   s\   |\}}|  ddd\}}tj| |gddd}|  | | |  | j}|S )Nr   )r1   r   r1   r   )rC   r   rS   r   rF   r   rG   rH   )xr   r   r   x_realx_imag	x_rotatedoutr    r    r!   rB   2  s
    rB   )3r~   r   typingr   rS   torch.nnrj   configuration_utilsr   r   loadersr   r   utilsr   r	   r
   r   utils.torch_utilsr   _modeling_parallelr   r   	attentionr   r   r   attention_dispatchr   cache_utilsr   
embeddingsr   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr#   r   r   r   Moduler+   r   r   r   rB   r    r    r    r!   <module>   s<   
C@ff
 2