o
    Gi[5                     @   s   d dl Z d dlmZ d dlm  mZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZmZ G dd	 d	ejZG d
d dejZG dd dejZG dd dee
eZdS )    N   )ConfigMixinregister_to_config)PeftAdapterMixin)FeedForward)
ModelMixin)LTX2AttentionLTX2AudioVideoAttnProcessorc                       sr   e Zd ZdZ					ddeded	ed
ededef fddZdededee	j
B dee	je	jf fddZ  ZS )LTX2RotaryPosEmbed1dzY
    1D rotary positional embeddings (RoPE) for the LTX 2.0 text encoder connectors.
            @Tinterleaved    dimbase_seq_lenthetadouble_precision	rope_typenum_attention_headsc                    sJ   t    |dvrtd|d|| _|| _|| _|| _|| _|| _d S )N)r   splitz
rope_type=z9 not supported. Choose between 'interleaved' and 'split'.)	super__init__
ValueErrorr   r   r   r   r   r   )selfr   r   r   r   r   r   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/ltx2/connectors.pyr      s   
	
zLTX2RotaryPosEmbed1d.__init__
batch_sizeposdevicereturnc              
   C   sJ  t j|t j|d}|| j }|d|d}d}| jrt jnt j}t | j	t j
dd| j| ||d}|t j d jt jd	}	|d
d d |	 }	| jdkr|	 jdd
d}
|	 jdd
d}| j| dkrt |
d d d d d | j| f }t |d d d d d | j| f }t j||
gd
d}
t j||gd
d}|
|fS | jdkr!| jd }|	jd
 }|| }|	 }|	 }|dkrt |d d d d d |f }t |d d d d d |f }t j||gd
d}t j||gd
d}|jd }|jd }|||| jd
}|||| jd
}t |dd}
t |dd}|
|fS )N)dtyper    r         g              ?)startendstepsr"   r           @)r"   r   r   r   )axis)torcharangefloat32r   	unsqueezerepeatr   float64powr   linspacer   pitor   cosrepeat_interleavesin	ones_like
zeros_likecatshapeconcatenatereshaper   swapaxes)r   r   r   r    grid_1dgridnum_rope_elemsfreqs_dtypepow_indicesfreqs	cos_freqs	sin_freqscos_paddingsin_paddingexpected_freqscurrent_freqspad_sizecos_freqsin_freqbtr   r   r   forward%   sL   

&&

  

zLTX2RotaryPosEmbed1d.forward)r   r   Tr   r   )__name__
__module____qualname____doc__intfloatboolstrr   r-   r    tupleTensorrR   __classcell__r   r   r   r   r
      s:    r
   c                       sn   e Zd Z			ddededededed	ef fd
dZ		ddejdejdB dejdB dejfddZ	  Z
S )LTX2TransformerBlock1dgelu-approximateư>r   r   r   attention_head_dimactivation_fnepsr   c                    s\   t    tjj||dd| _t||||t |d| _tjj||dd| _	t
||d| _d S )NFrc   elementwise_affine)	query_dimheadskv_headsdim_head	processorr   )rb   )r   r   r-   nnRMSNormnorm1r   r	   attn1norm2r   ff)r   r   r   ra   rb   rc   r   r   r   r   r   e   s   
		zLTX2TransformerBlock1d.__init__Nhidden_statesattention_mask
rotary_embr!   c                 C   sB   |  |}| j|||d}|| }| |}| |}|| }|S )N)rr   query_rotary_emb)rm   rn   ro   rp   )r   rq   rr   rs   norm_hidden_statesattn_hidden_statesff_hidden_statesr   r   r   rR   }   s   


zLTX2TransformerBlock1d.forward)r_   r`   r   )NN)rS   rT   rU   rW   rZ   rX   r   r-   r\   rR   r]   r   r   r   r   r^   d   s8    r^   c                       s   e Zd ZdZdZ											
ddededededB dedededededef fddZ			d de
jde
jdB dedee
je
jf fddZ  ZS )!LTX2ConnectorTransformer1dz
    A 1D sequence transformer for modalities such as text.

    In LTX 2.0, this is used to process the text encoder hidden states for each of the video and audio streams.
    T      r$   r   r   r`   Fr   r   ra   
num_layersnum_learnable_registersNrope_base_seq_len
rope_thetarope_double_precisionrc   causal_temporal_positioningr   c                    s   t    _  _|	_|_d _|d ur,t|jd d }tj	
|_tj|||d_tj	 fddt|D _tj	jj|dd_d_d S )Nr)   r%   )r   r   r   r   r   c                    s   g | ]}t j d qS ))r   r   ra   r   )r^   	inner_dim).0_ra   r   r   r   r   r   
<listcomp>   s    z7LTX2ConnectorTransformer1d.__init__.<locals>.<listcomp>Frd   )r   r   r   r   r   r|   learnable_registersr-   randrk   	Parameterr
   rope
ModuleListrangetransformer_blocksrl   norm_outgradient_checkpointing)r   r   ra   r{   r|   r}   r~   r   rc   r   r   init_registersr   r   r   r      s0   

	
z#LTX2ConnectorTransformer1d.__init__     rq   rr   attn_mask_binarize_thresholdr!   c                    sx  j \}}| jd ur| j dkr tdj d  d| j | j }t| j|df}||k   jdkrA dd  fddt	|D }dd |D }	fd	d|	D }
d
d t
||
D }tjdd |D dd}tj dgdd}|| d| |  t|}| j|jd}| jD ]}t r| jr| |||q|||dq| |fS )Nr   z$The `hidden_states` sequence length r#   z: should be divisible by the number of learnable registers    c                    s&   g | ]}| |   d d f qS )N)rY   )r   i)binary_attn_maskrq   r   r   r      s   & z6LTX2ConnectorTransformer1d.forward.<locals>.<listcomp>c                 S   s   g | ]}|j d  qS r   )r=   r   xr   r   r   r          c                    s   g | ]} | qS r   r   )r   valid_seq_len)seq_lenr   r   r      s    c                 S   s(   g | ]\}}t j|d d d |fd dqS )r   )padvalue)Fr   )r   r   pr   r   r   r      s    c                 S   s   g | ]}| d qS r   )r0   r   r   r   r   r      r   r+   )dimsr*   )r    )rr   rs   )r=   r   r|   r   r-   tilerW   ndimsqueezer   zipr<   flipr0   r;   r   r    r   is_grad_enabledr   _gradient_checkpointing_funcr   )r   rq   rr   r   r   r   num_register_repeats	registershidden_states_non_paddedvalid_seq_lenspad_lengthspadded_hidden_statesflipped_maskrs   blockr   )r   rq   r   r   rR      s<   





z"LTX2ConnectorTransformer1d.forward)
ry   rz   r$   rz   r   r   Tr`   Fr   )Nr   )rS   rT   rU   rV    _supports_gradient_checkpointingrW   rX   rY   rZ   r   r-   r\   r[   rR   r]   r   r   r   r   rx      sZ    	
4rx   c                        s   e Zd ZdZe	ddedededededed	B d
edededed	B dededededef fddZ		dde
jde
jdefddZ  ZS )LTX2TextConnectorsz
    Text connector stack used by LTX 2.0 to process the packed text encoder hidden states for both the video and audio
    streams.
    r   caption_channelstext_proj_in_factor#video_connector_num_attention_heads"video_connector_attention_head_dimvideo_connector_num_layers'video_connector_num_learnable_registersN#audio_connector_num_attention_heads"audio_connector_attention_head_dimaudio_connector_num_layers'audio_connector_num_learnable_registersconnector_rope_base_seq_lenr~   r   r   r   c                    s\   t    tj|| |dd| _t|||||||||d	| _t|||	|
|||||d	| _d S )NF)bias)	r   ra   r{   r|   r}   r~   r   r   r   )r   r   rk   Lineartext_proj_inrx   video_connectoraudio_connector)r   r   r   r   r   r   r   r   r   r   r   r   r~   r   r   r   r   r   r   r     s0   
zLTX2TextConnectors.__init__Ftext_encoder_hidden_statesrr   additive_maskc           
      C   s   |s |j }|d |jd dd|jd }||t|j }| |}| ||\}}|dk tj	}||jd |jd d}|| }|
d}| ||\}}	|||fS )Nr#   r   r*   r`   )r"   r?   r=   r6   r-   finfomaxr   r   int64r   r   )
r   r   rr   r   
text_dtypevideo_text_embeddingnew_attn_mask	attn_maskaudio_text_embeddingr   r   r   r   rR   0  s    


zLTX2TextConnectors.forward)r   )F)rS   rT   rU   rV   r   rW   rX   rY   rZ   r   r-   r\   rR   r]   r   r   r   r   r      sV    	
,r   )r-   torch.nnrk   torch.nn.functional
functionalr   configuration_utilsr   r   loadersr   models.attentionr   models.modeling_utilsr   $models.transformers.transformer_ltx2r   r	   Moduler
   r^   rx   r   r   r   r   r   <module>   s    X*p