o
    Gi                  	   @   s  d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlm  m	Z
 ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ d	d
lmZmZ d	dlmZmZmZ d	dlmZ d	dlm Z  d	dl!m"Z"m#Z#m$Z$ d	dl%m&Z& d	dl'm(Z( d	dl)m*Z* e+e,Z-dd Z.dd Z/dej0dej0fddZ1dddej0dej0fddZ2G dd  d ej3Z4G d!d" d"Z5G d#d dejj3eZ6G d$d% d%ej3Z7G d&d' d'ej3Z8eG d(d) d)ej3Z9G d*d+ d+e(eeee eZ:dS ),    N)	lru_cache)Any   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)apply_lora_scalelogging)maybe_allow_in_graph   )ContextParallelInputContextParallelOutput)AttentionMixinAttentionModuleMixinFeedForward)dispatch_attention_fn)
CacheMixin)PixArtAlphaTextProjectionTimestepEmbedding	Timesteps)Transformer2DModelOutput)
ModelMixin)FP32LayerNormc                 C   sj   | j \}}}}}|\}}}	|||  | }
|||  | }|	||	  |	 }tjjj| d|d|d|
fddS )Nr   	replicate)mode)shapetorchnn
functionalpad)xkernel_sizebcthwptphpwpad_tpad_hpad_w r.   d/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_helios.pypad_for_3d_conv(   s   
 r0   c                 C   s   t jjj| ||dS )N)stride)r   r   r   
avg_pool3d)r!   r"   r.   r.   r/   center_down_sample_3d1   s   r3   hidden_states	freqs_cisc                 C   s   |  ddd\}}|djddd\}}t| }||ddd df  ||ddd df   |ddd df< ||ddd df  ||ddd df   |ddd df< || S )	N)r6   r   r   dim.r      )	unflattenunbind	unsqueezechunkr   
empty_liketype_as)r4   r5   x_1x_2cossinoutr.   r.   r/   apply_rotary_emb_transposed5   s   
::
rF   attnHeliosAttentionencoder_hidden_statesc                 C   s   |d u r|}| j r,| js| |jddd\}}}n!| |}| |jddd\}}n| |}| |}| |}|||fS )Nr   r6   r8   r   )fused_projectionsis_cross_attentionto_qkvr>   to_qto_kvto_kto_v)rG   r4   rI   querykeyvaluer.   r.   r/   _get_qkv_projectionsA   s   




rT   c                       sF   e Zd Zddededef fddZdejd	ejd
efddZ	  Z
S )HeliosOutputNormư>Fr9   epselementwise_affinec                    s<   t    ttdd||d  | _t||dd| _d S )Nr:   r         ?FrX   )	super__init__r   	Parameterr   randnscale_shift_tabler   norm)selfr9   rW   rX   	__class__r.   r/   r\   V   s   
zHeliosOutputNorm.__init__r4   temboriginal_context_lengthc                 C   s   |d d | d d d f }| j d|j|d jddd\}}|d|j|d|j}}|d d | d d d f }| | d|  | |}|S )Nr   r   r8   r:   )	r_   r=   todevicer>   squeezer`   floatr@   )ra   r4   rd   re   shiftscaler.   r.   r/   forward[   s   ,& zHeliosOutputNorm.forward)rV   F)__name__
__module____qualname__intri   boolr\   r   Tensorrl   __classcell__r.   r.   rb   r/   rU   U   s    "rU   c                   @   sl   e Zd ZdZdZdd Z				ddddejdejdB dejdB d	eejejf dB d
e	dejfddZ
dS )HeliosAttnProcessorNc                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzeHeliosAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher.)hasattrFImportErrorra   r.   r.   r/   r\   h   s
   
zHeliosAttnProcessor.__init__rG   rH   r4   rI   attention_mask
rotary_embre   returnc              
   C   sb  t |||\}}}	||}||}|d|jdf}|d|jdf}|	d|jdf}	|d ur<t||}t||}|js|jr|jd | }
|
dkrdt	
|j|jd   }|jdkrg|dddd}t	j|d d d |
f | |d d |
d f gdd}t|||	|dd	| j|d u r| jnd d
}|dd}||}|jd |}|jd |}|S )Nr   r6   r:   r         ?per_headr8           F)	attn_mask	dropout_p	is_causalbackendparallel_configr   )rT   norm_qnorm_kr;   headsrF   rK   is_amplify_historyr   r   sigmoidhistory_key_scale	max_scalehistory_scale_modeviewcatr   _attention_backend_parallel_configflattenr@   to_out)ra   rG   r4   rI   rz   r{   re   rQ   rR   rS   history_seq_len	scale_keyr.   r.   r/   __call__n   s>   	




6
zHeliosAttnProcessor.__call__NNNN)rm   rn   ro   r   r   r\   r   rr   tuplerp   r   r.   r.   r.   r/   rt   d   s.    
rt   c                       s   e Zd ZeZegZ										dded	ed
ededededB dedB f fddZdd Z	e
 dd Z				dde
jde
jdB de
jdB dee
je
jf dB dede
jfddZ  ZS )rH      @   h㈵>r   NFr~   r9   r   dim_headrW   dropoutadded_kv_proj_dimcross_attention_dim_headc                    s  t    || | _|| _|| _|| _|d u r| jn|| | _tjj	|| jdd| _
tjj	|| jdd| _tjj	|| jdd| _tjtjj	| j|ddtj|g| _tjj|| |dd| _tjj|| |dd| _d  | _| _|d urtjj	|| jdd| _tjj	|| jdd| _tjj|| |d| _|	d ur|	| _n|d u| _| | |
| _|
r|dkrttd| _n|dkrtt|| _ntd| || _d	| _d S d S )
NTbias)rW   rX   )rW   scalarr:   r~   zUnknown history_scale_mode: g      $@)r[   r\   	inner_dimr   r   r   kv_inner_dimr   r   LinearrM   rO   rP   
ModuleListDropoutr   RMSNormr   r   
add_k_proj
add_v_projnorm_added_krK   set_processorr   r]   onesr   
ValueErrorr   r   )ra   r9   r   r   rW   r   r   r   	processorrK   r   r   rb   r.   r/   r\      sF   





zHeliosAttention.__init__c                 C   s  t | ddrd S | js]t| jjj| jjj| jjjg}t| jj	j| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sKw   Y  | jj||dddd nIt| jjj| jjjg}t| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sw   Y  | jj||dddd | jd urt| jjj| jjjg}t| jj	j| jj	jg}|j
\}}td tj||dd| _W d    n1 sw   Y  | jj||dddd d| _d S )NrJ   FmetaTr   )weightr   )strictassign)getattrrK   r   r   rM   r   datarO   rP   r   r   rg   r   r   rL   load_state_dictrN   r   r   r   to_added_kvrJ   )ra   concatenated_weightsconcatenated_biasout_featuresin_featuresr.   r.   r/   fuse_projections   s@   ""




z HeliosAttention.fuse_projectionsc                 C   sV   t | ddsd S t| drt| d t| drt| d t| dr&t| d d| _d S )NrJ   FrL   rN   r   )r   rv   delattrrJ   ry   r.   r.   r/   unfuse_projections  s   






z"HeliosAttention.unfuse_projectionsr4   rI   rz   r{   re   r|   c                 K   s   | j | |||||fi |S N)r   )ra   r4   rI   rz   r{   re   kwargsr.   r.   r/   rl     s   	zHeliosAttention.forward)
r   r   r   r   NNNNFr~   r   )rm   rn   ro   rt   _default_processor_cls_available_processorsrp   ri   r\   r   r   no_gradr   rr   r   rl   rs   r.   r.   rb   r/   rH      s`    :#
c                       sR   e Zd Zdedededef fddZ		dd	ejd
ejdB defddZ  Z	S )HeliosTimeTextEmbeddingr9   time_freq_dimtime_proj_dimtext_embed_dimc                    sT   t    t|ddd| _t||d| _t | _t	||| _
t||dd| _d S )NTr   )num_channelsflip_sin_to_cosdownscale_freq_shift)in_channelstime_embed_dim	gelu_tanh)act_fn)r[   r\   r   timesteps_projr   time_embedderr   SiLUr   r   	time_projr   text_embedder)ra   r9   r   r   r   rb   r.   r/   r\   $  s   

z HeliosTimeTextEmbedding.__init__NTtimesteprI   is_return_encoder_hidden_statesc                 C   s|   |  |}tt| j j}|j|kr|tjkr||}| |	|}| 
| |}|d ur9|r9| |}|||fS r   )r   nextiterr   
parametersdtyper   int8rf   r@   r   r   r   )ra   r   rI   r   time_embedder_dtyperd   timestep_projr.   r.   r/   rl   3  s   



zHeliosTimeTextEmbedding.forward)NT)
rm   rn   ro   rp   r\   r   rr   rq   rl   rs   r.   r.   rb   r/   r   #  s&    r   c                       s^   e Zd Z fddZdd Ze dd Ze eddd	d
 Z	e dd Z
  ZS )HeliosRotaryPosEmbedc                    sl   t    |\| _| _| _|| _| jd| | jdd | jd| | jdd | jd| | jdd d S )Nfreqs_base_tF)
persistentfreqs_base_yfreqs_base_x)r[   r\   DTDYDXthetaregister_buffer_get_freqs_base)ra   rope_dimr   rb   r.   r/   r\   H  s   
zHeliosRotaryPosEmbed.__init__c                 C   s.   d| j tjd|dtjdd |d  |   S )Nr}   r   r   )r   )r   r   arangefloat32)ra   r9   r.   r.   r/   r   P  s   .z$HeliosRotaryPosEmbed._get_freqs_basec                 C   s,   t d||}|jddd}| | fS )Nzd,bthw->dbthwr   r   r8   )r   einsumrepeat_interleaverC   rD   )ra   
freqs_baseposfreqsr.   r.   r/   get_frequency_batchedS  s   z*HeliosRotaryPosEmbed.get_frequency_batched    )maxsizec           	      C   sJ   t |}t j||t jd}t j||t jd}t j||dd\}}||fS )Nrg   r   ij)indexing)r   rg   r   r   meshgrid)	ra   heightwidth
device_strrg   grid_y_coordsgrid_x_coordsgrid_ygrid_xr.   r.   r/   _get_spatial_meshgridY  s
   
z*HeliosRotaryPosEmbed._get_spatial_meshgridc                 C   s  |j d }|j d }|j|tjd}| ||t|\}}|d d d d d d f ||||}	|d d d d d d f ||dd}
|d d d d d d f ||dd}| | j|	\}}| | j	|
\}}| | j
|\}}tj||||||gdd}|dddddS )	Nr   r:   r   r6   r8   r   r      )r   rf   r   r   r   strexpandr   r   r   r   r   permute)ra   frame_indicesr   r   rg   
batch_size
num_framesr   r   grid_tgrid_y_batchgrid_x_batchfreqs_cos_tfreqs_sin_tfreqs_cos_yfreqs_sin_yfreqs_cos_xfreqs_sin_xresultr.   r.   r/   rl   b  s   

$$$zHeliosRotaryPosEmbed.forward)rm   rn   ro   r\   r   r   r   r   r   r   rl   rs   r.   r.   rb   r/   r   G  s    
r   c                       s   e Zd Z							ddededed	ed
edededB dededef fddZ	ddej	dej	dej	dej	dedej	fddZ
  ZS )HeliosTransformerBlockrms_norm_across_headsFrV   Nr~   r9   ffn_dim	num_headsqk_normcross_attn_normrW   r   guidance_cross_attnr   r   c              
      s   t    t||dd| _t|||| |d t |	|
d| _t|||| |||| t d| _|r7t||ddnt	 | _
t||dd| _t||dd| _ttdd	||d
  | _|| _d S )NFrZ   )r9   r   r   rW   r   r   r   r   )r9   r   r   rW   r   r   r   Tzgelu-approximate)r   activation_fnr:      rY   )r[   r\   r   norm1rH   rt   attn1attn2r   Identitynorm2r   ffnnorm3r]   r   r^   r_   r  )ra   r9   r  r  r  r  rW   r   r  r   r   rb   r.   r/   r\   y  s4   
	
zHeliosTransformerBlock.__init__r4   rI   rd   r{   re   r|   c                 C   s  |j dkr9| jd|  jddd\}}}}	}
}|d}|d}|d}|	d}	|
d}
|d}n| j|  jddd\}}}}	}
}| | d|  | |}| |d d ||}| ||  |}| j	r|j
d | }tj|||gdd\}}| | |}| ||d d |}|| }tj||gdd}n| | |}| ||d d |}|| }| | d|
  |	 |}| |}| | |  |}|S )Nr   r   r  r   r8   r:   )ndimr_   r=   ri   r>   rh   r  r@   r  r  r   r   splitr  r  r   r  r  )ra   r4   rI   rd   r{   re   	shift_msa	scale_msagate_msac_shift_msac_scale_msa
c_gate_msanorm_hidden_statesattn_outputr   history_hidden_states	ff_outputr.   r.   r/   rl     sh   





 


zHeliosTransformerBlock.forward)r  FrV   NFFr~   r   )rm   rn   ro   rp   r   rq   ri   r\   r   rr   rl   rs   r.   r.   rb   r/   r  w  sV    	
7r  c                ,       s  e Zd ZdZdZg dZddgZg dZdgZdgZ	de
d	d
ddie
d	ddde
d	d
ddded	d
ddZe																				d<deedf d ed!ed"ed#ed$ed%ed&ed'ed(ed)edB d*ed+edB d,eedf d-ed.ed/ed0ed1ed2ed3df* fd4d5Zed6									d=dejd7ejd8ejd9ed6eeef dB d3ejeeejf B fd:d;Z  ZS )>HeliosTransformer3DModela  
    A Transformer model for video-like data used in the Helios model.

    Args:
        patch_size (`tuple[int]`, defaults to `(1, 2, 2)`):
            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
        num_attention_heads (`int`, defaults to `40`):
            Fixed length for text embeddings.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each head.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output.
        text_dim (`int`, defaults to `512`):
            Input dimension for text embeddings.
        freq_dim (`int`, defaults to `256`):
            Dimension for sinusoidal time embeddings.
        ffn_dim (`int`, defaults to `13824`):
            Intermediate dimension in feed-forward network.
        num_layers (`int`, defaults to `40`):
            The number of layers of transformer blocks to use.
        window_size (`tuple[int]`, defaults to `(-1, -1)`):
            Window size for local attention (-1 indicates global attention).
        cross_attn_norm (`bool`, defaults to `True`):
            Enable cross-attention normalization.
        qk_norm (`bool`, defaults to `True`):
            Enable query/key normalization.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        add_img_emb (`bool`, defaults to `False`):
            Whether to use img_emb.
        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
            The number of channels to use for the added key and value projections. If `None`, no projection is used.
    T)patch_embeddingpatch_short	patch_mid
patch_longcondition_embedderr`   r  rU   )r   r_   r  r  r  r   norm_added_qr4   r:   r   F)	split_dimexpected_dimssplit_outputr   )rd   r{   )
gather_dimr3  )zblocks.0zblocks.*z	blocks.39r:   r   r   (                6  r  rV   N,   *   r?       @r~   
patch_size.num_attention_headsattention_head_dimr   out_channelstext_dimfreq_dimr  
num_layersr  r  rW   r   r   
rope_thetar  zero_history_timestephas_multi_term_memory_patchr   r   r|   c                    s2  t    | |p|}t||d| _tj|||d| _|| _|r]tj|||d| _tj|t	dd |D t	dd |D d| _
tj|t	dd |D t	dd |D d| _t|d |d	| _t 	f
d
dt|	D | _tdd| _t|t| | _d| _d S )N)r   r   )r"   r1   c                 s       | ]}d | V  qdS r   Nr.   .0pr.   r.   r/   	<genexpr>a      z4HeliosTransformer3DModel.__init__.<locals>.<genexpr>c                 s   rK  rL  r.   rM  r.   r.   r/   rP  b  rQ  c                 s   rK  r   Nr.   rM  r.   r.   r/   rP  g  rQ  c                 s   rK  rR  r.   rM  r.   r.   r/   rP  h  rQ  r  )r9   r   r   r   c                    s(   g | ]}t 	 d 
qS ))r  r   r   )r  )rN  _
r   r  rW   r  r  r   r   r   rB  r  r.   r/   
<listcomp>u  s    z5HeliosTransformer3DModel.__init__.<locals>.<listcomp>FrZ   )r[   r\   r   roper   Conv3dr,  rI  r-  r   r.  r/  r   r0  r   rangeblocksrU   norm_outr   mathprodproj_outgradient_checkpointing)ra   rA  rB  rC  r   rD  rE  rF  r  rG  r  r  rW   r   r   rH  r  rI  rJ  r   r   rb   rT  r/   r\   9  sD   

z!HeliosTransformer3DModel.__init__attention_kwargsr   rI   return_dictc           %   
   C   sv  |j d }| jj\}}}| |}|j \}}}}}|d u r*td|d|d}|d	dd}| j
||||jd}|d	dd}|j d }|d ur|d ur| |}|j \}}}}}|d	dd}| j
||||jd}|d	dd}tj||gdd}tj||gdd}|	d ur|d urt|	d}	| |	}	|	d	dd}	| j
||||	jd}t|d}t|d}|d	dd}tj|	|gdd}tj||gdd}|
d ur$|d ur$t|
d	}
| |
}
|
d	dd}
| j
||||
jd}t|d
}t|d
}|d	dd}tj|
|gdd}tj||gdd}|j d | }|d ure| jretjd|j|jd}| j||dd\}}}|d||d}|dddddd|d|d}| ||\} }!}|!dd}!|d ur| js|j d }"n|}"| |dd||"d} |!|ddd|d|"d}!|d ur| jrtj|| gdd} tj||!gdd}!|!jdkr|!dddd}!| }| }| }t r| jr| jD ]}#| |#|||!||}qn| jD ]}#|#|||!||}q| || |}|  |}|!|||||||d}|dddddddd}|dddddd}$|s6|$fS t"|$dS )Nr   r6   r   r:   )r  r   r   rg   r8   )r   r   r   )r   r   r   )r   r   r   )r   r   r   )r   rg   F)r   )r  r6   r  r   r         )sample)#r   configrA  r,  r   r   r=   r   r   	transposerV  rg   r-  r   r0   r.  r3   r/  rI  zerosr   r0  r;   r   r  r  
contiguousis_grad_enabledr^  rY  _gradient_checkpointing_funcrZ  r]  reshaper   )%ra   r4   r   rI   indices_hidden_statesindices_latents_history_shortindices_latents_history_midindices_latents_history_longlatents_history_shortlatents_history_midlatents_history_longr`  r_  r  p_tp_hp_wrS  post_patch_num_framespost_patch_heightpost_patch_widthr{   re   H1W1rotary_emb_history_shortrotary_emb_history_midrotary_emb_history_longhistory_context_lengthtimestep_t0temb_t0timestep_proj_t0rd   r   main_repeat_sizeblockoutputr.   r.   r/   rl     s   















	

z HeliosTransformer3DModel.forward)r6  r7  r8  r9  r9  r:  r;  r<  r7  Tr  rV   Nr=  r@  TTTFr~   )	NNNNNNNTN)rm   rn   ro   __doc__ _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_keep_in_fp32_modules"_keys_to_ignore_on_load_unexpected_repeated_blocksr   r   _cp_planr   r   rp   rq   r   ri   r\   r	   r   rr   
LongTensordictr   rl   rs   r.   r.   rb   r/   r+    s    $

	

Rr+  );r[  	functoolsr   typingr   r   torch.nnr   torch.nn.functionalr   rw   configuration_utilsr   r   loadersr   r   utilsr	   r
   utils.torch_utilsr   _modeling_parallelr   r   	attentionr   r   r   attention_dispatchr   cache_utilsr   
embeddingsr   r   r   modeling_outputsr   modeling_utilsr   normalizationr   
get_loggerrm   loggerr0   r3   rr   rF   rT   ModulerU   rt   rH   r   r   r  r+  r.   r.   r.   r/   <module>   sL   
	
< $0
{