import torch
import torch.nn as nn
import torch.nn.functional as F

from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import logging
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import FeedForward
from ..attention_processor import AllegroAttnProcessor2_0, Attention
from ..cache_utils import CacheMixin
from ..embeddings import PatchEmbed, PixArtAlphaTextProjection
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..normalization import AdaLayerNormSingle


logger = logging.get_logger(__name__)


@maybe_allow_in_graph
class AllegroTransformerBlock(nn.Module):
    r"""
    Transformer block used in [Allegro](https://github.com/rhymes-ai/Allegro) model.

    Args:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        cross_attention_dim (`int`, *optional*, defaults to `None`):
            The dimension of the cross attention features.
        activation_fn (`str`, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        attention_bias (`bool`, defaults to `False`):
            Whether or not to use bias in attention projection layers.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_eps (`float`, defaults to `1e-5`):
            Epsilon value for normalization layers.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: int | None = None,
        activation_fn: str = "geglu",
        attention_bias: bool = False,
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
    ):
        super().__init__()

        # 1. Self attention
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=None,
            processor=AllegroAttnProcessor2_0(),
        )

        # 2. Cross attention
        self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
        self.attn2 = Attention(
            query_dim=dim,
            cross_attention_dim=cross_attention_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            processor=AllegroAttnProcessor2_0(),
        )

        # 3. Feed forward
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)

        # 4. AdaLN-single modulation parameters
        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
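
    # The scale-shift table implements PixArt-style "AdaLN-single" modulation:
    # a learned (6, dim) table is added to the shared timestep embedding and
    # split into shift/scale/gate triples for the attention and feed-forward
    # branches, instead of predicting them with a per-block MLP.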
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor | None = None,
        temb: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        image_rotary_emb=None,
    ) -> torch.Tensor:
        batch_size = hidden_states.shape[0]

        # 0. Compute the six modulation tensors from the timestep embedding.
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + temb.reshape(batch_size, 6, -1)
        ).chunk(6, dim=1)

        # 1. Self attention
        norm_hidden_states = self.norm1(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
        norm_hidden_states = norm_hidden_states.squeeze(1)

        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=None,
            attention_mask=attention_mask,
            image_rotary_emb=image_rotary_emb,
        )
        attn_output = gate_msa * attn_output

        hidden_states = attn_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        # 2. Cross attention (no pre-norm; queries come from the residual stream)
        if self.attn2 is not None:
            norm_hidden_states = hidden_states

            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                image_rotary_emb=None,
            )
            hidden_states = attn_output + hidden_states

        # 3. Feed forward
        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        ff_output = self.ff(norm_hidden_states)
        ff_output = gate_mlp * ff_output

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states

class AllegroTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
    r"""
    A 3D transformer model for video-like data, used in [Allegro](https://github.com/rhymes-ai/Allegro).
    """

    _supports_gradient_checkpointing = True
    _skip_layerwise_casting_patterns = ["pos_embed", "norm", "adaln_single"]

    @register_to_config
    def __init__(
        self,
        patch_size: int = 2,
        patch_size_t: int = 1,
        num_attention_heads: int = 24,
        attention_head_dim: int = 96,
        in_channels: int = 4,
        out_channels: int = 4,
        num_layers: int = 32,
        dropout: float = 0.0,
        cross_attention_dim: int = 2304,
        attention_bias: bool = True,
        sample_height: int = 90,
        sample_width: int = 160,
        sample_frames: int = 22,
        activation_fn: str = "gelu-approximate",
        norm_elementwise_affine: bool = False,
        norm_eps: float = 1e-6,
        caption_channels: int = 4096,
        interpolation_scale_h: float = 2.0,
        interpolation_scale_w: float = 2.0,
        interpolation_scale_t: float = 2.2,
    ):
        super().__init__()

        self.inner_dim = num_attention_heads * attention_head_dim

        interpolation_scale_t = (
            interpolation_scale_t
            if interpolation_scale_t is not None
            else ((sample_frames - 1) // 16 + 1) if sample_frames % 2 == 1 else sample_frames // 16
        )
        interpolation_scale_h = interpolation_scale_h if interpolation_scale_h is not None else sample_height / 30
        interpolation_scale_w = interpolation_scale_w if interpolation_scale_w is not None else sample_width / 40

        # 1. Patch embedding
        self.pos_embed = PatchEmbed(
            height=sample_height,
            width=sample_width,
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=self.inner_dim,
            pos_embed_type=None,
        )

        # 2. Transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                AllegroTransformerBlock(
                    self.inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                )
                for _ in range(num_layers)
            ]
        )

        # 3. Output norm & projection
        self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
        self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim**0.5)
        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * out_channels)

        # 4. Timestep embedding and caption projection
        self.adaln_single = AdaLayerNormSingle(self.inner_dim, use_additional_conditions=False)
        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=self.inner_dim)

        self.gradient_checkpointing = False
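
    # Note: with the default config, `cross_attention_dim` equals the block
    # width `inner_dim` (2304 = 24 * 96): captions are projected to
    # `inner_dim` by `caption_projection` before the blocks' cross-attention
    # layers consume them.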
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        timestep: torch.LongTensor,
        attention_mask: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        return_dict: bool = True,
    ):
        batch_size, num_channels, num_frames, height, width = hidden_states.shape
        p_t = self.config.patch_size_t
        p = self.config.patch_size

        post_patch_num_frames = num_frames // p_t
        post_patch_height = height // p
        post_patch_width = width // p

        # Convert a 4D keep/discard video mask into an additive attention bias
        # over patch tokens: pool it down to patch resolution, flatten it, and
        # map (keep = 0, discard = -10000) so it can be added to attention scores.
        if attention_mask is not None and attention_mask.ndim == 4:
            attention_mask = attention_mask.to(hidden_states.dtype)
            attention_mask = attention_mask[:, :num_frames]  # [batch_size, num_frames, height, width]

            if attention_mask.numel() > 0:
                attention_mask = attention_mask.unsqueeze(1)  # [batch_size, 1, num_frames, height, width]
                attention_mask = F.max_pool3d(attention_mask, kernel_size=(p_t, p, p), stride=(p_t, p, p))
                attention_mask = attention_mask.flatten(1).view(batch_size, 1, -1)

            attention_mask = (
                (1 - attention_mask.bool().to(hidden_states.dtype)) * -10000.0 if attention_mask.numel() > 0 else None
            )

        # Convert the 2D caption mask to an additive bias the same way.
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            encoder_attention_mask = (1 - encoder_attention_mask.to(self.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # 1. Timestep embeddings
        timestep, embedded_timestep = self.adaln_single(
            timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
        )

        # 2. Patch embeddings
        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).flatten(0, 1)
        hidden_states = self.pos_embed(hidden_states)
        hidden_states = hidden_states.unflatten(0, (batch_size, -1)).flatten(1, 2)

        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, encoder_hidden_states.shape[-1])

        # 3. Transformer blocks
        for i, block in enumerate(self.transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    encoder_hidden_states,
                    timestep,
                    attention_mask,
                    encoder_attention_mask,
                    image_rotary_emb,
                )
            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=timestep,
                    attention_mask=attention_mask,
                    encoder_attention_mask=encoder_attention_mask,
                    image_rotary_emb=image_rotary_emb,
                )

        # 4. Output normalization & projection
        shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
        hidden_states = self.norm_out(hidden_states)
        hidden_states = hidden_states * (1 + scale) + shift
        hidden_states = self.proj_out(hidden_states)
        hidden_states = hidden_states.squeeze(1)

        # 5. Unpatchify back to [batch, channels, frames, height, width]
        hidden_states = hidden_states.reshape(
            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p, p, -1
        )
        hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
        output = hidden_states.reshape(batch_size, -1, num_frames, height, width)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
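

# Minimal smoke-test sketch (not part of the upstream file): build a tiny,
# randomly initialized model and check that a forward pass round-trips the
# input shape. All sizes below are illustrative assumptions; rotary embeddings
# are omitted (`image_rotary_emb=None`), which the Allegro attention processor
# accepts, though the Allegro pipeline normally supplies real values. Run as a
# module (e.g. `python -m diffusers.models.transformers.transformer_allegro`)
# so the relative imports resolve.
if __name__ == "__main__":
    model = AllegroTransformer3DModel(
        num_attention_heads=2,
        attention_head_dim=8,
        num_layers=2,
        cross_attention_dim=16,  # must match num_attention_heads * attention_head_dim
        caption_channels=24,
    )
    video = torch.randn(1, 4, 2, 8, 8)  # [batch, in_channels, frames, height, width]
    captions = torch.randn(1, 16, 24)  # [batch, tokens, caption_channels]
    timestep = torch.tensor([500], dtype=torch.long)

    with torch.no_grad():
        (output,) = model(video, captions, timestep, return_dict=False)
    print(output.shape)  # torch.Size([1, 4, 2, 8, 8]); out_channels == in_channels here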