o
    GiU                     @   s  d dl Z d dlm  mZ d dl mZ ddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ e	eZG dd dejZG dd dejZ G dd dZ!eG dd dejZ"G dd deeZ#dS )    N)nn   )ConfigMixinregister_to_config)logging)maybe_allow_in_graph   )	AttentionFeedForward)TimestepEmbedding	Timestepsget_3d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormFP32LayerNormRMSNormc                       sx   e Zd Z				ddededededed	ed
df fddZdej	dej	dej	d
e
ej	ej	ej	ej	f fddZ  ZS )EasyAnimateLayerNormZeroTh㈵>fp32_layer_normconditioning_dimembedding_dimelementwise_affineepsbias	norm_typereturnNc                    st   t    t | _tj|d| |d| _|dkr$tj|||d| _d S |dkr2t	|||d| _d S t
d| d)N   )r   
layer_normr   r   r   zUnsupported `norm_type` (z@) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'.)super__init__r   SiLUsiluLinearlinear	LayerNormnormr   
ValueError)selfr   r   r   r   r   r   	__class__ i/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_easyanimate.pyr!   "   s   
	

z!EasyAnimateLayerNormZero.__init__hidden_statesencoder_hidden_statestembc           
      C   sv   |  | |jddd\}}}}}}	| |d|d  |d }| |d|d  |d }||||	fS )Nr      dim)r%   r#   chunkr'   	unsqueeze)
r)   r.   r/   r0   shiftscalegate	enc_shift	enc_scaleenc_gater,   r,   r-   forward9   s   &"z EasyAnimateLayerNormZero.forward)Tr   Tr   )__name__
__module____qualname__intboolfloatstrr!   torchTensortupler<   __classcell__r,   r,   r*   r-   r   !   s:    r   c                       sL   e Zd Zdedee ddf fddZdd Zd	ejdejfd
dZ	  Z
S )EasyAnimateRotaryPosEmbed
patch_sizerope_dimr   Nc                    s   t    || _|| _d S N)r    r!   rI   rJ   )r)   rI   rJ   r*   r,   r-   r!   E   s   

z"EasyAnimateRotaryPosEmbed.__init__c                 C   s   |}|}|\}}|| }||| kr|}	t t|| | }
n|}
t t|| | }	t t||	 d }t t||
 d }||f||	 ||
 ffS )Ng       @)r@   round)r)   src	tgt_width
tgt_heighttwthhwrresize_heightresize_widthcrop_top	crop_leftr,   r,   r-   get_resize_crop_region_for_gridK   s   z9EasyAnimateRotaryPosEmbed.get_resize_crop_region_for_gridr.   c                 C   sn   |  \}}}}}|| j }|| j }d| j }d| j }| ||f||}	t| j|	||f| ddd}
|
S )NZ   <   r   T)	grid_sizetemporal_sizeuse_real)sizerI   rY   r   rJ   )r)   r.   bsc
num_framesgrid_height
grid_widthbase_size_widthbase_size_heightgrid_crops_coordsimage_rotary_embr,   r,   r-   r<   \   s    




z!EasyAnimateRotaryPosEmbed.forward)r=   r>   r?   r@   listr!   rY   rD   rE   r<   rG   r,   r,   r*   r-   rH   D   s    rH   c                   @   sR   e Zd ZdZdd Z		ddedejdejdejdB d	ejdB d
ejfddZdS )EasyAnimateAttnProcessor2_0z
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
    used in the EasyAnimateTransformer3DModel model.
    c                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzaEasyAnimateAttnProcessor2_0 requires PyTorch 2.0 or above. To use it, please install PyTorch 2.0.)hasattrFImportError)r)   r,   r,   r-   r!   v   s
   
z$EasyAnimateAttnProcessor2_0.__init__Nattnr.   r/   attention_maskrh   r   c                 C   s4  |j d u r|d urtj||gdd}||}||}||}|d|jdfdd}|d|jdfdd}|d|jdfdd}|j	d urR|	|}|j
d ur\|
|}|j d ur|d ur| |}	||}
||}|	d|jdfdd}	|
d|jdfdd}
|d|jdfdd}|jd ur||	}	|jd ur||
}
tj|	|gdd}tj|
|gdd}tj||gdd}|d urddlm} ||d d d d |jd d f ||d d d d |jd d f< |js||d d d d |jd d f ||d d d d |jd d f< tj||||ddd}|dddd	}||j}|d ur|d d d |jd f |d d |jd d f }}t|d
d d urm|jd |}|jd |}t|dd d ur{||}||fS t|d
d d ur|jd |}|jd |}||fS )Nr1   r2   r   )apply_rotary_emb        F)	attn_mask	dropout_p	is_causalr   to_outr   
to_add_out)
add_q_projrD   catto_qto_kto_v	unflattenheads	transposenorm_qnorm_k
add_k_proj
add_v_projnorm_added_qnorm_added_k
embeddingsrr   shapeis_cross_attentionrm   rk   flattentodtypegetattrrw   rx   )r)   ro   r.   r/   rp   rh   querykeyvalueencoder_queryencoder_keyencoder_valuerr   r,   r,   r-   __call__|   sl   














    

z$EasyAnimateAttnProcessor2_0.__call__)NN)	r=   r>   r?   __doc__r!   r	   rD   rE   r   r,   r,   r,   r-   rj   p   s$    rj   c                       s   e Zd Z											d ded	ed
ededededededededB dededededef fddZ	d!dej	dej	dej	de
ej	ej	f dB de
ej	ej	f f
ddZ  ZS )"EasyAnimateTransformerBlockrs   gelu-approximateTư>NFr   r3   num_attention_headsattention_head_dimtime_embed_dimdropoutactivation_fnnorm_elementwise_affinenorm_epsfinal_dropoutff_inner_dimff_biasqk_norm
after_normr   is_mmdit_blockc                    s   t    t|||||dd| _t||||rdnd ddd|r |nd |r%dnd t d
| _t|||||dd| _t||||	|
|d| _	d | _
|rRt||||	|
|d| _
d | _|rat|||d| _d S d S )	NT)r   r   r   r   F)
	query_dimdim_headr   r   r   r   added_proj_biasadded_kv_proj_dimcontext_pre_only	processor)r   r   r   	inner_dimr   r   )r    r!   r   norm1r	   rj   attn1norm2r
   fftxt_ffnorm3r   )r)   r3   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   r,   r-   r!      sR   



		z$EasyAnimateTransformerBlock.__init__r.   r/   r0   rh   r   c                 C   s  |  |||\}}}}| j|||d\}	}
||d|	  }||d|
  }| |||\}}}}| jd urV| | |}| jd urM| | |}n| | |}n| |}| jd urf| |}n| |}||d|  }||d|  }||fS )N)r.   r/   rh   r1   )r   r   r5   r   r   r   r   )r)   r.   r/   r0   rh   norm_hidden_statesnorm_encoder_hidden_statesgate_msaenc_gate_msaattn_hidden_statesattn_encoder_hidden_statesgate_ffenc_gate_ffr,   r,   r-   r<     s2   





z#EasyAnimateTransformerBlock.forward)rs   r   Tr   TNTTFr   TrK   )r=   r>   r?   r@   rB   rC   rA   r!   rD   rE   rF   r<   rG   r,   r,   r*   r-   r      sn    	
Gr   c                2       s@  e Zd ZdZdZdgZg dZe									
																	d8dedededB dedB dedB dedede	de	dededede
d ed!ed"ed#ed$e
d%ed&ed'e	d(ed)ed*ef0 fd+d,Z						d9d-ejd.ejd/ejdB d0ejdB d1ejdB d2ejdB d3ejdB d4ed5eej eB fd6d7Z  ZS ):EasyAnimateTransformer3DModela
  
    A Transformer model for video-like data in [EasyAnimate](https://github.com/aigc-apps/EasyAnimate).

    Parameters:
        num_attention_heads (`int`, defaults to `48`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `16`):
            The number of channels in the output.
        patch_size (`int`, defaults to `2`):
            The size of the patches to use in the patch embedding layer.
        sample_width (`int`, defaults to `90`):
            The width of the input latents.
        sample_height (`int`, defaults to `60`):
            The height of the input latents.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to use in feed-forward.
        timestep_activation_fn (`str`, defaults to `"silu"`):
            Activation function to use when generating the timestep embeddings.
        num_layers (`int`, defaults to `30`):
            The number of layers of Transformer blocks to use.
        mmdit_layers (`int`, defaults to `1000`):
            The number of layers of Multi Modal Transformer blocks to use.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        time_embed_dim (`int`, defaults to `512`):
            Output dimension of timestep embeddings.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        norm_eps (`float`, defaults to `1e-5`):
            The epsilon value to use in normalization layers.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use elementwise affine in normalization layers.
        flip_sin_to_cos (`bool`, defaults to `True`):
            Whether to flip the sin to cos in the time embedding.
        time_position_encoding_type (`str`, defaults to `3d_rope`):
            Type of time position encoding.
        after_norm (`bool`, defaults to `False`):
            Flag to apply normalization after.
        resize_inpaint_mask_directly (`bool`, defaults to `True`):
            Flag to resize inpaint mask directly.
        enable_text_attention_mask (`bool`, defaults to `True`):
            Flag to enable text attention mask.
        add_noise_in_inpaint_model (`bool`, defaults to `False`):
            Flag to add noise in inpaint model.
    Tr   )z^proj$r'   z
^proj_out$0   @   NrZ   r[   r   r#   r   rs      F   r   3d_roper   r   in_channelsout_channelsrI   sample_widthsample_heightr   timestep_activation_fn
freq_shift
num_layersmmdit_layersr   r   add_norm_text_encodertext_embed_dimtext_embed_dim_t5r   r   flip_sin_to_costime_position_encoding_typeresize_inpaint_mask_directlyenable_text_attention_maskadd_noise_in_inpaint_modelc                    sJ  t     t||
| _t	|	| _t|| _tj	|||f|dd| _
d | _d | _|sEt|| _|d urDt|| _n$tt|dddt|| _|d uritt|dddt|| _t 	f
ddt|D | _t| _t	d dd	| _t|| | | _d
| _d S )NT)kernel_sizestrider   r   )r   c                    s4   g | ]}t 	 |k rd ndd
qS )TF)
r3   r   r   r   r   r   r   r   r   r   )r   ).0_
r   r   r   r   r   r   r   r   r   r   r,   r-   
<listcomp>  s    z:EasyAnimateTransformer3DModel.__init__.<locals>.<listcomp>r   r1   )r   
output_dimr   r   	chunk_dimF)r    r!   r   	time_projr   time_embeddingrH   rope_embeddingr   Conv2dproj	text_projtext_proj_t5r$   
Sequentialr   
ModuleListrangetransformer_blocksr&   
norm_finalr   norm_outproj_outgradient_checkpointing)r)   r   r   r   r   rI   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r*   r   r-   r!   s  sJ   

z&EasyAnimateTransformer3DModel.__init__r.   timesteptimestep_condr/   encoder_hidden_states_t5inpaint_latentscontrol_latentsreturn_dictr   c	              	   C   s  |  \}	}
}}}| jj}|| }|| }| |j|jd}| ||}| |}|d ur6t	||gd}|d urBt	||gd}|
ddddddd}| |}|d|	df
ddddd}|dddd}| |}|d ur| |}tj||gdd }| jD ]}t r| jr| |||||\}}q|||||\}}q| |}| j||d	}| |}| jj}||	||||
||}|
ddddd
ddd
ddd}|s|fS t|dS )N)r   r1   r   r   r      rq   r2   )r0      r   )sample)r_   configrI   r   r   r   r   r   rD   concatpermuter   r   r~   r   r   r   rz   
contiguousr   is_grad_enabledr   _gradient_checkpointing_funcr   r   r   reshaper   )r)   r.   r   r   r/   r   r   r   r   
batch_sizechannelsvideo_lengthheightwidthppost_patch_heightpost_patch_widthr0   rh   blockoutputr,   r,   r-   r<     sL   










&
z%EasyAnimateTransformer3DModel.forward)r   r   NNNrZ   r[   r   r#   r   r   r   rs   r   Fr   Nr   TTr   FTTT)NNNNNT)r=   r>   r?   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   r@   rC   rB   rA   r!   rD   rE   rF   r   r<   rG   r,   r,   r*   r-   r   <  s    2	
]	
r   )$rD   torch.nn.functionalr   
functionalrm   configuration_utilsr   r   utilsr   utils.torch_utilsr   	attentionr	   r
   r   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerr=   loggerModuler   rH   rj   r   r   r,   r,   r,   r-   <module>   s$   
#,`k