o
    Gi3E                  	   @   sL  d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZ d	d
lmZ d	dlmZmZ d	dlmZ d	dlmZmZ d	dlmZ d	dlmZ d	dlm Z m!Z! e"e#Z$G dd dej%Z&G dd dej%Z'G dd dej%Z(eG dd dej%Z)G dd dej%Z*eG dd deee	eeZ+dS )    )AnyN   )ConfigMixinregister_to_config)PeftAdapterMixin)FromOriginalModelMixin)apply_lora_scalelogging)maybe_allow_in_graph   )FeedForward)MochiAttentionMochiAttnProcessor2_0)
CacheMixin)%MochiCombinedTimestepCaptionEmbedding
PatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousRMSNormc                       s,   e Zd Zdef fddZdddZ  ZS )MochiModulatedRMSNormepsc                    s"   t    || _td|d| _d S Nr   F)super__init__r   r   norm)selfr   	__class__ c/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_mochi.pyr   '   s   
zMochiModulatedRMSNorm.__init__Nc                 C   s:   |j }|tj}| |}|d ur|| }||}|S N)dtypetotorchfloat32r   )r   hidden_statesscalehidden_states_dtyper   r   r    forward-   s   

zMochiModulatedRMSNorm.forwardr!   )__name__
__module____qualname__floatr   r)   __classcell__r   r   r   r    r   &   s    r   c                       sH   e Zd Z		ddedef fddZdejdejd	ejfd
dZ  ZS )MochiLayerNormContinuoush㈵>Tembedding_dimconditioning_embedding_dimc                    s6   t    t | _tj|||d| _t|d| _d S )N)biasr   )	r   r   nnSiLUsiluLinearlinear_1r   r   )r   r1   r2   r   r3   r   r   r    r   <   s   

z!MochiLayerNormContinuous.__init__xconditioning_embeddingreturnc                 C   sF   |j }| | ||j }| |d|dtj }||S )N   )r"   r9   r7   r#   r   	unsqueezer$   r%   )r   r:   r;   input_dtyper'   r   r   r    r)   J   s   
z MochiLayerNormContinuous.forward)r0   T)	r*   r+   r,   intr   r$   Tensorr)   r.   r   r   r   r    r/   ;   s    r/   c                       sh   e Zd ZdZ	ddededededd	f
 fd
dZdej	dej	de
ej	ej	ej	ej	f fddZ  ZS )MochiRMSNormZeroz}
    Adaptive RMS Norm used in Mochi.

    Parameters:
        embedding_dim (`int`): The size of each embedding vector.
    r0   Fr1   
hidden_dimr   elementwise_affiner<   Nc                    s4   t    t | _t||| _td|d| _d S r   )	r   r   r5   r6   r7   r8   linearr   r   )r   r1   rC   r   rD   r   r   r    r   `   s   

zMochiRMSNormZero.__init__r&   embc                 C   sp   |j }| | |}|jddd\}}}}| |tjd|d d d f tj  }||}||||fS )N   r=   dim)r"   rE   r7   chunkr   r#   r$   r%   )r   r&   rF   r(   	scale_msagate_msa	scale_mlpgate_mlpr   r   r    r)   i   s   .
zMochiRMSNormZero.forward)r0   F)r*   r+   r,   __doc__r@   r-   boolr   r$   rA   tupler)   r.   r   r   r   r    rB   X   s*    	rB   c                       s   e Zd ZdZ				ddededed	ed
ededededdf fddZ	dde	j
de	j
de	j
de	j
de	j
dB dee	j
e	j
f fddZ  ZS )MochiTransformerBlocka/  
    Transformer block used in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

    Args:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        qk_norm (`str`, defaults to `"rms_norm"`):
            The normalization layer to use.
        activation_fn (`str`, defaults to `"swiglu"`):
            Activation function to use in feed-forward.
        context_pre_only (`bool`, defaults to `False`):
            Whether or not to process context-related conditions with additional layers.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
    rms_normswigluFư>rI   num_attention_headsattention_head_dimpooled_projection_dimqk_normactivation_fncontext_pre_onlyr   r<   Nc	           	         s(  t    || _d| d d | _d| d d | _t|d| |dd| _|s3t|d| |dd| _nt|||d| _t	|||d|d|||t
 dd| _t|d	| _| jsZt|d	nd | _t|| _| jsjt|d	nd | _t|| j|dd
| _d | _|st|| j|dd
| _t|d	| _t|d	| _d S )NrG   r   r   F)r   rD   )r1   r2   r   r0   )	query_dimheadsdim_headr3   added_kv_proj_dimadded_proj_biasout_dimout_context_dimr[   	processorr   r4   )	inner_dimrZ   r3   )r   r   r[   ff_inner_dimff_context_inner_dimrB   norm1norm1_contextr/   r   r   attn1r   norm2norm2_contextnorm3norm3_contextr   ff
ff_contextnorm4norm4_context)	r   rI   rV   rW   rX   rY   rZ   r[   r   r   r   r    r      sP   

zMochiTransformerBlock.__init__r&   encoder_hidden_statestembencoder_attention_maskimage_rotary_embc                 C   s   |  ||\}}}}	| js| ||\}
}}}n| ||}
| j||
||d\}}|| |t|d }| |d|d	tj
 }| |}|| |t|	d }| js|| |t|d }| |d|d	tj
 }
| |
}|| |t|d }||fS )N)r&   rr   ru   attention_maskr=   )rg   r[   rh   ri   rj   r$   tanhr>   rl   r#   r%   rn   rp   rk   rm   ro   rq   )r   r&   rr   rs   rt   ru   norm_hidden_statesrL   rM   rN   norm_encoder_hidden_statesenc_gate_msaenc_scale_mlpenc_gate_mlpattn_hidden_statescontext_attn_hidden_states	ff_outputcontext_ff_outputr   r   r    r)      s8   


zMochiTransformerBlock.forward)rS   rT   FrU   r!   )r*   r+   r,   rO   r@   strrP   r-   r   r$   rA   rQ   r)   r.   r   r   r   r    rR   v   sN    	
DrR   c                       s   e Zd ZdZddededdf fddZdejfd	d
Z		ddedededej	dB dej
dB dejfddZdejdejdejfddZ		ddejdedededej	dB dej
dB deejejf fddZ  ZS )	MochiRoPEa  
    RoPE implementation used in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

    Args:
        base_height (`int`, defaults to `192`):
            Base height used to compute interpolation scale for rotary positional embeddings.
        base_width (`int`, defaults to `192`):
            Base width used to compute interpolation scale for rotary positional embeddings.
       base_height
base_widthr<   Nc                    s   t    || | _d S r!   )r   r   target_area)r   r   r   r   r   r    r     s   
zMochiRoPE.__init__c                 C   s4   t j|||d ||d}|d d |dd   d S )Nr=   devicer"   r   )r$   linspace)r   startstopnumr   r"   edgesr   r   r    _centers  s   zMochiRoPE._centers
num_framesheightwidthr   r"   c                 C   s   | j ||  d }tj|||d}| | | d || d |||}| | | d || d |||}	tj|||	dd\}
}}tj|
||gdddd}|S )	Ng      ?r   r   ij)indexingr   rH   r   )r   r$   aranger   meshgridstackview)r   r   r   r   r   r"   r'   thwgrid_tgrid_hgrid_w	positionsr   r   r    _get_positions
  s   $$zMochiRoPE._get_positionsfreqsposc                 C   sl   t |jjt j t d|t j|t j}W d    n1 s#w   Y  t |}t |}||fS )Nznd,dhf->nhf)	r$   autocastr   typer%   einsumr#   cossin)r   r   r   	freqs_cos	freqs_sinr   r   r    _create_rope  s    

zMochiRoPE._create_ropepos_frequenciesc           
      C   s*   |  |||||}| ||\}}	||	fS r!   )r   r   )
r   r   r   r   r   r   r"   r   rope_cosrope_sinr   r   r    r)   &  s   	zMochiRoPE.forward)r   r   )NN)r*   r+   r,   rO   r@   r   r$   rA   r   r   r"   r   r   rQ   r)   r.   r   r   r   r    r      sJ    
	
r   c                       s   e Zd ZdZdZdgZddgZe						
							d(dedededededededB de	dedede	deddf fddZ
ed 		d)d!ejd"ejd#ejd$ejd ee	ef dB d%edejfd&d'Z  ZS )*MochiTransformer3DModelaB  
    A Transformer model for video-like data introduced in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

    Args:
        patch_size (`int`, defaults to `2`):
            The size of the patches to use in the patch embedding layer.
        num_attention_heads (`int`, defaults to `24`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each head.
        num_layers (`int`, defaults to `48`):
            The number of layers of Transformer blocks to use.
        in_channels (`int`, defaults to `12`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `None`):
            The number of channels in the output.
        qk_norm (`str`, defaults to `"rms_norm"`):
            The normalization layer to use.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        time_embed_dim (`int`, defaults to `256`):
            Output dimension of timestep embeddings.
        activation_fn (`str`, defaults to `"swiglu"`):
            Activation function to use in feed-forward.
        max_sequence_length (`int`, defaults to `256`):
            The maximum sequence length of text embeddings supported.
    TrR   patch_embedr   r         0         NrS         rT   
patch_sizerV   rW   
num_layersrX   in_channelsout_channelsrY   text_embed_dimtime_embed_dimrZ   max_sequence_lengthr<   c              	      s   t     |p|}t||d d| _t|	|
dd| _tt	dd fd| _
t | _t fddtD | _td	d
dd| _t|| | | _d	| _d S )N)r   r   	embed_dimpos_embed_type   )r1   rX   r   r   rV   r   r   g        c                    s*   g | ]}t  |d  kdqS )r=   )rI   rV   rW   rX   rY   rZ   r[   )rR   ).0irZ   rW   rd   rV   r   rX   rY   r   r    
<listcomp>~  s    

z4MochiTransformer3DModel.__init__.<locals>.<listcomp>FrU   
layer_norm)rD   r   	norm_type)r   r   r   r   r   
time_embedr5   	Parameterr$   fullr   r   rope
ModuleListrangetransformer_blocksr   norm_outr8   proj_outgradient_checkpointing)r   r   rV   rW   r   rX   r   r   rY   r   r   rZ   r   r   r   r    r   V  s@   


z MochiTransformer3DModel.__init__attention_kwargsr&   rr   timesteprt   return_dictc              	   C   sR  |j \}}}	}
}| jj}|
| }|| }| j||||jd\}}|ddddddd}| |}|d|dfdd}| j	| j
|	|||jtjd}t| jD ]#\}}t ri| jri| ||||||\}}qQ||||||d	\}}qQ| ||}| |}|||	||||d}|dd
ddddd}||d|	|
|}|s|fS t|dS )N)hidden_dtyper   r   r=   r   rG   r   r   )r&   rr   rs   rt   ru         )sample)shapeconfigr   r   r"   permuteflattenr   	unflattenr   r   r   r$   r%   	enumerater   is_grad_enabledr   _gradient_checkpointing_funcr   r   reshaper   )r   r&   rr   r   rt   r   r   
batch_sizenum_channelsr   r   r   ppost_patch_heightpost_patch_widthrs   ru   r   blockoutputr   r   r    r)     s\   


	
	

zMochiTransformer3DModel.forward)r   r   r   r   r   r   NrS   r   r   rT   r   )NT)r*   r+   r,   rO    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   r@   r   r   r   r$   rA   
LongTensordictr   rP   r)   r.   r   r   r   r    r   4  s~    	
@r   ),typingr   r$   torch.nnr5   configuration_utilsr   r   loadersr   loaders.single_file_modelr   utilsr   r	   utils.torch_utilsr
   	attentionr   attention_processorr   r   cache_utilsr   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr*   loggerModuler   r/   rB   rR   r   r   r   r   r   r    <module>   s0   
>