o
    GiM                     @   s  d dl Z d dlZd dlmZ d dlm  mZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZ eeZG dd dejZG dd dejZG dd dejZG dd dZ G dd dejZ!G dd deeZ"dS )    N   )ConfigMixinregister_to_config)logging   )	Attention)TimestepEmbedding	Timestepsget_2d_sincos_pos_embed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormRMSNormc                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	OmniGenFeedForwardhidden_sizeintermediate_sizec                    s@   t    tj|d| dd| _tj||dd| _t | _d S )Nr   Fbias)super__init__nnLineargate_up_proj	down_projSiLUactivation_fn)selfr   r   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_omnigen.pyr   "   s   
zOmniGenFeedForward.__init__hidden_statesreturnc                 C   s4   |  |}|jddd\}}|| | }| |S )Nr   dim)r   chunkr   r   )r   r!   	up_statesgater   r   r    forward)   s   

zOmniGenFeedForward.forward)	__name__
__module____qualname__intr   torchTensorr)   __classcell__r   r   r   r    r   !   s    r   c                       s   e Zd Z							dded	ed
ededededef fddZdd Zdej	dedej	fddZ
	ddej	dedej	dej	fddZ  ZS )OmniGenPatchEmbedr         T      @   
patch_sizein_channels	embed_dimr   interpolation_scalepos_embed_max_size	base_sizec           	         s   t    tj||||f||d| _tj||||f||d| _|| _|| _|| _t	|| j|| jdd}| j
d| ddd d S )N)kernel_sizestrider   pt)r<   r:   output_type	pos_embedr   T)
persistent)r   r   r   Conv2doutput_image_projinput_image_projr7   r:   r;   r
   register_bufferfloat	unsqueeze)	r   r7   r8   r9   r   r:   r;   r<   rA   r   r   r    r   1   s$   

zOmniGenPatchEmbed.__init__c                 C   s   | j du r	td|| j }|| j }|| j kr$td| d| j  d|| j kr5td| d| j  d| j | d }| j | d }| jd| j | j d	}|dd||| ||| ddf }|dd	|jd	 }|S )
z2Crops positional embeddings for SD3 compatibility.Nz.`pos_embed_max_size` must be set for cropping.zHeight (z/) cannot be greater than `pos_embed_max_size`: .zWidth (r   r4   r#   )r;   
ValueErrorr7   rA   reshapeshape)r   heightwidthtopleftspatial_pos_embedr   r   r    _cropped_pos_embedQ   s$   




(z$OmniGenPatchEmbed._cropped_pos_embedr!   is_input_imager"   c                 C   s0   |r|  |}n| |}|ddd}|S )Nr   r4   )rE   rD   flatten	transpose)r   r!   rS   r   r   r    _patch_embeddingsh   s
   
z#OmniGenPatchEmbed._patch_embeddingsNpadding_latentc           
      C   s   t |trM|d u rd gt| }g }t||D ]3\}}|jdd  \}}| ||}| ||}	||	 }|d urEtj||	|j
gdd}|| q|S |jdd  \}}| ||}	| ||}||	 }|S )Nr$   )
isinstancelistlenziprL   rV   rR   r.   cattodeviceappend)
r   r!   rS   rW   patched_latents
sub_latentpaddingrM   rN   rA   r   r   r    r)   p   s$   
zOmniGenPatchEmbed.forward)r   r2   r3   Tr4   r5   r6   )N)r*   r+   r,   r-   boolrG   r   rR   r.   r/   rV   r)   r0   r   r   r   r    r1   0   sF     	r1   c                       s(   e Zd Z	d	 fdd	Zdd Z  ZS )
OmniGenSuScaledRotaryEmbedding      '  Nc                    st   t    || _|| _|| _d| jtjd| jdtjd | j   }| j	d|dd |d | _
|d	 | _|| _d S )
N      ?r   r   )dtypeinv_freqF)tensorrB   short_factorlong_factor)r   r   r%   max_position_embeddingsbaser.   arangeint64rG   rF   rm   rn    original_max_position_embeddings)r   r%   ro   rs   rp   rope_scalingrk   r   r   r    r      s   
*


z'OmniGenSuScaledRotaryEmbedding.__init__c                 C   s  t |d }|| jkrt j| jt j|jd}nt j| jt j|jd}t jd| j	dt j
|jd | j	 }d|| j|   | _| jd d d d f  |jd dd}|d d d d d f  }|jj}t|tro|dkro|nd}t j|d	d
K | |  dd}	t j|	|	fddd }
| j| j }|dkrd}ntdt|t| j  }|
 | }|
 | }W d    ||fS 1 sw   Y  ||fS )Nr4   )rj   r_   r   r   ri   r#   mpscpuF)device_typeenabledr$   )r.   maxrs   rl   rn   float32r_   rm   rq   r%   rr   rG   rp   rk   expandrL   typerY   strautocastrU   r]   ro   mathsqrtlogcossin)r   r!   position_idsseq_lenext_factorsinv_freq_shapeinv_freq_expandedposition_ids_expandedrw   freqsembscalescaling_factorr   r   r   r   r    r)      s0   
"( 
z&OmniGenSuScaledRotaryEmbedding.forward)rf   rg   rh   N)r*   r+   r,   r   r)   r0   r   r   r   r    re      s    re   c                   @   sR   e Zd ZdZdd Z		ddedejdejdejdB d	ejdB d
ejfddZdS )OmniGenAttnProcessor2_0z
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
    used in the OmniGen model.
    c                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzPAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)hasattrFImportErrorr   r   r   r    r      s   
z OmniGenAttnProcessor2_0.__init__Nattnr!   encoder_hidden_statesattention_maskimage_rotary_embr"   c                 C   s  |j \}}}||}	||}
||}|	 \}}}|
j d }||j }|| }|	|d|j|dd}	|
|d||dd}
||d||dd}|d urgddlm	} ||	|dd}	||
|dd}
t
j|	|
||d}|dd|	}||||j}|jd |}|S )	Nr#   r4   r   )apply_rotary_embrX   )use_real_unbind_dim)	attn_maskr   )rL   to_qto_kto_vsizeheadsviewrU   
embeddingsr   r   r   type_asrK   out_dimto_out)r   r   r!   r   r   r   
batch_sizesequence_length_querykeyvaluebszq_len	query_dim	inner_dimhead_dimkv_headsr   r   r   r    __call__   s(   




z OmniGenAttnProcessor2_0.__call__)NN)	r*   r+   r,   __doc__r   r   r.   r/   r   r   r   r   r    r      s$    	r   c                       sX   e Zd Zdedededededdf fdd	Zd
ejdejdejdejfddZ  Z	S )OmniGenBlockr   num_attention_headsnum_key_value_headsr   rms_norm_epsr"   Nc                    sX   t    t||d| _t|||| ||d|dt d	| _t||d| _t||| _	d S )NepsF)	r   cross_attention_dimdim_headr   r   r   r   out_bias	processor)
r   r   r   input_layernormr   r   	self_attnpost_attention_layernormr   mlp)r   r   r   r   r   r   r   r   r    r      s   
zOmniGenBlock.__init__r!   r   r   c                 C   sD   |  |}| j||||d}|| }| |}| |}|| }|S )N)r!   r   r   r   )r   r   r   r   )r   r!   r   r   norm_hidden_statesattn_output	ff_outputr   r   r    r)     s   


zOmniGenBlock.forward)
r*   r+   r,   r-   rG   r   r.   r/   r)   r0   r   r   r   r    r      s.    r   c                (       s4  e Zd ZdZdZdgZg dZe									
													d7dededede	dededededededed ed!ed"e
d#ed$ed%ed&ed'ef& fd(d)Zd*ejd+eej d,e
d-ejdB fd.d/Z	d8d0ejd1ee	B ejB d*ejd+eej d,e
eee f d2ejd3ejd4ed-eeej B fd5d6Z  ZS )9OmniGenTransformer2DModelaF  
    The Transformer model introduced in OmniGen (https://huggingface.co/papers/2409.11340).

    Parameters:
        in_channels (`int`, defaults to `4`):
            The number of channels in the input.
        patch_size (`int`, defaults to `2`):
            The size of the spatial patches to use in the patch embedding layer.
        hidden_size (`int`, defaults to `3072`):
            The dimensionality of the hidden layers in the model.
        rms_norm_eps (`float`, defaults to `1e-5`):
            Eps for RMSNorm layer.
        num_attention_heads (`int`, defaults to `32`):
            The number of heads to use for multi-head attention.
        num_key_value_heads (`int`, defaults to `32`):
            The number of heads to use for keys and values in multi-head attention.
        intermediate_size (`int`, defaults to `8192`):
            Dimension of the hidden layer in FeedForward layers.
        num_layers (`int`, default to `32`):
            The number of layers of transformer blocks to use.
        pad_token_id (`int`, default to `32000`):
            The id of the padding token.
        vocab_size (`int`, default to `32064`):
            The size of the vocabulary of the embedding vocabulary.
        rope_base (`int`, default to `10000`):
            The default theta value to use when creating RoPE.
        rope_scaling (`dict`, optional):
            The scaling factors for the RoPE. Must contain `short_factor` and `long_factor`.
        pos_embed_max_size (`int`, default to `192`):
            The maximum size of the positional embeddings.
        time_step_dim (`int`, default to `256`):
            Output dimension of timestep embeddings.
        flip_sin_to_cos (`bool`, default to `True`):
            Whether to flip the sin and cos in the positional embeddings when preparing timestep embeddings.
        downscale_freq_shift (`int`, default to `0`):
            The frequency shift to use when downscaling the timestep embeddings.
        timestep_activation_fn (`str`, default to `silu`):
            The activation function to use for the timestep embeddings.
    Tr   )patch_embeddingembed_tokensnormr2   r      h㈵>         }  @}  rf   rg   rh   Nr5      r   silur8   r7   r   r   r   r   r   
num_layerspad_token_id
vocab_sizero   rs   	rope_basert   r;   time_step_dimflip_sin_to_cosdownscale_freq_shifttimestep_activation_fnc                    s   t    || _|| _t|| |d| _t|||| _t| || _	t| || _
t|
 |	| _t  ||||d| _t fddt|D | _t d| _t dddd	| _tj || | j d
d| _d| _d S )N)r7   r8   r9   r;   )ro   rs   rp   rt   c                    s   g | ]
}t  qS r   )r   ).0r   r   r   r   r   r   r   r    
<listcomp>x  s    z6OmniGenTransformer2DModel.__init__.<locals>.<listcomp>r   Fgư>r4   )norm_elementwise_affinenorm_eps	chunk_dimTr   )r   r   r8   out_channelsr1   r   r	   	time_projr   
time_token
t_embedderr   	Embeddingr   re   rope
ModuleListrangelayersr   r   r   norm_outr   proj_outgradient_checkpointing)r   r8   r7   r   r   r   r   r   r   r   r   ro   rs   r   rt   r;   r   r   r   r   r   r   r    r   H  s:   

z"OmniGenTransformer2DModel.__init__	input_idsinput_img_latentsinput_image_sizesr"   c           
         s   |d u rd S  fdd|D }  |}d} j|dd}| D ]}|| D ]\}}	|| |j||||	f< |d7 }q'q!|S )Nc                    s   g | ]}|  jqS r   )r^   rj   )r   xr   r   r    r     s    zHOmniGenTransformer2DModel._get_multimodal_embeddings.<locals>.<listcomp>r   TrS   r4   )r   r   keysr^   rj   )
r   r   r   r   condition_tokensinput_img_inxinput_image_tokensb_inx	start_inxend_inxr   r   r    _get_multimodal_embeddings  s   

z4OmniGenTransformer2DModel._get_multimodal_embeddingsr!   timestepr   r   return_dictc	                 C   s  |j \}	}
}}| jj}|| || }}| j|dd}|d}| ||}| |d}| 	|}| 
|||}|d urKtj|||gdd}n	tj||gdd}|d}|d| }|d ur| dkr|j}t|j}d| | }|d|}| ||}| jD ]}t r| jr| ||||}q||||d}q| |}|d d | d f }| j||d}| |}||	||||d}|d	d
dddddd
dd}|s|fS t|dS )NFr   r4   r$   r#   r   )r   r   )tembr      r   r2   )sample)rL   configr7   r   r   r   r   r   rH   r   r   r.   r]   r   longr%   rj   finfominr   r   is_grad_enabledr   _gradient_checkpointing_funcr   r   r   rK   permuterT   r   )r   r!   r   r   r   r   r   r   r   r   num_channelsrM   rN   ppost_patch_heightpost_patch_widthnum_tokens_for_output_imagetimestep_projr   r   r   
seq_lengthrj   	min_dtyper   blockoutputr   r   r    r)     sF   





$
z!OmniGenTransformer2DModel.forward)r2   r   r   r   r   r   r   r   r   r   rf   rg   rh   Nr5   r   Tr   r   )T)r*   r+   r,   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   r-   rG   dictrd   r}   r   r.   r/   rZ   r   FloatTensorr   tupler)   r0   r   r   r   r    r     s    (	
;
	
r   )#r   r.   torch.nnr   torch.nn.functional
functionalr   configuration_utilsr   r   utilsr   attention_processorr   r   r   r	   r
   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr*   loggerModuler   r1   re   r   r   r   r   r   r   r    <module>   s$   
X32.