o
    Gi2                     @   s   d dl Z d dlmZ ddlmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ eeZG dd dejZG dd dee	eZdS )    N   )ConfigMixinregister_to_config)logging   )AttentionMixinFeedForward)	AttentionCogVideoXAttnProcessor2_0)&CogView3CombinedTimestepSizeEmbeddingsCogView3PlusPatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuous%CogView3PlusAdaLayerNormZeroTextImagec                
       sh   e Zd ZdZ				ddededed	ef fd
dZdejdejdejdeejejf fddZ	  Z
S )CogView3PlusTransformerBlocka  
    Transformer block used in [CogView](https://github.com/THUDM/CogView3) model.

    Args:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        time_embed_dim (`int`):
            The number of channels in timestep embedding.
     
  @   (      dimnum_attention_headsattention_head_dimtime_embed_dimc                    sn   t    t||d| _t||||ddddt d	| _tj|ddd| _	tj|ddd| _
t||d	d
| _d S )N)embedding_dimr   T
layer_normFư>)		query_dimheadsdim_headout_dimbiasqk_normelementwise_affineeps	processorgh㈵>)r#   r$   zgelu-approximate)r   dim_outactivation_fn)super__init__r   norm1r	   r
   attn1nn	LayerNormnorm2norm2_contextr   ff)selfr   r   r   r   	__class__ j/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_cogview3plus.pyr)   /   s    
z%CogView3PlusTransformerBlock.__init__hidden_statesencoder_hidden_statesembreturnc              
   C   s^  | d}| |||\
}}}}}	}
}}}}| j||
d\}}||d|  }||d|  }| |}|d|d d d f   |d d d f  }| |}
|
d|d d d f   |d d d f  }
tj|
|gdd}| |}||	d|d d |d f   }||d|d d d |f   }|j	tj
kr|dd}|j	tj
kr|dd}||fS )N   )r6   r7   )r   i  i  )sizer*   r+   	unsqueezer.   r/   torchcatr0   dtypefloat16clip)r1   r6   r7   r8   text_seq_lengthnorm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpnorm_encoder_hidden_states
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattn_hidden_statesattn_encoder_hidden_states	ff_outputr4   r4   r5   forwardK   s>   


(
(
""z$CogView3PlusTransformerBlock.forward)r   r   r   r   )__name__
__module____qualname____doc__intr)   r=   TensortuplerP   __classcell__r4   r4   r2   r5   r       s0    r   c                       s   e Zd ZdZdZddgZddgZe					
							d'dededededededededededef fddZ		d(de
jde
jde
jd e
jd!e
jd"e
jd#ed$ee
j eB fd%d&Z  ZS ))CogView3PlusTransformer2DModela  
    The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay
    Diffusion](https://huggingface.co/papers/2403.05121).

    Args:
        patch_size (`int`, defaults to `2`):
            The size of the patches to use in the patch embedding layer.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        num_layers (`int`, defaults to `30`):
            The number of layers of Transformer blocks to use.
        attention_head_dim (`int`, defaults to `40`):
            The number of channels in each head.
        num_attention_heads (`int`, defaults to `64`):
            The number of heads to use for multi-head attention.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        time_embed_dim (`int`, defaults to `512`):
            Output dimension of timestep embeddings.
        condition_dim (`int`, defaults to `256`):
            The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
            crop_coords).
        pos_embed_max_size (`int`, defaults to `128`):
            The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
            to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
            means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
            patch_size => 128 * 8 * 2 => 2048`.
        sample_size (`int`, defaults to `128`):
            The base resolution of input latents. If height/width is not provided during generation, this value is used
            to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
    Tpatch_embednormr   r   r         r   r      r         
patch_sizein_channels
num_layersr   r   out_channelstext_embed_dimr   condition_dimpos_embed_max_sizesample_sizec                    s   t    |_  _d|	 _t|j|||
d_t|	jjd_t	
 fddt|D _tjddd_t	jj|| j d	d
_d_d S )N   )rb   hidden_sizera   text_hidden_sizerg   )r   rf   pooled_projection_dimtimesteps_dimc                    s   g | ]}t j d qS ))r   r   r   r   )r   	inner_dim).0_r   r   r1   r   r4   r5   
<listcomp>   s    z;CogView3PlusTransformer2DModel.__init__.<locals>.<listcomp>Fr   )r   conditioning_embedding_dimr#   r$   T)r!   )r(   r)   rd   rn   rl   r   rZ   r   time_condition_embedr,   
ModuleListrangetransformer_blocksr   norm_outLinearproj_outgradient_checkpointing)r1   ra   rb   rc   r   r   rd   re   r   rf   rg   rh   r2   rq   r5   r)      s<   



z'CogView3PlusTransformer2DModel.__init__r6   r7   timesteporiginal_sizetarget_sizecrop_coordsreturn_dictr9   c                 C   s@  |j dd \}}	|j d }
| ||}| |||||j}|ddd|
f }|dd|
df }t| jD ]\}}t rM| jrM| 	||||\}}q7||||d\}}q7| 
||}| |}| jj}|| }|	| }	|j|j d ||	| j||fd}td|}|j|j d | j|| |	| fd}|s|fS t|dS )	a  
        The [`CogView3PlusTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor`):
                Input `hidden_states` of shape `(batch size, channel, height, width)`.
            encoder_hidden_states (`torch.Tensor`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape
                `(batch_size, sequence_len, text_embed_dim)`
            timestep (`torch.LongTensor`):
                Used to indicate denoising step.
            original_size (`torch.Tensor`):
                CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            target_size (`torch.Tensor`):
                CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crop_coords (`torch.Tensor`):
                CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]:
                The denoised latents using provided inputs as conditioning.
        Nr:   )r6   r7   r8   r   )shapeznhwcpq->nchpwq)sample)r   rZ   rt   r?   	enumeraterw   r=   is_grad_enabledr{   _gradient_checkpointing_funcrx   rz   configra   reshaperd   einsumr   )r1   r6   r7   r|   r}   r~   r   r   heightwidthrB   r8   index_blockblockra   outputr4   r4   r5   rP      sH   &



z&CogView3PlusTransformer2DModel.forward)r   r\   r]   r   r   r\   r^   r   r_   r`   r`   )T)rQ   rR   rS   rT    _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modulesr   rU   r)   r=   rV   
LongTensorboolrW   r   rP   rX   r4   r4   r2   r5   rY   ~   st    "	
C	rY   )r=   torch.nnr,   configuration_utilsr   r   utilsr   	attentionr   r   attention_processorr	   r
   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrQ   loggerModuler   rY   r4   r4   r4   r5   <module>   s   
^