o
    Gi                     @   s`  d dl mZ d dlZd dlmZ d dlm  mZ ddlm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z  e!e"Z#G dd dej$Z%G dd dej$Z&G dd dZ'G dd dZ(eG dd dej$Z)G dd dej$Z*G dd dej$Z+G dd dee	eeZ,dS )     )AnyN   )ConfigMixinregister_to_config)PeftAdapterMixin)apply_lora_scalelogging)maybe_allow_in_graph   )FeedForward)	Attention)
CacheMixin)&CogView3CombinedTimestepSizeEmbeddings)Transformer2DModelOutput)
ModelMixin)	LayerNormRMSNormc                	       sT   e Zd Z				ddedededef fd	d
ZdejdejdejfddZ  ZS )CogView4PatchEmbed    
  r
      in_channelshidden_size
patch_sizetext_hidden_sizec                    s8   t    || _t||d  || _t||| _d S )Nr
   )super__init__r   nnLinearproj	text_proj)selfr   r   r   r   	__class__ f/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_cogview4.pyr   &   s   
zCogView4PatchEmbed.__init__hidden_statesencoder_hidden_statesreturnc           	      C   sz   |j \}}}}|| j }|| j }||||| j|| j}|dddddddddd}| |}| |}||fS )Nr   r
         r      )shaper   reshapepermuteflattenr   r    )	r!   r&   r'   
batch_sizechannelheightwidthpost_patch_heightpost_patch_widthr$   r$   r%   forward3   s   

$

zCogView4PatchEmbed.forward)r   r   r
   r   )	__name__
__module____qualname__intr   torchTensorr6   __classcell__r$   r$   r"   r%   r   %   s    $r   c                
       sV   e Zd Zdededdf fddZdejdejd	ejdeejejf fd
dZ  Z	S )CogView4AdaLayerNormZeroembedding_dimdimr(   Nc                    sH   t    tj|ddd| _tj|ddd| _tj|d| dd| _d S )NFh㈵>elementwise_affineeps   Tbias)r   r   r   r   normnorm_contextr   linear)r!   r?   r@   r"   r$   r%   r   C   s   
z!CogView4AdaLayerNormZero.__init__r&   r'   tembc                 C   s   |j }| |j|d}| |j|d}| |}|jddd\}}	}
}}}}}}}}}|d|
d  |d }|d|d  |	d }||||||||||f
S )NdtyperE   r*   r@   )rM   rH   torI   rJ   chunk	unsqueeze)r!   r&   r'   rK   rM   norm_hidden_statesnorm_encoder_hidden_statesemb	shift_msac_shift_msa	scale_msac_scale_msagate_msa
c_gate_msa	shift_mlpc_shift_mlp	scale_mlpc_scale_mlpgate_mlp
c_gate_mlpr$   r$   r%   r6   J   s>   
z CogView4AdaLayerNormZero.forward)
r7   r8   r9   r:   r   r;   r<   tupler6   r=   r$   r$   r"   r%   r>   B   s    r>   c                   @   sf   e Zd ZdZdd Z		ddedejdejdejdB d	eejejf dB d
eejejf fddZ	dS )CogView4AttnProcessora  
    Processor for implementing scaled dot-product attention for the CogView4 model. It applies a rotary embedding on
    query and key vectors, but does not include spatial normalization.

    The processor supports passing an attention mask for text tokens. The attention mask should have shape (batch_size,
    text_seq_length) where 1 indicates a non-padded token and 0 indicates a padded token.
    c                 C      t tds	tdd S Nscaled_dot_product_attentionzUCogView4AttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.hasattrFImportErrorr!   r$   r$   r%   r   {      
zCogView4AttnProcessor.__init__Nattnr&   r'   attention_maskimage_rotary_embr(   c                 C   sz  |j }|j\}}}	|j\}}
}	tj||gdd}||}||}||}|d|jdf	dd}|d|jdf	dd}|d|jdf	dd}|j
d ur\|
|j|d}|jd urj||j|d}|d urddlm} ||d d d d |d d d f |dd|d d d d |d d d f< ||d d d d |d d d f |dd|d d d d |d d d f< |d ur|}| dksJ d	| |j}tj|||
 f|jd
}||d d d |f< |d}||	dd }|dkd|j }tj||||ddd}|	dddd}||}|jd |}|jd |}|j||d| gdd\}}||fS )Nr*   rN   r
   rL   apply_rotary_embuse_real_unbind_dimCthe shape of text_attn_mask should be (batch_size, text_seq_length)devicer           F	attn_mask	dropout_p	is_causalr   )rM   r,   r;   catto_qto_kto_v	unflattenheads	transposenorm_qrO   norm_k
embeddingsrq   r@   floatrw   onesrQ   rh   re   r/   type_asto_outsplitsize)r!   rl   r&   r'   rm   rn   rM   r0   text_seq_length	embed_dimimage_seq_lengthquerykeyvaluerq   text_attn_maskmix_attn_maskattn_mask_matrixr$   r$   r%   __call__   sT   




""""



zCogView4AttnProcessor.__call__)NN)
r7   r8   r9   __doc__r   r   r;   r<   ra   r   r$   r$   r$   r%   rb   r   s$    	rb   c                   @   s   e Zd ZdZdd Z				ddedejdejdejdB d	ejdB d
ejdB deejejf e	eejejf  B dB deejejf fddZ
dS )CogView4TrainingAttnProcessora  
    Training Processor for implementing scaled dot-product attention for the CogView4 model. It applies a rotary
    embedding on query and key vectors, but does not include spatial normalization.

    This processor differs from CogView4AttnProcessor in several important ways:
    1. It supports attention masking with variable sequence lengths for multi-resolution training
    2. It unpacks and repacks sequences for efficient training with variable sequence lengths when batch_flag is
       provided
    c                 C   rc   rd   rf   rj   r$   r$   r%   r      rk   z&CogView4TrainingAttnProcessor.__init__Nrl   r&   r'   latent_attn_maskr   
batch_flagrn   r(   c           1   	      s  |j \}	}
}|j \}	}}|j}|j}|}tj||gdd}|du r,tj|	|
ftj|d}|du r;tj|	|ftj|d}| dksEJ d|jtjksOJ d| dksYJ d|jtjkscJ d	tj|	|
| ftj|d|ddd|
f< |dd|
df< dj	|d
}||
dd } dur;  dksJ t  d }tj|dd}
tj|dd}|
| } fddt|D }t||ksJ dd}|dd}||dk }t||j d ksJ t||}tjjjj|dddd}|j d }tj|||f||d}t|D ]%\}}| |k }d} |D ]}!d|| | |! | | |! f< | |!7 } q#q|j	tjd
}|d}|}" du rXtj||gdd}n|}||}#||}$||}%|#d|jdf
dd}#|$d|jdf
dd}$|%d|jdf
dd}%|jdur||#j	|d
}#|jdur||$j	|d
}$|durddl m!}&  du r|&|#dddd|
dddf |dd|#dddd|
dddf< |&|$dddd|
dddf |dd|$dddd|
dddf< n|#j d |ksJ |$j d |ksJ t||	ks"J d}'t|D ]z}d} |
 |k }(| |k })t"|(|)D ]c\}*}+|*|+ },|&|#|dd| |* | |, ddf ||' dd|#|dd| |* | |, ddf< |&|$|dd| |* | |, ddf ||' dd|$|dd| |* | |, ddf< | |,7 } |'d7 }'q=q(t#j$|#|$|%|"ddd}|
dddd}|%|#}|j&d |}|j&d |} du r|j|
|'d|
 gdd\}}||fS tjjjj(|t)|dd}-tj|-dd}.t|.|* }/t|/|	ks
J dd t"|/|
|D }/dd |/D }0dd |/D }-t|	D ]}|0| || || dk< |-| || || dk< q'|}||fS )a  
        Args:
            attn (`Attention`):
                The attention module.
            hidden_states (`torch.Tensor`):
                The input hidden states.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states for cross-attention.
            latent_attn_mask (`torch.Tensor`, *optional*):
                Mask for latent tokens where 0 indicates pad token and 1 indicates non-pad token. If None, full
                attention is used for all latent tokens. Note: the shape of latent_attn_mask is (batch_size,
                num_latent_tokens).
            text_attn_mask (`torch.Tensor`, *optional*):
                Mask for text tokens where 0 indicates pad token and 1 indicates non-pad token. If None, full attention
                is used for all text tokens.
            batch_flag (`torch.Tensor`, *optional*):
                Values from 0 to n-1 indicating which samples belong to the same batch. Samples with the same
                batch_flag are packed together. Example: [0, 1, 1, 2, 2] means sample 0 forms batch0, samples 1-2 form
                batch1, and samples 3-4 form batch2. If None, no packing is used.
            image_rotary_emb (`tuple[torch.Tensor, torch.Tensor]` or `list[tuple[torch.Tensor, torch.Tensor]]`, *optional*):
                The rotary embedding for the image part of the input.
        Returns:
            `tuple[torch.Tensor, torch.Tensor]`: The processed hidden states for both image and text streams.
        r*   rN   N)rM   rw   r
   ru   z1the dtype of text_attn_mask should be torch.int32zGthe shape of latent_attn_mask should be (batch_size, num_latent_tokens)z3the dtype of latent_attn_mask should be torch.int32rL   c                    s"   g | ]}t  |k  qS r$   )r;   sumitem).0	batch_idxr   mixed_attn_maskr$   r%   
<listcomp>"  s    z:CogView4TrainingAttnProcessor.__call__.<locals>.<listcomp>r   Trx   right)batch_firstpadding_valuepadding_sidero   rp   rr   rs   Fry   r   )lengthsr   c                 S   s"   g | ]\}}}t |||gqS r$   )r;   r   )r   htlenllenr$   r$   r%   r     s    c                 S      g | ]}|d  qS )r   r$   r   r   r$   r$   r%   r         c                 S   r   )r*   r$   r   r$   r$   r%   r     r   )+r,   rM   rw   r;   r}   r   int32r@   rQ   rO   r   maxr   r   rangelenr/   r   r   utilsrnnpad_sequencezeros	enumerateboolr~   r   r   r   r   r   r   r   rq   ziprh   re   r   r   r   unpad_sequencetensortolist)1r!   rl   r&   r'   r   r   r   rn   kwargsr0   r   r   r   rM   rw   latent_hidden_statesmixed_hidden_statesmixed_attn_mask_inputr   packing_batch_sizelatent_seq_lengthmixed_seq_lengthmixed_seq_length_packedmixed_attn_mask_flattenmixed_hidden_states_flattenmixed_hidden_states_unpadmixed_hidden_states_packed!mixed_hidden_states_packed_paddedlidxmaskseq_lengthsoffsetlengthrm   r   r   r   rq   rope_idxtext_seq_length_bilatent_seq_length_bir   r   mlenhidden_states_unpadhidden_states_flattenhidden_states_unpackencoder_hidden_states_unpadr$   r   r%   r      s
  %









"""$"&"&


$

z&CogView4TrainingAttnProcessor.__call__NNNN)r7   r8   r9   r   r   r   r;   r<   ra   listr   r$   r$   r$   r%   r      s0    
	&
r   c                       s   e Zd Z				ddedededed	d
f
 fddZ	
	
	
	
ddejdejdejd
B deejejf eeejejf  B d
B de	e
ejf d
B de	e
ef d
B d	eejejf fddZ  ZS )CogView4TransformerBlockr   @   (      r@   num_attention_headsattention_head_dimtime_embed_dimr(   Nc                    sl   t    t||| _t||||ddddt d	| _tj|ddd| _	tj|ddd| _
t||dd| _d S )	NT
layer_normFrA   )		query_dimr   dim_headout_dimrG   qk_normrC   rD   	processorrB   zgelu-approximate)r@   dim_outactivation_fn)r   r   r>   norm1r   rb   attn1r   r   norm2norm2_contextr   ff)r!   r@   r   r   r   r"   r$   r%   r     s    
z!CogView4TransformerBlock.__init__r&   r'   rK   rn   rm   attention_kwargsc              
   C   s   |  |||\
}}}	}
}}}}}}|d u ri }| jd||||d|\}}|||d  }|||d  }| |d|
d  |	d }| |d|d  |d }| |}| |}|||d  }|||d  }||fS )N)r&   r'   rn   rm   r*   r$   )r   r   rQ   r   r   r   )r!   r&   r'   rK   rn   rm   r   rR   rY   r[   r]   r_   rS   rZ   r\   r^   r`   attn_hidden_statesattn_encoder_hidden_states	ff_outputff_output_contextr$   r$   r%   r6     sF   
"

z CogView4TransformerBlock.forward)r   r   r   r   r   )r7   r8   r9   r:   r   r;   r<   ra   r   dictstrr   r6   r=   r$   r$   r"   r%   r     sF     &r   c                       s\   e Zd Zddededeeef deddf
 fdd	Zd
ejdeejejf fddZ	  Z
S )CogView4RotaryPosEmbed     @r@   r   rope_axes_dimthetar(   Nc                    s&   t    || _|| _|| _|| _d S )N)r   r   r@   r   r   r   )r!   r@   r   r   r   r"   r$   r%   r     s
   

zCogView4RotaryPosEmbed.__init__r&   c                 C   s  |j \}}}}|| j || j }}| jd | jd }}d| jtjd|dtjdd |d   |   }d| jtjd|dtjdd |d   |   }	t| jd }
t| jd }t	|
|}t	||	}tj||j
d}tj||j
d}|| jd  | }|| jd  | }|| }|| }|d}|d}|||d}|||d}tj||gdd}tj||gdd}||| d}| | fS )	Nr
   g      ?r   rL   r*   rv   ro   rN   )r,   r   r@   r   r;   arangefloat32r   r   outerrw   rQ   expandr}   r-   cossin)r!   r&   r0   num_channelsr2   r3   dim_hdim_w
h_inv_freq
w_inv_freqh_seqw_seqfreqs_hfreqs_wh_idxw_idxinner_h_idxinner_w_idxfreqsr$   r$   r%   r6      s6   ,,

zCogView4RotaryPosEmbed.forward)r   )r7   r8   r9   r:   ra   r   r   r;   r<   r6   r=   r$   r$   r"   r%   r     s    ,(r   c                       s`   e Zd ZdZ				ddedededed	ed
ef fddZde	j
de	j
de	j
fddZ  ZS )CogView4AdaLayerNormContinuousz
    CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine. Matches Megatron: **no activation** before the
    Linear on conditioning embedding.
    TrA   r   r?   conditioning_embedding_dimrC   rD   rG   	norm_typec                    sd   t    tj||d |d| _|dkrt||||| _d S |dkr+t|||| _d S td| )Nr
   rF   r   rms_normzunknown norm_type )	r   r   r   r   rJ   r   rH   r   
ValueError)r!   r?   r  rC   rD   rG   r  r"   r$   r%   r   M  s   
	z'CogView4AdaLayerNormContinuous.__init__xconditioning_embeddingr(   c                 C   sd   |  ||j}tj|ddd\}}| |d| d d d d d f  |d d d d d f  }|S )Nr
   r*   rN   )rJ   rO   rM   r;   rP   rH   )r!   r  r  rT   scaleshiftr$   r$   r%   r6   _  s   :z&CogView4AdaLayerNormContinuous.forward)TrA   Tr   )r7   r8   r9   r   r:   r   r   r   r   r;   r<   r6   r=   r$   r$   r"   r%   r  G  s(    	$r  c                       s   e Zd ZdZdZg dZg dZe								
					d+dededededededededededede	eef f fddZ
ed				d,dejd ejd!ejd"ejd#ejd$ejdeeef dB d%ed&ejdB d'e	ejejf ee	ejejf  B dB d(e	ej eB fd)d*Z  ZS )-CogView4Transformer2DModela  
    Args:
        patch_size (`int`, defaults to `2`):
            The size of the patches to use in the patch embedding layer.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        num_layers (`int`, defaults to `30`):
            The number of layers of Transformer blocks to use.
        attention_head_dim (`int`, defaults to `40`):
            The number of channels in each head.
        num_attention_heads (`int`, defaults to `64`):
            The number of heads to use for multi-head attention.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        time_embed_dim (`int`, defaults to `512`):
            Output dimension of timestep embeddings.
        condition_dim (`int`, defaults to `256`):
            The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
            crop_coords).
        pos_embed_max_size (`int`, defaults to `128`):
            The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
            to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
            means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
            patch_size => 128 * 8 * 2 => 2048`.
        sample_size (`int`, defaults to `128`):
            The base resolution of input latents. If height/width is not provided during generation, this value is used
            to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
    T)r   r   r   )patch_embedrH   proj_outr
   r      r   r   r   r         r  r  r   r   out_channels
num_layersr   r   text_embed_dimr   condition_dimpos_embed_max_sizesample_sizer   c                    s   t    d|	 }  |}t ||dd| _t|||| _t|	|d| _t	 fddt
|D | _tdd| _tj|| | d	d
| _d| _d S )N   r   )r   )r?   r  pooled_projection_dimtimesteps_dimc                    s   g | ]	}t  qS r$   )r   )r   _r   	inner_dimr   r   r$   r%   r     s    z7CogView4Transformer2DModel.__init__.<locals>.<listcomp>F)rC   TrF   )r   r   r   roper   r  r   time_condition_embedr   
ModuleListr   transformer_blocksr  norm_outr   r  gradient_checkpointing)r!   r   r   r  r  r   r   r  r   r  r  r   r   r"  r"   r%  r%   r     s(   

z#CogView4Transformer2DModel.__init__r   Nr&   r'   timesteporiginal_sizetarget_sizecrop_coordsreturn_dictrm   rn   r(   c              
   C   s  |j \}}}}|
d u r| |}
| jj}|| }|| }| ||\}}| |||||j}t|}| j	D ]"}t
 rM| jrM| |||||
|	|\}}q6|||||
|	|\}}q6| ||}| |}||||d||}|dddddddddd}|s|fS t|dS )	Nro   r   r   r*   r)   r
   r+   )sample)r,   r'  configr   r  r(  rM   rh   silur*  r;   is_grad_enabledr,  _gradient_checkpointing_funcr+  r  r-   r.   r/   r   )r!   r&   r'   r-  r.  r/  r0  r   r1  rm   rn   r0   r   r2   r3   pr4   r5   rK   blockoutputr$   r$   r%   r6     sF   







$
z"CogView4Transformer2DModel.forward)r
   r   r   r  r   r   r   r   r  r  r  r  )NTNN)r7   r8   r9   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   r:   ra   r   r   r;   r<   
LongTensorr   r   r   r   r   r   r6   r=   r$   r$   r"   r%   r  g  s    	

2		
&r  )-typingr   r;   torch.nnr   torch.nn.functional
functionalrh   configuration_utilsr   r   loadersr   r   r   r   utils.torch_utilsr	   	attentionr   attention_processorr   cache_utilsr   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr7   loggerModuler   r>   rb   r   r   r   r  r  r$   r$   r$   r%   <module>   s6   
0P  O0 