o
    ۷ixp                     @   s  d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dl	m  m
Z ddlmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ e rZd dlmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, e-e.Z/dej0dej1dej1fddZ2dej0dej1dej1fddZ3dej0dej1de4dej1fddZ5	d2de6d e4de4d!e4d"e4dej0fd#d$Z7e2e3e5d%Z8eG d&d' d'eZ9G d(d) d)ej0Z:G d*d+ d+ej0Z;G d,d- d-ej0Z<G d.d/ d/ej0Z=G d0d1 d1e(e"e*eZ>dS )3    )	dataclass)sqrt)AnyN   )ConfigMixinregister_to_config)
BaseOutputlogging)apply_forward_hook)is_transformers_available)randn_tensor)Dinov2WithRegistersConfigDinov2WithRegistersModelSiglipVisionConfigSiglipVisionModelViTMAEConfigViTMAEModel   )get_activation)AttentionMixin)	Attention)get_2d_sincos_pos_embed)
ModelMixin   )AutoencoderMixinDecoderOutputEncoderOutputmodelimagesreturnc                 C   s&   | |dd}d}|j d d |d f S )NT)output_hidden_states   last_hidden_state)r   r   outputsunused_token_num r&   c/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/autoencoders/autoencoder_rae.py_dinov2_encoder_forward:   s   r(   c                 C   s   | |ddd}|j S )NT)r    interpolate_pos_encodingr"   )r   r   r$   r&   r&   r'   _siglip2_encoder_forward@   s   r*   
patch_sizec                 C   s   |j d |j d }}t|| |d  }||d  || kr#tdt|d|j d d|j|j	}| ||dd}|j
d d dd f S )	Nr   r   z-Image size should be divisible by patch size.r   T)r)   r   )shapeint
ValueErrortorcharange	unsqueezeexpandtodevicedtyper#   )r   r   r+   hw	patch_numnoiser$   r&   r&   r'   _mae_encoder_forwardE   s   .r;   @   encoder_typehidden_sizenum_hidden_layershead_dimc                 C   s   || }| dkrt ||d||d}t|}d|j_d|j_n?| dkr:t||d||d}t|}d|jj_d|jj_n#| dkrUt	||d||d	d
}t
|}d|j_d|j_ntd|  d|d |S )z<Build a frozen encoder from config (no pretrained download).dinov2i  )r>   r+   
image_sizenum_attention_headsr?   Nsiglip2   mae           )r>   r+   rB   rC   r?   
mask_ratioUnknown encoder_type='z"'. Available: dinov2, siglip2, maeF)r   r   	layernormweightbiasr   r   vision_modelpost_layernormr   r   r/   requires_grad_)r=   r>   r+   r?   r@   rC   configr   r&   r&   r'   _build_encoderT   sL   



rR   )rA   rD   rF   c                   @   s   e Zd ZU dZejed< dS )RAEDecoderOutputz
    Output of `RAEDecoder`.

    Args:
        logits (`torch.Tensor`):
            Patch reconstruction logits of shape `(batch_size, num_patches, patch_size**2 * num_channels)`.
    logitsN)__name__
__module____qualname____doc__r0   Tensor__annotations__r&   r&   r&   r'   rS      s   
 rS   c                       sB   e Zd Zddededef fddZdejdejfd	d
Z  Z	S )ViTMAEIntermediategelur>   intermediate_size
hidden_actc                    s&   t    t||| _t|| _d S N)super__init__nnLineardenser   intermediate_act_fn)selfr>   r]   r^   	__class__r&   r'   ra      s   
zViTMAEIntermediate.__init__hidden_statesr   c                 C   s   |  |}| |}|S r_   )rd   re   )rf   ri   r&   r&   r'   forward   s   

zViTMAEIntermediate.forward)r\   )
rU   rV   rW   r.   strra   r0   rY   rj   __classcell__r&   r&   rg   r'   r[      s    r[   c                       sH   e Zd Zddededef fddZdejdejd	ejfd
dZ  Z	S )ViTMAEOutputrH   r>   r]   hidden_dropout_probc                    s(   t    t||| _t|| _d S r_   )r`   ra   rb   rc   rd   Dropoutdropout)rf   r>   r]   rn   rg   r&   r'   ra      s   
zViTMAEOutput.__init__ri   input_tensorr   c                 C   s    |  |}| |}|| }|S r_   )rd   rp   )rf   ri   rq   r&   r&   r'   rj      s   

zViTMAEOutput.forward)rH   )
rU   rV   rW   r.   floatra   r0   rY   rj   rl   r&   r&   rg   r'   rm      s    $rm   c                       sf   e Zd ZdZdddddddeded	ed
ededededef fddZde	j
de	j
fddZ  ZS )ViTMAELayerz^
    This matches the naming/parameter structure used in RAE-main (ViTMAE decoder block).
    T-q=rH   r\   )qkv_biaslayer_norm_epsrn   attention_probs_dropout_probr^   r>   rC   r]   ru   rv   rn   rw   r^   c          	         s   t    || dkrtd| d| t|||| ||d| _t|||d| _t|||d| _t	j
||d| _t	j
||d| _d S )Nr   zhidden_size=z* must be divisible by num_attention_heads=)	query_dimheadsdim_headrp   rM   )r>   r]   r^   )r>   r]   rn   eps)r`   ra   r/   r   	attentionr[   intermediaterm   outputrb   	LayerNormlayernorm_beforelayernorm_after)	rf   r>   rC   r]   ru   rv   rn   rw   r^   rg   r&   r'   ra      s(   
zViTMAELayer.__init__ri   r   c                 C   s<   |  | |}|| }| |}| |}| ||}|S r_   )r}   r   r   r~   r   )rf   ri   attention_outputlayer_outputr&   r&   r'   rj      s   

zViTMAELayer.forward)rU   rV   rW   rX   r.   boolrr   rk   ra   r0   rY   rj   rl   r&   r&   rg   r'   rs      s2    
	
!rs   c                       s   e Zd ZdZ												
			d1dedededededededededededededef fddZde	j
de	j
fdd Zd!e	j
de	j
fd"d#Zd2d%e	j
d&eeef d$B fd'd(Zd)d)d	d*d+e	j
d,ed-ed.edeee	j
 B f
d/d0Z  ZS )3
RAEDecodera
  
    Decoder implementation ported from RAE-main to keep checkpoint compatibility.

    Key attributes (must match checkpoint keys):
    - decoder_embed
    - decoder_pos_embed
    - decoder_layers
    - decoder_norm
    - decoder_pred
    - trainable_cls_token
                   rE   r   Trt   rH   r\   r>   decoder_hidden_sizedecoder_num_hidden_layersdecoder_num_attention_headsdecoder_intermediate_sizenum_patchesr+   num_channelsrB   ru   rv   rn   rw   r^   c              
      s   t    | _|| _|| _|	| _|| _tj|dd| _	t
|d }t|dddd}| jd|d d	d
 t fddt|D | _tjd| _tj|d | dd| _d	| _ttdd| _d S )NT)rM         ?r   pt)	cls_tokenextra_tokensoutput_typedecoder_pos_embedr   F
persistentc                    s$   g | ]}t  d qS ))r>   rC   r]   ru   rv   rn   rw   r^   )rs   ).0_rw   r   r   r   r^   rn   rv   ru   r&   r'   
<listcomp>  s    z'RAEDecoder.__init__.<locals>.<listcomp>r{   r   )r`   ra   r   r+   r   rB   r   rb   rc   decoder_embedr.   r   register_bufferr2   rr   
ModuleListrangedecoder_layersr   decoder_normdecoder_predgradient_checkpointing	Parameterr0   zerostrainable_cls_token)rf   r>   r   r   r   r   r   r+   r   rB   ru   rv   rn   rw   r^   	grid_size	pos_embedrg   r   r'   ra      s*   

zRAEDecoder.__init__
embeddingsr   c                 C   s   |j d d }| jj d d }| jd d dd d f }| jd d dd d d f }| jj d }|ddd|dddd}tj|d|| fddd}|dddddd|}tj|	d|fdd	S )
Nr   r   r,   r   r   bicubicF)scale_factormodealign_cornersdim)
r-   r   reshapepermuteFinterpolateviewr0   catr2   )rf   r   embeddings_positionsnum_positionsclass_pos_embedpatch_pos_embedr   r&   r&   r'   r)   &  s   
z#RAEDecoder.interpolate_pos_encodingxc                 C   s   |j \}}}|| jkr|S t|d  }}|||||dddd}t| jd t| jd f}tj||ddd}|dddd || j|}|S )	Nr   r   r   r   r   bilinearFsizer   r   )	r-   r   r.   r   r   r   r   
contiguousr   )rf   r   blcr7   r8   target_sizer&   r&   r'   interpolate_latent8  s   
 zRAEDecoder.interpolate_latentNpatchified_pixel_valuesoriginal_image_sizec                 C   s   | j | j}}|d ur|n| j| jf}|\}}|| }|| }|| |jd kr8td|jd  d| d| |jd }	||	|||||}td|}||	||| || }
|
S )Nr   z5The number of patches in the patchified pixel values z9, does not match the number of patches on original image *r   znhwpqc->nchpwq)r+   r   rB   r-   r/   r   r0   einsum)rf   r   r   r+   r   original_heightoriginal_widthnum_patches_hnum_patches_w
batch_sizepixel_valuesr&   r&   r'   
unpatchifyC  s6   
zRAEDecoder.unpatchifyF)r)   drop_cls_tokenreturn_dictri   r)   r   r   c                C   s   |  |}|r|d d dd d d f }| |}n| |}| j|jd dd}tj||gdd}|rA|s;td| |}n| j	}||j
|j|jd }| jD ]}	|	|}qR| |}| |}
|
d d dd d d f }
|su|
fS t|
dS )Nr   r   r,   r   z:interpolate_pos_encoding only supports drop_cls_token=Truer5   r6   )rT   )r   r   r   r3   r-   r0   r   r/   r)   r   r4   r5   r6   r   r   r   rS   )rf   ri   r)   r   r   r   x_r   r   layer_modulerT   r&   r&   r'   rj   b  s*   






zRAEDecoder.forward)r   r   r   r   r   rE   r   r   rE   Trt   rH   rH   r\   r_   )rU   rV   rW   rX   r.   r   rr   rk   ra   r0   rY   r)   r   tupler   rS   rj   rl   r&   r&   rg   r'   r      s|    	
5"#r   c                *       s  e Zd ZdZdZdgZdgZe							
														dCdede	de	de	de	de	de	de	de	de	de	dB de	de
dB d e
dB d!e
eB ejB dB d"e
eB ejB dB d#ed$ed%ed&ef( fd'd(ZdDd)ejd*ejdB d+ejfd,d-Zd)ejd+ejfd.d/Zd)ejd+ejfd0d1Zd2ejd+ejfd3d4Zd2ejd+ejfd5d6ZdDd)ejd*ejdB d+ejfd7d8Ze	dEd)ejd9ed*ejdB d+eeej B fd:d;Zd2ejd+ejfd<d=ZedFd2ejd9ed+eeej B fd>d?Z	dEd@ejd9ed*ejdB d+eeej B fdAdBZ  ZS )GAutoencoderRAEa
  
    Representation Autoencoder (RAE) model for encoding images to latents and decoding latents to images.

    This model uses a frozen pretrained encoder (DINOv2, SigLIP2, or MAE) with a trainable ViT decoder to reconstruct
    images from learned representations.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented for
    all models (such as downloading or saving).

    Args:
        encoder_type (`str`, *optional*, defaults to `"dinov2"`):
            Type of frozen encoder to use. One of `"dinov2"`, `"siglip2"`, or `"mae"`.
        encoder_hidden_size (`int`, *optional*, defaults to `768`):
            Hidden size of the encoder model.
        encoder_patch_size (`int`, *optional*, defaults to `14`):
            Patch size of the encoder model.
        encoder_num_hidden_layers (`int`, *optional*, defaults to `12`):
            Number of hidden layers in the encoder model.
        patch_size (`int`, *optional*, defaults to `16`):
            Decoder patch size (used for unpatchify and decoder head).
        encoder_input_size (`int`, *optional*, defaults to `224`):
            Input size expected by the encoder.
        image_size (`int`, *optional*):
            Decoder output image size. If `None`, it is derived from encoder token count and `patch_size` like
            RAE-main: `image_size = patch_size * sqrt(num_patches)`, where `num_patches = (encoder_input_size //
            encoder_patch_size) ** 2`.
        num_channels (`int`, *optional*, defaults to `3`):
            Number of input/output channels.
        encoder_norm_mean (`list`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
            Channel-wise mean for encoder input normalization (ImageNet defaults).
        encoder_norm_std (`list`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
            Channel-wise std for encoder input normalization (ImageNet defaults).
        latents_mean (`list` or `tuple`, *optional*):
            Optional mean for latent normalization. Tensor inputs are accepted and converted to config-serializable
            lists.
        latents_std (`list` or `tuple`, *optional*):
            Optional standard deviation for latent normalization. Tensor inputs are accepted and converted to
            config-serializable lists.
        noise_tau (`float`, *optional*, defaults to `0.0`):
            Noise level for training (adds noise to latents during training).
        reshape_to_2d (`bool`, *optional*, defaults to `True`):
            Whether to reshape latents to 2D (B, C, H, W) format.
        use_encoder_loss (`bool`, *optional*, defaults to `False`):
            Whether to use encoder hidden states in the loss (for advanced training).
    Frs   zdecoder.decoder_pos_embedrA   r         r   r   r   r   rG   Nr   rH   T      ?r=   encoder_hidden_sizeencoder_patch_sizeencoder_num_hidden_layersr   r   r   r   r+   encoder_input_sizerB   r   encoder_norm_meanencoder_norm_stdlatents_meanlatents_std	noise_taureshape_to_2duse_encoder_lossscaling_factorc                    s  t    |tvrtd| dtt  dtdtf fdd dtjt	B t
B d B dtjd B fdd}||}| j | |d	 |
| _t|| _t|| _t|| _t|}| j| d
krmtd| j d| dt|	}|d
krytdt||||d| _t| | _| j| d }tt|}|| |krtd| d|| }|d u r|}nt|}||krtd| d| d| d| d	|d u rg d}|d u rg d}tj|tjddddd}tj|tjddddd}| jd|dd | jd|dd ||}|d u rtd}| jd |dd |d u r"td}| jd!|dd tt|t|t|t|t|t|t|t|t|d"	| _t|| _t|| _ t|| _!d#| _"d S )$NrJ   z'. Available: valuer   c                    sX   t | tjr|    S t | tr fdd| D S t | tr* fdd| D S | S )Nc                       g | ]} |qS r&   r&   r   v_to_config_compatibler&   r'   r         zJAutoencoderRAE.__init__.<locals>._to_config_compatible.<locals>.<listcomp>c                    r   r&   r&   r   r   r&   r'   r     r   )
isinstancer0   rY   detachcputolistr   listr   r   r&   r'   r     s   

z6AutoencoderRAE.__init__.<locals>._to_config_compatiblec                 S   s4   | d u rd S t | tjr|   S tj| tjdS )Nr6   )r   r0   rY   r   clonetensorfloat32r   r&   r&   r'   _as_optional_tensor  s
   z4AutoencoderRAE.__init__.<locals>._as_optional_tensor)r   r   r   zencoder_input_size=z) must be divisible by encoder_patch_size=.zCpatch_size must be a positive integer (this is decoder_patch_size).)r=   r>   r+   r?   r   zComputed num_patches=z must be a perfect square.zimage_size=z1 must equal decoder_patch_size*sqrt(num_patches)=z for patch_size=z and computed num_patches=)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?r   r   r   encoder_meanTr   encoder_std_latents_mean_latents_std)	r>   r   r   r   r   r   r+   r   rB   F)#r`   ra   _ENCODER_FORWARD_FNSr/   sortedkeysr   r0   rY   r   r   r   r   rr   r   r   r   r   r.   rR   encoder_encoder_forward_fnr   r   r   r   r   r   onesr   decoderr   decoder_patch_sizedecoder_image_sizeuse_slicing)rf   r=   r   r   r   r   r   r   r   r+   r   rB   r   r   r   r   r   r   r   r   r   r   latents_std_tensorr  r   gridderived_image_sizeencoder_mean_tensorencoder_std_tensorlatents_mean_tensorrg   r   r'   ra     s   
&	











zAutoencoderRAE.__init__r   	generatorr   c                 C   sP   | j tj|dfd|jd   |j|j|d }||t|j||j|jd  S )Nr   )r   r   )r5   r6   r  )r  r5   r6   )	r   r0   randr   ndimr5   r6   r   r-   )rf   r   r  noise_sigmar&   r&   r'   _noisingD  s   "zAutoencoderRAE._noisingc                 C   sp   |j \}}}}|| jks|| jkrtj|| j| jfddd}| jj|j|jd}| jj|j|jd}|| | S )Nr   Fr   r   )	r-   r   r   r   r   r4   r5   r6   r   )rf   r   r   r7   r8   meanstdr&   r&   r'   _resize_and_normalizeK  s   z$AutoencoderRAE._resize_and_normalizec                 C   s4   | j j|j|jd}| jj|j|jd}|| | S )Nr   )r   r4   r5   r6   r   )rf   r   r  r  r&   r&   r'   _denormalize_imageU  s   z!AutoencoderRAE._denormalize_imagezc                 C   s8   | j j|j|jd}| jj|j|jd}|| |d  S Nr   gh㈵>r   r4   r5   r6   r   rf   r  r   r   r&   r&   r'   _normalize_latentsZ     z!AutoencoderRAE._normalize_latentsc                 C   s8   | j j|j|jd}| jj|j|jd}||d  | S r  r  r  r&   r&   r'   _denormalize_latents_  r  z#AutoencoderRAE._denormalize_latentsc           	      C   s   |  |}| jjdkr| | j|| jj}n| | j|}| jr,| jdkr,| j||d}| j	rX|j
\}}}tt|}|| |krItd| d|dd ||||}n|}| |}| jjdkrk|| jj }|S )	NrF   r   r  zToken length n=z/ is not a perfect square; cannot reshape to 2D.r   r   r   )r  rQ   r=   r  r  r   trainingr   r  r   r-   r.   r   r/   	transposer   r   r  r   )	rf   r   r  tokensr   nr   sider  r&   r&   r'   _encoded  s"   

zAutoencoderRAE._encoder   c                    s\   j r|jd dkrtj fdd|dD dd}nj| d}|s)|fS t|dS )Nr   r   c                    s   g | ]	}j | d qS )r  )r$  )r   x_slicer  rf   r&   r'   r     s    z)AutoencoderRAE.encode.<locals>.<listcomp>r   r  )latent)r  r-   r0   r   splitr$  r   )rf   r   r   r  latentsr&   r&  r'   encode  s   &
zAutoencoderRAE.encodec           	      C   s   | j jdkr|| j j }| |}| jr+|j\}}}}||||| dd }n|}| j|ddj	}| j
|}| |}|j|jdS )Nr   r   r   Tr   )r5   )rQ   r   r  r   r-   r   r   r   r  rT   r   r  r4   r5   )	rf   r  r   r   r7   r8   r!  rT   x_recr&   r&   r'   _decode  s   
 
zAutoencoderRAE._decodec                    sV    j r|jd dkrtj fdd|dD dd}n |}|s&|fS t|dS )Nr   r   c                    s   g | ]}  |qS r&   )r-  )r   z_slicerf   r&   r'   r     s    z)AutoencoderRAE.decode.<locals>.<listcomp>r   sample)r  r-   r0   r   r(  r-  r   )rf   r  r   decodedr&   r/  r'   decode  s   $

zAutoencoderRAE.decoder1  c                 C   s:   | j |d|dd }| j|ddd }|s|fS t|dS )NF)r   r  r   r+  r0  )r*  r3  r   )rf   r1  r   r  r)  r2  r&   r&   r'   rj     s
   
zAutoencoderRAE.forward)rA   r   r   r   r   r   r   r   r   rG   Nr   NNNNrH   TFr   r_   )TN)T) rU   rV   rW   rX    _supports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpectedr   rk   r.   r   r   r0   rY   rr   r   ra   	Generatorr  r  r  r  r  r$  r
   r   r*  r-  r   r3  rj   rl   r&   r&   rg   r'   r     s    /	
 "
"&r   )r<   )?dataclassesr   mathr   typingr   r0   torch.nnrb   torch.nn.functional
functionalr   configuration_utilsr   r   utilsr   r	   utils.accelerate_utilsr
   utils.import_utilsr   utils.torch_utilsr   transformersr   r   r   r   r   r   activationsr   r}   r   attention_processorr   r   r   modeling_utilsr   vaer   r   r   
get_loggerrU   loggerModulerY   r(   r*   r.   r;   rk   rR   r   rS   r[   rm   rs   r   r   r&   r&   r&   r'   <module>   sb    	

 
40 &