o
    Gi8                     @   s$  d dl Z d dlm  mZ d dl mZ d dlmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z m!Z! G dd deeeeZ"G dd dej#Z$G dd dej#Z%G dd dej#Z&G dd dej#Z'dS )    N)nn
checkpoint   )ConfigMixinregister_to_config)PeftAdapterMixin)apply_lora_scale   )AttentionMixinBasicTransformerBlockSkipFFTransformerBlock)ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORSAttnAddedKVProcessorAttnProcessor)TimestepEmbeddingget_timestep_embedding)
ModelMixin)GlobalResponseNormRMSNorm)Downsample2D
Upsample2Dc                .       s   e Zd ZdZe													
										d/dededededededededededededededed ed!ed"ed#ed$ed%ed&ef, fd'd(Ze	d)d0d+d,Z
d-d. Z  ZS )1UVit2DModelT   F                 @       r               ư>@   hidden_sizeuse_biashidden_dropoutcond_embed_dimmicro_cond_encode_dimmicro_cond_embed_dimencoder_hidden_size
vocab_sizecodebook_sizein_channelsblock_out_channelsnum_res_blocks
downsampleupsampleblock_num_headsnum_hidden_layersnum_attention_headsattention_dropoutintermediate_sizelayer_norm_epsln_elementwise_affinesample_sizec                    s  t    tj|d| _t| _t|
||| _t	|| d| _
t|| |||d| _t|| _tj|d| _t fddt|D | _t| _tj|d| _t|| ||d|d| _t||
|	| _d| _d S )Nbias)sample_proj_biasFc                    s4   g | ]}t   d dqS )ada_norm_continuous)dimr7   attention_head_dimdropoutcross_attention_dimattention_bias	norm_type-ada_norm_continous_conditioning_embedding_dimnorm_elementwise_affinenorm_epsada_norm_biasff_inner_dimff_biasattention_out_bias)r   .0_r)   r'   r9   r:   r;   r7   r(    R/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/unets/uvit_2d.py
<listcomp>j   s&    z(UVit2DModel.__init__.<locals>.<listcomp>)r3   r4   )super__init__r   Linearencoder_projr   encoder_proj_layer_normUVit2DConvEmbedembedr   
cond_embed	UVitBlock
down_blockproject_to_hidden_normproject_to_hidden
ModuleListrangetransformer_layersproject_from_hidden_normproject_from_hiddenup_blockConvMlmLayer	mlm_layergradient_checkpointing)selfr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   	__class__rQ   rS   rV   )   s`   
"

zUVit2DModel.__init__cross_attention_kwargsNc                    s^  |  |}| |}t| | jjddd}||jd df}tj	||gdd}|j
| jd}| |
|j}| |}| j||||d}|j\}}	}
}|dd	d
d||
| |	}| |}| |}| jD ] t rz| jrz fdd}n }||||d|id}qj| |}| |}|||
||	dd
dd	}| j||||d}| |}|S )NTr   )flip_sin_to_cosdownscale_freq_shift   rA   )dtype)pooled_text_embencoder_hidden_statesrm   r
   r   c                     s   t  g| R  S Nr   )argslayerrR   rS   layer_   s   z#UVit2DModel.forward.<locals>.layer_rt   )ru   rm   added_cond_kwargs)rX   rY   r   flattenconfigr+   reshapeshapetorchcattors   r\   r[   r^   permuter_   r`   rc   is_grad_enabledri   rd   re   rf   rh   )rj   	input_idsru   rt   micro_condsrm   micro_cond_embedshidden_states
batch_sizechannelsheightwidthrz   logitsrR   rx   rS   forward   sR   








zUVit2DModel.forwardc                 C   sj   t dd | j D rt }nt dd | j D r t }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s       | ]}|j tv V  qd S rv   )rl   r   rO   procrR   rR   rS   	<genexpr>       z9UVit2DModel.set_default_attn_processor.<locals>.<genexpr>c                 s   r   rv   )rl   r   r   rR   rR   rS   r      r   zOCannot call `set_default_attn_processor` when attention processors are of type N)	allattn_processorsvaluesr   r   
ValueErrornextiterset_attn_processor)rj   	processorrR   rR   rS   set_default_attn_processor   s   z&UVit2DModel.set_default_attn_processor)r   Fr   r   r   r   r   r   r    r   r   r   FFr!   r"   r#   r   r$   r%   Tr&   rv   )__name__
__module____qualname__ _supports_gradient_checkpointingr   intboolfloatrV   r	   r   r   __classcell__rR   rR   rk   rS   r   &   s    	
l>r   c                       s$   e Zd Z fddZdd Z  ZS )rZ   c                    s>   t    t||| _t|||| _tj||d|d| _d S Nrq   )kernel_sizer>   )	rU   rV   r   	Embedding
embeddingsr   
layer_normConv2dconv)rj   r0   r1   r.   elementwise_affineepsr>   rk   rR   rS   rV      s   
zUVit2DConvEmbed.__init__c                 C   s2   |  |}| |}|dddd}| |}|S )Nr   r   rq   r
   )r   r   r   r   )rj   r   r   rR   rR   rS   r      s
   


zUVit2DConvEmbed.forwardr   r   r   rV   r   r   rR   rR   rk   rS   rZ      s    rZ   c                       s2   e Zd Zdededef fddZdd Z  ZS )r]   r2   r3   r4   c                    s   t    |
rtdddddd	| _nd | _tfddt|D | _t fd	dt|D | _|rSt	dddd
ddd
| _
d S d | _
d S )NTr   Conv2d_0r
   rms_norm)use_convpaddingnamer   rF   r   r   r>   c              	      s   g | ]}t  qS rR   )ConvNextBlock)rO   i)r   r)   r'   r:   r;   r(   rR   rS   rT     s    	z&UVitBlock.__init__.<locals>.<listcomp>c                    s*   g | ]}t   d 	qS ))rE   rM   )r   rN   )r8   r5   r   r'   r(   rR   rS   rT   #  s    r   F)	use_conv_transposer   r   r   rF   r   r   r>   interpolate)rU   rV   r   r3   r   ra   rb   
res_blocksattention_blocksr   r4   )rj   r   r2   r'   r)   r;   r:   r(   r5   r8   r3   r4   rk   )r8   r5   r   r)   r'   r:   r;   r(   rS   rV      sL   

	
zUVitBlock.__init__c                 C   s   | j d ur
|  |}t| j| jD ]2\}}|||}|j\}}}	}
||||	|
 ddd}||||d}|ddd|||	|
}q| jd urN| |}|S )Nr   r
   rq   )ru   rm   )r3   zipr   r   r   viewr   r4   )rj   xrt   ru   rm   	res_blockattention_blockr   r   r   r   rR   rR   rS   r   C  s   




zUVitBlock.forward)r   r   r   r   r   rV   r   r   rR   rR   rk   rS   r]      s    Nr]   c                       s(   e Zd Z	d fdd	Zdd Z  ZS )r      c                    s   t    tj||dd||d| _t|||| _tj|t|| |d| _	t
 | _tt|| | _tjt|| ||d| _t|| _t||d || _d S )Nr   rq   )r   r   groupsr>   r=   r
   )rU   rV   r   r   	depthwiser   normrW   r   channelwise_linear_1GELUchannelwise_actr   channelwise_normchannelwise_linear_2Dropoutchannelwise_dropoutcond_embeds_mapper)rj   r   r:   r;   r(   r)   r'   res_ffn_factorrk   rR   rS   rV   X  s    

zConvNextBlock.__init__c                 C   s   |}|  |}|dddd}| |}| |}| |}| |}| |}| |}|dddd}|| }| t	
|jddd\}}|d|d d d d d d f   |d d d d d d f  }|S )Nr   r
   r   rq   rr   )r   r   r   r   r   r   r   r   r   Fsiluchunk)rj   r   cond_embedsx_resscaleshiftrR   rR   rS   r   l  s   






8zConvNextBlock.forward)r   r   rR   rR   rk   rS   r   W  s    r   c                       s>   e Zd Zdedededededef fddZd	d
 Z  ZS )rg   r1   r0   r(   r;   r:   r/   c                    sD   t    tj||d|d| _t|||| _tj||d|d| _d S r   )rU   rV   r   r   conv1r   r   conv2)rj   r1   r0   r(   r;   r:   r/   rk   rR   rS   rV     s   
	zConvMlmLayer.__init__c                 C   s:   |  |}| |dddddddd}| |}|S )Nr   r
   r   rq   )r   r   r   r   )rj   r   r   rR   rR   rS   r     s   
"
zConvMlmLayer.forward)	r   r   r   r   r   r   rV   r   r   rR   rR   rk   rS   rg     s    rg   )(r   torch.nn.functionalr   
functionalr   torch.utils.checkpointr   configuration_utilsr   r   loadersr   utilsr	   	attentionr   r   r   attention_processorr   r   r   r   r   r   r   modeling_utilsr   normalizationr   r   resnetr   r   r   ModulerZ   r]   r   rg   rR   rR   rR   rS   <module>   s&    @c-