o
    piC                     @   s&  d dl mZmZ d dlZd dlm  mZ d dlmZ d dlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" G dd deeeZ#G dd dej$Z%G dd dej$Z&G dd dej$Z'G dd dej$Z(dS )    )DictUnionN)nn
checkpoint   )ConfigMixinregister_to_config)PeftAdapterMixin   )BasicTransformerBlockSkipFFTransformerBlock)ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORSAttentionProcessorAttnAddedKVProcessorAttnProcessor)TimestepEmbeddingget_timestep_embedding)
ModelMixin)GlobalResponseNormRMSNorm)Downsample2D
Upsample2Dc                .       s
  e Zd ZdZe													
										d7dededededededededededededededed ed!ed"ed#ed$ed%ed&ef, fd'd(Zd8d)ed*d+fd,d-Z	d9d.d/Z
ed*eeef fd0d1Zd2eeeeef f fd3d4Zd5d6 Z  ZS ):UVit2DModelT   F                 @       r               ư>@   hidden_sizeuse_biashidden_dropoutcond_embed_dimmicro_cond_encode_dimmicro_cond_embed_dimencoder_hidden_size
vocab_sizecodebook_sizein_channelsblock_out_channelsnum_res_blocks
downsampleupsampleblock_num_headsnum_hidden_layersnum_attention_headsattention_dropoutintermediate_sizelayer_norm_epsln_elementwise_affinesample_sizec                    s  t    tj|d| _t| _t|
||| _t	|| d| _
t|| |||d| _t|| _tj|d| _t fddt|D | _t| _tj|d| _t|| ||d|d| _t||
|	| _d| _d S )Nbias)sample_proj_biasFc                    s4   g | ]}t   d dqS )ada_norm_continuous)dimr8   attention_head_dimdropoutcross_attention_dimattention_bias	norm_type-ada_norm_continous_conditioning_embedding_dimnorm_elementwise_affinenorm_epsada_norm_biasff_inner_dimff_biasattention_out_bias)r   .0_r*   r(   r:   r;   r<   r8   r)    \/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/unets/uvit_2d.py
<listcomp>k   s&    z(UVit2DModel.__init__.<locals>.<listcomp>)r4   r5   )super__init__r   Linearencoder_projr   encoder_proj_layer_normUVit2DConvEmbedembedr   
cond_embed	UVitBlock
down_blockproject_to_hidden_normproject_to_hidden
ModuleListrangetransformer_layersproject_from_hidden_normproject_from_hiddenup_blockConvMlmLayer	mlm_layergradient_checkpointing)selfr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   	__class__rR   rT   rW   *   s`   
"

zUVit2DModel.__init__valuereturnNc                 C   s   d S NrS   )rk   modulern   rS   rS   rT   _set_gradient_checkpointing   s   z'UVit2DModel._set_gradient_checkpointingc                    s\  |  |}| |}t| | jjddd}||jd df}tj	||gdd}|j
| jd}| |
|j}| |}| j||||d}|j\}}	}
}|dd	d
d||
| |	}| |}| |}| jD ] | jry| jry fdd}n }||||d|id}qj| |}| |}|||
||	dd
dd	}| j||||d}| |}|S )NTr   )flip_sin_to_cosdownscale_freq_shift   rB   )dtype)pooled_text_embencoder_hidden_statescross_attention_kwargsr   r   c                     s   t  g| R  S rp   r   )argslayerrS   rT   layer_   s   z#UVit2DModel.forward.<locals>.layer_ry   )rz   r{   added_cond_kwargs)rY   rZ   r   flattenconfigr,   reshapeshapetorchcattorx   r]   r\   r_   permuter`   ra   rd   trainingrj   re   rf   rg   ri   )rk   	input_idsrz   ry   micro_condsr{   micro_cond_embedshidden_states
batch_sizechannelsheightwidthr   logitsrS   r}   rT   forward   sR   








zUVit2DModel.forwardc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namerq   
processorsc                    sH   t |dr| ||  d< | D ]\}} |  d| || q|S )Nget_processor
.processor.)hasattrr   named_children)r   rq   r   sub_namechildfn_recursive_add_processorsrS   rT   r      s
   
z@UVit2DModel.attn_processors.<locals>.fn_recursive_add_processors)strr   r   Moduler   r   r   )rk   r   r   rq   rS   r   rT   attn_processors   s
   	&	zUVit2DModel.attn_processors	processorc                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   rq   c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S )Nset_processorr   r   )r   
isinstancedictr   popr   )r   rq   r   r   r   fn_recursive_attn_processorrS   rT   r     s   

zCUVit2DModel.set_attn_processor.<locals>.fn_recursive_attn_processorN)lenr   keysr   r   
ValueErrorr   r   r   r   r   )rk   r   countr   rq   rS   r   rT   set_attn_processor   s   
zUVit2DModel.set_attn_processorc                 C   sj   t dd | j D rt }nt dd | j D r t }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s       | ]}|j tv V  qd S rp   )rm   r   rP   procrS   rS   rT   	<genexpr>      z9UVit2DModel.set_default_attn_processor.<locals>.<genexpr>c                 s   r   rp   )rm   r   r   rS   rS   rT   r     r   zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr   valuesr   r   r   nextiterr   )rk   r   rS   rS   rT   set_default_attn_processor  s   z&UVit2DModel.set_default_attn_processor)r   Fr   r   r   r   r   r    r!   r   r   r   FFr"   r#   r$   r   r%   r&   Tr'   )Frp   )__name__
__module____qualname__ _supports_gradient_checkpointingr	   intboolfloatrW   rr   r   propertyr   r   r   r   r   r   r   __classcell__rS   rS   rl   rT   r   '   s    	
l
=#r   c                       s$   e Zd Z fddZdd Z  ZS )r[   c                    s>   t    t||| _t|||| _tj||d|d| _d S Nrv   )kernel_sizer?   )	rV   rW   r   	Embedding
embeddingsr   
layer_normConv2dconv)rk   r1   r2   r/   elementwise_affineepsr?   rl   rS   rT   rW   %  s   
zUVit2DConvEmbed.__init__c                 C   s2   |  |}| |}|dddd}| |}|S )Nr   r   rv   r   )r   r   r   r   )rk   r   r   rS   rS   rT   r   +  s
   


zUVit2DConvEmbed.forwardr   r   r   rW   r   r   rS   rS   rl   rT   r[   $  s    r[   c                       s2   e Zd Zdededef fddZdd Z  ZS )r^   r3   r4   r5   c                    s   t    |
rtdddddd	| _nd | _tfddt|D | _t fd	dt|D | _|rSt	dddd
ddd
| _
d S d | _
d S )NTr   Conv2d_0r   rms_norm)use_convpaddingr   r   rG   r   r   r?   c              	      s   g | ]}t  qS rS   )ConvNextBlock)rP   i)r   r*   r(   r;   r<   r)   rS   rT   rU   T  s    	z&UVitBlock.__init__.<locals>.<listcomp>c                    s*   g | ]}t   d 	qS ))rF   rN   )r   rO   )r9   r6   r   r(   r)   rS   rT   rU   b  s    r   F)	use_conv_transposer   r   r   rG   r   r   r?   interpolate)rV   rW   r   r4   r   rb   rc   
res_blocksattention_blocksr   r5   )rk   r   r3   r(   r*   r<   r;   r)   r6   r9   r4   r5   rl   )r9   r6   r   r*   r(   r;   r<   r)   rT   rW   4  sL   

	
zUVitBlock.__init__c                 C   s   | j d ur
|  |}t| j| jD ]2\}}|||}|j\}}}	}
||||	|
 ddd}||||d}|ddd|||	|
}q| jd urN| |}|S )Nr   r   rv   )rz   r{   )r4   zipr   r   r   viewr   r5   )rk   xry   rz   r{   	res_blockattention_blockr   r   r   r   rS   rS   rT   r     s   




zUVitBlock.forward)r   r   r   r   r   rW   r   r   rS   rS   rl   rT   r^   3  s    Nr^   c                       s(   e Zd Z	d fdd	Zdd Z  ZS )r      c                    s   t    tj||dd||d| _t|||| _tj|t|| |d| _	t
 | _tt|| | _tjt|| ||d| _t|| _t||d || _d S )Nr   rv   )r   r   groupsr?   r>   r   )rV   rW   r   r   	depthwiser   normrX   r   channelwise_linear_1GELUchannelwise_actr   channelwise_normchannelwise_linear_2Dropoutchannelwise_dropoutcond_embeds_mapper)rk   r   r;   r<   r)   r*   r(   res_ffn_factorrl   rS   rT   rW     s    

zConvNextBlock.__init__c                 C   s   |}|  |}|dddd}| |}| |}| |}| |}| |}| |}|dddd}|| }| t	
|jddd\}}|d|d d d d d d f   |d d d d d d f  }|S )Nr   r   r   rv   rw   )r   r   r   r   r   r   r   r   r   Fsiluchunk)rk   r   cond_embedsx_resscaleshiftrS   rS   rT   r     s   






8zConvNextBlock.forward)r   r   rS   rS   rl   rT   r     s    r   c                       s>   e Zd Zdedededededef fddZd	d
 Z  ZS )rh   r2   r1   r)   r<   r;   r0   c                    sD   t    tj||d|d| _t|||| _tj||d|d| _d S r   )rV   rW   r   r   conv1r   r   conv2)rk   r2   r1   r)   r<   r;   r0   rl   rS   rT   rW     s   
	zConvMlmLayer.__init__c                 C   s:   |  |}| |dddddddd}| |}|S )Nr   r   r   rv   )r   r   r   r   )rk   r   r   rS   rS   rT   r     s   
"
zConvMlmLayer.forward)	r   r   r   r   r   r   rW   r   r   rS   rS   rl   rT   rh     s    rh   ))typingr   r   r   torch.nn.functionalr   
functionalr   torch.utils.checkpointr   configuration_utilsr   r	   loadersr
   	attentionr   r   attention_processorr   r   r   r   r   r   r   r   modeling_utilsr   normalizationr   r   resnetr   r   r   r   r[   r^   r   rh   rS   rS   rS   rT   <module>   s&    ~c-