o
    ߥi"                     @   s  d dl Z d dlmZ d dlZd dlZd dlmZ d dl	m  m
Z d dlmZmZ G dd dejZd/ddZd	d
 Zdd Zdd Zdd ZG dd dejZG dd dejZG dd dejeZG dd dejZG dd dejZG dd deZG dd dejZdd  ZG d!d" d"ejZG d#d$ d$ejZ G d%d& d&ejZ!G d'd( d(e!Z"G d)d* d*ejZ#G d+d, d,eZ$G d-d. d.eZ%dS )0    N)abstractmethod)PretrainedConfigPreTrainedModelc                       s   e Zd Z fddZ  ZS )	GroupNormc                    s   t t| | |jS N)superr   forwardfloattypedtypeselfx	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/guided_diffusion/unet.pyr      s   zGroupNorm.forward)__name__
__module____qualname__r   __classcell__r   r   r   r   r      s    r   '  c                 C   s   |d }t t| t jd|t jd | j| jd}| dddf  |d  }t j	t 
|t |gdd}|d rRt j	|t |ddddf gdd}|S )	aY  
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
       r   )startendr   )deviceNdim   )thexpmathlogarangefloat32tor   r	   catcossin
zeros_like)	timestepsr   
max_periodhalffreqsargs	embeddingr   r   r   timestep_embedding   s   
r1   c                 C   L   t | tjtjtjfr"| jj | j_| jdur$| jj | j_dS dS dS )z/
    Convert primitive modules to float16.
    N)	
isinstancennConv1dConv2dConv3dweightdatar-   biasllr   r   r   convert_module_to_f16+      
r=   c                 C   r2   )zP
    Convert primitive modules to float32, undoing convert_module_to_f16().
    N)	r3   r4   r5   r6   r7   r8   r9   r	   r:   r;   r   r   r   convert_module_to_f325   r>   r?   c                 O   sV   | dkrt j|i |S | dkrt j|i |S | dkr$t j|i |S td|  )z4
    Create a 1D, 2D, or 3D convolution module.
    r   r      zunsupported dimensions: )r4   r5   r6   r7   
ValueError)dimsr/   kwargsr   r   r   conv_nd?   s   rD   c                 C   s4   |rt |t | }tj| t|g|R  S | | S )a  
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    )tupleCheckpointFunctionapplylen)funcinputsparamsflagr/   r   r   r   
checkpointL   s   
rM   c                	       s>   e Zd ZdZ	ddedededef fddZd	d
 Z  ZS )AttentionPool2dzS
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    Nspacial_dim	embed_dimnum_heads_channels
output_dimc                    sp   t    tt||d d |d  | _td|d| d| _td||p'|d| _	|| | _
t| j
| _d S )Nr   r         ?r@   )r   __init__r4   	Parameterr    randnpositional_embeddingrD   qkv_projc_proj	num_headsQKVAttention	attention)r   rO   rP   rQ   rR   r   r   r   rT   b   s   

zAttentionPool2d.__init__c                 C   s   |j ^}}}|||d}tj|jddd|gdd}|| jd d d d d f |j }| |}| 	|}| 
|}|d d d d df S )Nr   T)r   keepdimr   r   )shapereshaper    r'   meanrW   r&   r   rX   r\   rY   )r   r   bc_spatialr   r   r   r   q   s   $


zAttentionPool2d.forwardr   )r   r   r   __doc__intrT   r   r   r   r   r   r   rN   ]   s    	rN   c                   @   s   e Zd ZdZedd ZdS )TimestepBlockzT
    Any module where forward() takes timestep embeddings as a second argument.
    c                 C   s   dS )zJ
        Apply the module to `x` given `emb` timestep embeddings.
        Nr   r   r   embr   r   r   r      s    zTimestepBlock.forwardN)r   r   r   rd   r   r   r   r   r   r   rf   |   s    rf   c                   @   s   e Zd ZdZdd ZdS )TimestepEmbedSequentialzt
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    c                 C   s,   | D ]}t |tr|||}q||}q|S r   )r3   rf   )r   r   rh   layerr   r   r   r      s
   

zTimestepEmbedSequential.forwardN)r   r   r   rd   r   r   r   r   r   ri      s    ri   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )	UpsampleaB  
    An upsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    r   Nc                    sJ   t    || _|p|| _|| _|| _|r#t|| j| jddd| _d S d S )Nr@   r   padding)r   rT   channelsout_channelsuse_convrB   rD   conv)r   ro   rq   rB   rp   r   r   r   rT      s   

zUpsample.__init__c                 C   st   |j d | jks
J | jdkr(tj||j d |j d d |j d d fdd}ntj|ddd}| jr8| |}|S )Nr   r@   r      nearestmode)scale_factorrv   )r^   ro   rB   Finterpolaterq   rr   r   r   r   r   r      s   
$
zUpsample.forwardr   Nr   r   r   rd   rT   r   r   r   r   r   r   rl      s    	
rl   c                       rk   )	
DownsampleaE  
    A downsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    r   Nc                    s|   t    || _|p|| _|| _|| _|dkrdnd}|r,t|| j| jd|dd| _d S | j| jks4J tj	||d| _d S )Nr@   r   )r   r   r   r   )stridern   )kernel_sizer}   )
r   rT   ro   rp   rq   rB   rD   opr4   	AvgPool2d)r   ro   rq   rB   rp   r}   r   r   r   rT      s"   

zDownsample.__init__c                 C   s   |j d | jks
J | |S )Nr   )r^   ro   r   r   r   r   r   r      s   
zDownsample.forwardrz   r{   r   r   r   r   r|      s    	r|   c                       s@   e Zd ZdZ							d fdd	Zdd Zd	d
 Z  ZS )ResBlocka  
    A residual block that can optionally change the number of channels.

    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    NFr   c                    s  t    || _|| _|| _|p|| _|| _|| _|| _t	
td|t	 t||| jddd| _|	p4|
| _|	rGt|d|| _t|d|| _n|
rXt|d|| _t|d|| _nt	  | _| _t	
t	 t	||rod| j n| j| _t	
td| jt	 t	j|dt|| j| jddd| _t	j| jd j | j|krt	 | _d S |rt||| jddd| _d S t||| jd| _d S )	N    r@   r   rm   Fr   )pr   )r   rT   ro   emb_channelsdropoutrp   rq   use_checkpointuse_scale_shift_normr4   
Sequentialr   SiLUrD   	in_layersupdownrl   h_updx_updr|   IdentityLinear
emb_layersDropout
out_layersinitzeros_r8   skip_connection)r   ro   r   r   rp   rq   r   rB   r   updownr   r   r   rT      s\   







zResBlock.__init__c                 C   s   t | j||f|  | jS )a	  
        Apply the block to a Tensor, conditioned on a timestep embedding.

        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        rM   _forward
parametersr   rg   r   r   r   r   ,  s   zResBlock.forwardc                 C   s  | j r#| jd d | jd }}||}| |}| |}||}n| |}| ||j}t|jt|jk rI|d }t|jt|jk s;| j	rr| j
d | j
dd  }}tj|ddd\}	}
||d|	  |
 }||}n	|| }| 
|}| || S )Nr   ).Nr   r   r   r   )r   r   r   r   r   r
   r   rH   r^   r   r   r    chunkr   )r   r   rh   in_restin_convhemb_outout_normout_restscaleshiftr   r   r   r   7  s&   





zResBlock._forward)NFFr   FFFr   r   r   rd   rT   r   r   r   r   r   r   r   r      s    Ar   c                       s:   e Zd ZdZ				d fdd	Zdd Zd	d
 Z  ZS )AttentionBlocka  
    An attention block that allows spatial positions to attend to each other.

    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    r   r   Fc                    s   t    || _|dkr|| _n|| dks J d| d| || | _|| _td|| _td||d d| _|rAt	| j| _
nt| j| _
td||d| _tj| jj d S )Nr   r   zq,k,v channels z' is not divisible by num_head_channels r   r   r@   )r   rT   ro   rZ   r   r   normrD   qkvr[   r\   QKVAttentionLegacyproj_outr4   r   r   r8   )r   ro   rZ   num_head_channelsr   use_new_attention_orderr   r   r   rT   V  s    

zAttentionBlock.__init__c                 C   s   t | j|f|  | jS r   r   r   r   r   r   r   u  s   zAttentionBlock.forwardc                 C   sV   |j ^}}}|||d}| | |}| |}| |}|| j||g|R  S )Nr   )r^   r_   r   r   r\   r   )r   r   ra   rb   spatialr   r   r   r   r   r   y  s   

zAttentionBlock._forward)r   r   FFr   r   r   r   r   r   N  s    
r   c                 C   sL   |d j ^}}}tt|}d| |d  | }|  jt|g7  _dS )a(  
    A counter for the `thop` package to count the operations in an
    attention operation.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    r   r   N)r^   re   npprod	total_opsr    DoubleTensor)model_xyra   rb   r   num_spatial
matmul_opsr   r   r   count_flops_attn  s   r   c                       4   e Zd ZdZ fddZdd Zedd Z  ZS )r   zh
    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
    c                       t    || _d S r   r   rT   n_headsr   r   r   r   r   rT        

zQKVAttentionLegacy.__init__c                 C   s   |j \}}}|d| j  dksJ |d| j  }||| j |d |j|dd\}}}dtt| }	td||	 ||	 }
tj|
	 dd
|
j}
td|
|}||d|S )z
        Apply QKV attention.

        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r@   r   r   r   bct,bcs->btsr   bts,bcs->bct)r^   r   r_   splitr"   sqrtr    einsumsoftmaxr	   r
   r   r   r   bswidthlengthchqkvr   r8   ar   r   r   r     s   zQKVAttentionLegacy.forwardc                 C      t | ||S r   r   r   r   r   r   r   r   count_flops     zQKVAttentionLegacy.count_flops	r   r   r   rd   rT   r   staticmethodr   r   r   r   r   r   r     s    r   c                       r   )r[   zP
    A module which performs QKV attention and splits in a different order.
    c                    r   r   r   r   r   r   r   rT     r   zQKVAttention.__init__c              	   C   s   |j \}}}|d| j  dksJ |d| j  }|jddd\}}}dtt| }	td||	 || j ||||	 || j ||}
tj|
	 dd
|
j}
td|
||| j ||}||d|S )z
        Apply QKV attention.

        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r@   r   r   r   r   r   r   )r^   r   r   r"   r   r    r   viewr   r	   r
   r   r_   r   r   r   r   r     s   zQKVAttention.forwardc                 C   r   r   r   r   r   r   r   r     r   zQKVAttention.count_flopsr   r   r   r   r   r[     s    r[   c                       sV   e Zd ZdZ															d fd
d	Zdd Zdd ZdddZ  ZS )	UNetModela  
    The full UNet model with attention and timestep embedding.

    :param in_channels: channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_head_channels: if specified, ignore num_heads and instead use
                               a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
                                    increased efficiency.
    r   r   r   rs      Tr   NFr   r   c                     s  t    |dkr|}|| _|| _|| _|| _|| _|| _|| _|| _	|	| _
|| _|| _|r1tjntj| _|| _|| _|| _|d }tt||t t||| _| jd ur`t||| _t|d |  }}ttt|
||dddg| _|| _ |g}d}t!|D ]~\}}t"|D ]<}t#|||t|| |
||dg}t|| }||v r|$t%|||||d | j$t|  |  j |7  _ |$| q|t&|d kr|}| j$t|rt#|||||
||d	d
nt'||	|
|d |}|$| |d9 }|  j |7  _ qtt#||||
||dt%|||||dt#||||
||d| _(|  j |7  _ tg | _)t*t!|d d d D ]s\}}t"|d D ]g}|+ }t#|| ||t|| |
||dg}t|| }||v rt|$t%|||||d |r||kr|}|$|rt#|||||
||d	dnt,||	|
|d |d }| j)$t|  |  j |7  _ qEq;tt-d|t t|
||ddd| _.tj/0| j.d j1 d S )Nr   rs   r   r@   r   rm   rp   rB   r   r   r   rZ   r   r   Trp   rB   r   r   r   rB   rp   r   rB   r   r   )rp   rB   r   r   r   r   )2r   rT   
image_sizein_channelsmodel_channelsrp   num_res_blocksattention_resolutionsr   channel_multconv_resamplenum_classesr   r    float16r%   r   rZ   r   num_heads_upsampler4   r   r   r   
time_embed	Embedding	label_embre   
ModuleListri   rD   input_blocks_feature_size	enumerateranger   appendr   rH   r|   middle_blockoutput_blockslistpoprl   r   outr   r   r8   ) r   r   r   r   rp   r   r   r   r   r   rB   r   r   use_fp16rZ   r   r   r   resblock_updownr   time_embed_dimr   input_chinput_block_chansdslevelmult_layersout_chiichr   r   r   rT     sJ  





	



	)zUNetModel.__init__c                 C   (   | j t | jt | jt dS z<
        Convert the torso of the model to float16.
        N)r   rG   r=   r   r   r   r   r   r   convert_to_fp16     zUNetModel.convert_to_fp16c                 C   r  z<
        Convert the torso of the model to float32.
        N)r   rG   r?   r   r   r  r   r   r   convert_to_fp32  r  zUNetModel.convert_to_fp32c                 C   s   |du| j duksJ dg }| t|| j}| j dur/|j|jd fks(J || | }|| j}| jD ]}|||}|	| q8| 
||}| jD ]}tj|| gdd}|||}qN||j}| |S )a  
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        Nz<must specify y if and only if the model is class-conditionalr   r   r   )r   r   r1   r   r^   r   r
   r   r   r   r   r   r    r'   r   r   )r   r   r+   r   hsrh   r   moduler   r   r   r     s,   	





zUNetModel.forward)r   r   Tr   NFFr   r   r   FFFr   	r   r   r   rd   rT   r  r  r   r   r   r   r   r   r     s(    & ;r   c                       s.   e Zd ZdZ fddZd fdd	Z  ZS )SuperResModelz
    A UNetModel that performs super-resolution.

    Expects an extra kwarg `low_res` to condition on a low-resolution image.
    c                    s$   t  j||d g|R i | d S )Nr   )r   rT   )r   r   r   r/   rC   r   r   r   rT     s   $zSuperResModel.__init__Nc           	         sJ   |j \}}}}tj|||fdd}tj||gdd}t j||fi |S )Nbilinearru   r   r   )r^   rx   ry   r    r'   r   r   )	r   r   r+   low_resrC   r   
new_height	new_width	upsampledr   r   r   r     s   
zSuperResModel.forwardr   r{   r   r   r   r   r    s    r  c                       sT   e Zd ZdZ														d fd
d	Zdd Zdd Zdd Z  ZS )EncoderUNetModelz^
    The half UNet model with attention and timestep embedding.

    For usage, see UNet.
    r   r   Tr   Fr   r   adaptivec                    sL  t    |dkr|}|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|r+tjntj| _|| _|| _|| _|d }tt||t t||| _t|d | }ttt|
||dddg| _|| _|g}d}t|D ]}\}}t|D ]<}t|||t|| |
||dg}t|| }||v r| t!|||||d | j t|  |  j|7  _| | qz|t"|d kr|}| j t|rt|||||
||d	d
nt#||	|
|d |}| | |d9 }|  j|7  _qrtt||||
||dt!|||||dt||||
||d| _$|  j|7  _|| _%|dkrBtt&d|t t'dt|
||dt( | _)tj*+| j)d j, d S |dkrd|dksNJ tt&d|t t-|| |||| _)d S |dkrtt| jdt. td| j| _)d S |dkrtt| jdt&ddt td| j| _)d S t/d| d)Nr   rs   r   r@   r   rm   r   r   Tr   r   r   r   r  r   )r   r   r\   r   i   
spatial_v2zUnexpected z pooling)0r   rT   r   r   rp   r   r   r   r   r   r   r    r   r%   r   rZ   r   r   r4   r   r   r   r   re   r   ri   rD   r   r   r   r   r   r   r   rH   r|   r   poolr   AdaptiveAvgPool2dFlattenr   r   r   r8   rN   ReLUNotImplementedError)r   r   r   r   rp   r   r   r   r   r   rB   r   r   rZ   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   r   rT     s  




	







zEncoderUNetModel.__init__c                 C      | j t | jt dS r  )r   rG   r=   r   r  r   r   r   r       z EncoderUNetModel.convert_to_fp16c                 C   r  r  )r   rG   r?   r   r  r   r   r   r    r  z EncoderUNetModel.convert_to_fp32c                 C   s   |  t|| j}g }|| j}| jD ]}|||}| jdr.|||jj	dd q| 
||}| jdrT|||jj	dd tj|dd}| |S ||j}| |S )z
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :return: an [N x K] Tensor of outputs.
        r   )r   r@   r   r   )axis)r   r1   r   r
   r   r   r  
startswithr   r`   r   r    r'   r   )r   r   r+   rh   resultsr   r	  r   r   r   r     s"   




zEncoderUNetModel.forward)r   r   Tr   FFr   r   r   FFFr  r
  r   r   r   r   r    s(     $r  c                       sD   e Zd Zdddddg dddd	d
ddddddd
f fdd	Z  ZS )
UNetConfigi   r@         r   )   r   @   g        )rS   r   r   r   r   rs   rs   NFTrs   r"  r   c                    s|   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|| _|| _t jdi | d S )Nr   )r   r   r   rp   r   r   r   r   r   r   r   rZ   r   r   r   r   r   r   rT   )r   r   r   r   rp   r   r   r   r   r   r   r   rZ   r   r   r   r   r   rC   r   r   r   rT     s$   zUNetConfig.__init__)r   r   r   rT   r   r   r   r   r   r    s&    r  c                       s2   e Zd ZeZ fddZdddZdd Z  ZS )	HFUNetModelc                    s   t  | tdi d|jd|jd|jd|jd|jd|jd|j	d|j
d	|jd
|jd|jd|jd|jd|jd|jd|jd|j| _d S )Nr   r   r   rp   r   r   r   r   r   r   r   rZ   r   r   r   r   r   r   )r   rT   r   r   r   r   rp   r   r   r   r   r   r   r   rZ   r   r   r   r   r   r   )r   configr   r   r   rT     sH   	

zHFUNetModel.__init__Nc                 C   s   | j |||S r   )r   r   )r   r   r+   r   r   r   r   r     s   zHFUNetModel.forwardc                 C   s.   | j jt | j jt | j jt dS r  )r   r   rG   r=   r   r   r  r   r   r   r    s   zHFUNetModel.convert_to_fp16r   )	r   r   r   r  config_classrT   r   r  r   r   r   r   r   r#    s
    
r#  )r   )&r"   abcr   numpyr   torchr    torch.nnr4   torch.nn.functional
functionalrx   transformersr   r   r   r1   r=   r?   rD   rM   ModulerN   rf   r   ri   rl   r|   r   r   r   r   r[   r   r  r  r  r#  r   r   r   r   <module>   s>   


!"t4"$   T)