o
    ߥiE#                    @   s  d dl Z d dlZd dlmZ d dlZd dlmZ d dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ dgZdZdSd	d
Zdd Zdd Zdd Zdd Zdd ZG dd dejZG dd dejZejddZG dd dejZG dd dejZ G dd  d ejZ!d!d" Z"G d#d$ d$ejZ#G d%d& d&ejZ$G d'd( d(ejZ%G d)d* d*ejZ&G d+d, d,ejZ'G d-d. d.ejZ(G d/d0 d0ejZ)G d1d2 d2ejZ*G d3d4 d4ejZ+G d5d6 d6ejZ,G d7d8 d8ejZ-G d9d: d:ejZ.G d;d< d<ejZ/G d=d dejZ0G d>d? d?ejZ1G d@dA dAejZ2G dBdC dCejZ3G dDdE dEejZ4G dFdG dGejZ5G dHdI dIejZ6G dJdK dKejZ7e8dLkrd dMl9m:Z: e0e:j;e:j<e:j=e:j>e:j?e:j@e:jAe:jBe:jCe:jDe:jEd e:jFde:jGdNZHeIeJeKdOdP eHL D dQ dR dS dS )T    N)partial	rearrange)checkpoint_wrapper)RotaryEmbedding)einsumUNetSD_temporalTc                    sP   |d u r }i } fdd|   D } |   D ]\}}| |}|||< q|S )Nc                    s   i | ]\}} |v r||qS  r	   ).0keyvalueprefixr	   g/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/videocomposer/unet_sd.py
<dictcomp>   s    zload_Block.<locals>.<dictcomp>)itemsreplace)stater   
new_prefix
state_dictr   r   new_keyr	   r   r   
load_Block   s   
r   c              	      s   i }|j  |j}|j}|j} fdddg| D } fdd|d g|d d d  D }g }d}	t| dd}
||
 t| d	d}
||
 t| d
d}
||
 d}t| d| d| dd}
||
 |d7 }|  tt|d d |dd  D ]}\}\}}t	|D ]r}d}d}t| d| d| d| d| d}
||
 |d7 }d}|	|v rt| d| d| d| d| d}
||
 |}|d7 }|| |t
|d kr||d krt| ddd}
||
 || |	d }	|d7 }qqd}t| d| d}
||
 |d7 }t| dd| d}
||
 |d7 }t	|jD ]}|d7 }q)t| dd| d}
||
 |d7 }d}tt|d d |dd  D ]\}\}}t	|d D ]}d}d}t| d| d| d| d| d}
||
 |d7 }|d7 }|	|v rt| d| d| d| d| d}
||
 |d7 }|d7 }t	|jD ]}|d7 }q|t
|d kr||krt| d| d| d| d| d}
||
 |d7 }|d7 }|	d9 }	|d7 }qaqUt| dd}
||
 |S )Nc                       g | ]} | qS r	   r	   r
   udimr	   r   
<listcomp>.       z1load_2d_pretrained_state_dict.<locals>.<listcomp>   c                    r   r	   r	   r   r   r	   r   r   /   r         ?time_embeddingr   y_embeddingcontext_embeddingr   zencoder.z.0)r   r   .   zencoder.{encoder_idx}zencoder.{encoder_idx}.0       @zmiddle.zmiddle.1zmiddle.2zdecoder.head)unet_dimunet_res_blocksunet_dim_multunet_attn_scalesr   updateappend	enumerateziprangelentemporal_attn_times)r   cfgnew_state_dictnum_res_blocksdim_multattn_scalesenc_dimsdec_dimsshortcut_dimsscaler   encoder_idxiin_dimout_dimjidxidx_
middle_idx_decoder_idxr	   r   r   load_2d_pretrained_state_dict$   s   &





*




%


*



#
rG   c              	   C   s   |d }|   } t| tdt|| | }tjt|t	|gdd}|d dkrEtj|t
|d d d df gdd}|S )Nr&   i'  r   r   r   )floattorchouterpowarangetodivcatcossin
zeros_like)	timestepsr   halfsinusoidxr	   r	   r   sinusoidal_embedding   s   (rW   c                 C   s   | d uS Nr	   )rV   r	   r	   r   exists   s   rY   c                 C   s   t | r| S t|r| S |S rX   )rY   callable)valdr	   r	   r   default   s   r]   c                 C   sf   |dkrt j| |t jdS |dkrt j| |t jdS t j| |d dd|k }| r1d|d< |S )Nr   devicedtyper   r_   F)rI   onesboolzerosrH   uniform_all)shapeprobr_   maskr	   r	   r   prob_mask_like   s   rj   c                       s8   e Zd Zd
 fdd	Ze		dddZdd	 Z  ZS )RelativePositionBias          c                    s(   t    || _|| _t||| _d S rX   )super__init__num_bucketsmax_distancenn	Embeddingrelative_attention_bias)selfheadsrq   rr   	__class__r	   r   rp      s   
zRelativePositionBias.__init__c                 C   s   d}|  }|d }||dk   | 7 }t|}|d }||k }|t| | t||  ||     }t|t||d }|t|||7 }|S )Nr   r&   r   )	longrI   abslogrH   mathmin	full_likewhere)relative_positionrq   rr   retn	max_exactis_smallval_if_larger	   r	   r   _relative_position_bucket   s(   
z.RelativePositionBias._relative_position_bucketc                 C   s`   t j|t j|d}t j|t j|d}t|dt|d }| j|| j| jd}| |}t|dS )N)r`   r_   zj -> 1 jzi -> i 1)rq   rr   zi j h -> h i j)rI   rL   rz   r   r   rq   rr   ru   )rv   r   r_   q_posk_posrel_pos	rp_bucketvaluesr	   r	   r   forward   s   

zRelativePositionBias.forward)rl   rm   rn   )rm   rn   )__name__
__module____qualname__rp   staticmethodr   r   __classcell__r	   r	   rx   r   rk      s    rk   c                       s8   e Zd ZdZ						d fdd	Zdd	d
Z  ZS )SpatialTransformera  
    Transformer block for image-like data.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    NEW: use_linear for more efficiency instead of the 1x1 convs
    r           NFTc
           
   	      s   t    t rt ts g || _ tjjd|ddd| _	|s1tj
|dddd| _nt|| _t fdd	t|D | _|s]ttj
|dddd| _n	tt|| _|| _d S )
Nrm   ư>T
num_groupsnum_channelsepsaffiner   r   kernel_sizestridepaddingc                    s&   g | ]}t  | d qS ))dropoutcontext_dimdisable_self_attn
checkpointBasicTransformerBlockr
   r\   r   d_headr   r   	inner_dimn_headsuse_checkpointr	   r   r     s    z/SpatialTransformer.__init__.<locals>.<listcomp>)ro   rp   rY   
isinstancelistin_channelsrI   rs   	GroupNormnormConv2dproj_inLinear
ModuleListr1   transformer_blockszero_moduleproj_out
use_linear)
rv   r   r   r   depthr   r   r   r   r   rx   r   r   rp     s2   






zSpatialTransformer.__init__c           
      C   s   t |ts|g}|j\}}}}|}| |}| js| |}t|d }| jr-| |}t| j	D ]\}}	|	||| d}q2| jrG| 
|}t|d||d }| jsY| 
|}|| S )Nzb c h w -> b (h w) ccontextzb (h w) c -> b c h whw)r   r   rg   r   r   r   r   
contiguousr/   r   r   )
rv   rV   r   bcr   r   x_inr>   blockr	   r	   r   r   /  s$   





zSpatialTransformer.forward)r   r   NFFTrX   r   r   r   __doc__rp   r   r   r	   r	   rx   r   r      s    *r   ATTN_PRECISIONfp32c                       s0   e Zd Z				d	 fdd	Zd
ddZ  ZS )CrossAttentionNrl   @   r   c                    s   t    || }t||}|d | _|| _tj||dd| _tj||dd| _tj||dd| _	t
t||t|| _d S )N      Fbias)ro   rp   r]   r<   rw   rs   r   to_qto_kto_v
SequentialDropoutto_out)rv   	query_dimr   rw   dim_headr   r   rx   r	   r   rp   J  s   



zCrossAttention.__init__c           
         s2  | j  | |}t||}| |}| |}t fdd|||f\}}}tdkrUtjddd |	 |	 }}t
d||| j }W d    n1 sOw   Y  n
t
d||| j }~~t|rt|d}t|jj }t|d	 d
}|| | |jdd}t
d||}	t|	d d
}	| |	S )Nc                       t | d dS )Nzb n (h d) -> (b h) n dr   r   tr   r	   r   <lambda>f      z(CrossAttention.forward.<locals>.<lambda>r   Fcuda)enableddevice_typezb i d, b j d -> b i jzb ... -> b (...)zb j -> (b h) () jr   r    r   zb i j, b j d -> b i dz(b h) n d -> b n (h d))rw   r   r]   r   r   map_ATTN_PRECISIONrI   autocastrH   r   r<   rY   r   finfor`   maxrepeatmasked_fill_softmaxr   )
rv   rV   r   ri   qkvsimmax_neg_valueoutr	   r   r   r   ^  s2   






zCrossAttention.forward)Nrl   r   r   )NNr   r   r   rp   r   r   r	   r	   rx   r   r   H  s    r   c                       s<   e Zd Z					d fdd	ZdddZdd	d
Z  ZS )r   r   NTFc	           
         s   t    t}	|| _|	||||| jr|nd d| _t|||d| _|	|||||d| _t	|| _
t	|| _t	|| _|| _d S )N)r   rw   r   r   r   )r   glu)r   r   rw   r   r   )ro   rp   r   r   attn1FeedForwardffattn2rs   	LayerNormnorm1norm2norm3r   )
rv   r   r   r   r   r   gated_ffr   r   attn_clsrx   r	   r   rp     s,   
	
zBasicTransformerBlock.__init__c                 C   s   t | j||f|  | j S rX   )r   _forward
parametersrv   rV   r   r	   r	   r   forward_  s   zBasicTransformerBlock.forward_c                 C   sR   | j | || jr|nd d| }| j| ||d| }| | || }|S )Nr   )r   r   r   r   r   r   r   r   r	   r	   r   r     s   zBasicTransformerBlock.forward)r   NTTFrX   )r   r   r   rp   r   r   r   r	   r	   rx   r   r     s    
r   c                       $   e Zd Z fddZdd Z  ZS )GEGLUc                    s    t    t||d | _d S )Nr&   )ro   rp   rs   r   proj)rv   dim_indim_outrx   r	   r   rp     s   
zGEGLU.__init__c                 C   s&   |  |jddd\}}|t| S )Nr&   r    r   )r   chunkFgelu)rv   rV   gater	   r	   r   r     s   zGEGLU.forwardr   r	   r	   rx   r   r     s    r   c                 C   s   |   D ]}|   q| S )z<
    Zero out the parameters of a module and return it.
    )r   detachzero_)modulepr	   r	   r   r     s   r   c                       s&   e Zd Zd	 fdd	Zdd Z  ZS )
r   N   Fr   c                    sh   t    t|| }t||}|stt||t nt||}t|t	|t||| _
d S rX   )ro   rp   intr]   rs   r   r   GELUr   r   net)rv   r   r   multr   r   r   
project_inrx   r	   r   rp     s   




zFeedForward.__init__c                 C   s
   |  |S rX   )r  rv   rV   r	   r	   r   r     s   
zFeedForward.forward)Nr  Fr   r   r	   r	   rx   r   r     s    
r   c                       0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )
UpsampleaA  
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    r&   Nr   c                    sJ   t    || _|p|| _|| _|| _|r#tj| j| jd|d| _d S d S )N   r   )	ro   rp   channelsout_channelsuse_convdimsrs   r   conv)rv   r  r  r  r  r   rx   r	   r   rp     s   

zUpsample.__init__c                 C   st   |j d | jks
J | jdkr(tj||j d |j d d |j d d fdd}ntj|ddd}| jr8| |}|S )Nr   r  r&   r  nearest)mode)scale_factorr  )rg   r  r  r  interpolater  r  r  r	   r	   r   r     s   
$
zUpsample.forwardr&   Nr   r   r	   r	   rx   r   r    s    r  c                       sB   e Zd ZdZ								d fdd	Zdd	 Zd
d Z  ZS )ResBlocka  
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    NFr&   Tc                    s  t    || _|| _|| _|p|| _|| _|| _|
| _t	
t	d|t	 t	j|| jddd| _|p5|	| _|rHt|d|| _t|d|| _n|	rYt|d|| _t|d|| _nt	  | _| _t	
t	 t	||rpd| j n| j| _t	
t	d| jt	 t	j|dtt	j| j| jddd| _| j|krt	 | _n|rt||| jddd| _n	t	|| jd| _| jrt| j| jd|d	| _d S d S )
Nrm   r  r   r  Fr&   )r  皙?)r   use_image_dataset)ro   rp   r  emb_channelsr   r  r  use_scale_shift_normuse_temporal_convrs   r   r   SiLUr   	in_layersupdownr  h_updx_upd
DownsampleIdentityr   
emb_layersr   r   
out_layersskip_connectionconv_ndTemporalConvBlock_v2temopral_conv)rv   r  r   r   r  r  r!  r  updownr"  r  rx   r	   r   rp     sj   






zResBlock.__init__c                 C   s   |  |||S )a  
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        )r   )rv   rV   emb
batch_sizer	   r	   r   r   O  s   zResBlock.forwardc                 C   s0  | j r#| jd d | jd }}||}| |}| |}||}n| |}| ||j}t|jt|jk rI|d }t|jt|jk s;| j	rr| j
d | j
dd  }}	tj|ddd\}
}||d|
  | }|	|}n	|| }| 
|}| || }| jrt|d|d}| |}t|d	}|S )
Nr    ).Nr   r   r&   r   (b f) c h w -> b c f h wr   b c f h w -> (b f) c h w)r%  r$  r&  r'  r*  typer`   r2   rg   r!  r+  thr   r,  r"  r   r/  )rv   rV   r2  r3  in_restin_convr   emb_outout_normout_restr<   shiftr	   r	   r   r   X  s0   







zResBlock._forward)NFFr&   FFTF)r   r   r   r   rp   r   r   r   r	   r	   rx   r   r    s    G	r  c                       r  )
r(  aD  
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    r&   Nr   c                    s|   t    || _|p|| _|| _|| _|dkrdnd}|r,tj| j| jd||d| _d S | j| jks4J t	|||d| _d S )Nr  r&   )r   r&   r&   r   r   )r   r   )
ro   rp   r  r  r  r  rs   r   opavg_pool_nd)rv   r  r  r  r  r   r   rx   r	   r   rp   ~  s    

zDownsample.__init__c                 C   s   |j d | jks
J | |S Nr   )rg   r  r@  r  r	   r	   r   r     s   
zDownsample.forwardr  r   r	   r	   rx   r   r(  u  s    r(  c                       s&   e Zd Z fddZdddZ  ZS )Resamplec                    s0   |dv sJ t t|   || _|| _|| _d S )N)noneupsample
downsample)ro   rC  rp   r?   r@   r  )rv   r?   r@   r  rx   r	   r   rp     s
   
zResample.__init__Nc                 C   sh   | j dkr|d usJ tj||jdd  dd}|S | j dkr2tj|tdd |jdd  D d}|S )	NrE  r  )sizer  rF  c                 s   s    | ]}|d  V  qdS )r&   Nr	   r   r	   r	   r   	<genexpr>  s    z#Resample.forward.<locals>.<genexpr>)output_size)r  r  r  rg   adaptive_avg_pool2dtuple)rv   rV   	referencer	   r	   r   r     s   

zResample.forwardrX   r   r	   r	   rx   r   rC    s    rC  c                       s.   e Zd Z			d	 fdd	Zd
ddZ  ZS )ResidualBlockTrD  r   c                    s   t t|   || _|| _|| _|| _|| _t	t
d|t tj||ddd| _t|||| _t	t t||r@|d n|| _t	t
d|t t|tj||ddd| _||krft nt||d| _tj| jd j d S )Nrm   r  r   r  r&   r    )ro   rN  rp   r?   	embed_dimr@   r!  r  rs   r   r   r#  r   layer1rC  resampler   	embeddingr   layer2r)  shortcutinitzeros_weight)rv   r?   rO  r@   r!  r  r   rx   r	   r   rp     s2   zResidualBlock.__init__Nc                 C   s   |  ||}| jd |  | jd d ||}| |dd|j}| jrJ|jddd\}}| jd |d|  | }| jdd  |}n	|| }| |}|| 	| }|S )Nr    r&   r   r   r   )
rQ  rP  rR  	unsqueezer7  r`   r!  r   rS  rT  )rv   rV   erM  identityr<   r>  r	   r	   r   r     s   $
zResidualBlock.forward)TrD  r   rX   r   r	   r	   rx   r   rN    s     rN  c                       s(   e Zd Zd fdd	ZdddZ  ZS )AttentionBlockNc                    s   |r|| n|}|| }|| |ksJ t t|   || _|| _|| _|| _t|d| _	t
d|| _t
||d d| _|d urLt
||d | _t
||d| _t
j| jj d S )Ng      пrm   r  r   r&   )ro   r[  rp   r   r   	num_headshead_dimr}   rK   r<   rs   r   r   r   to_qkvr   
context_kvr   rU  rV  rW  )rv   r   r   r\  r]  rx   r	   r   rp     s   zAttentionBlock.__init__c                 C   s&  |}g |  | j| jR \}}}}}}	| |}| |||d |	|| jddd\}
}}|dura| ||d|d |		ddddjddd\}}t
j||gdd}t
j||gdd}t
|
dd| j || j }tj|dd}t
||dd}|||||}| |}|| S )	zGx:       [B, C, H, W].
            context: [B, L, C] or None.
        r  r   r   Nr    r&   r   rG  )rH  r\  r]  r   r^  viewr   r_  reshapepermuterI   rO   matmul	transposer<   r  r   r   )rv   rV   r   rZ  r   r   r   r   r   r\   r   r   r   ckcvattnr	   r	   r   r     s,   &
.
 
zAttentionBlock.forwardNNNrX   r   r	   r	   rx   r   r[    s    r[  c                       s8   e Zd Z					d	 fdd	Z			d
ddZ  ZS )TemporalAttentionBlockr  rm   NFc                    s~   t    || }|| |ksJ || _|| _|d | _|| _|| }td|| _|| _	t
||d | _t
||| _d S )Nr   rm   r  )ro   rp   r  use_sim_maskr<   rw   rs   r   r   
rotary_embr   r^  r   )rv   r   rw   r   rk  r  rj  
hidden_dimrx   r	   r   rp     s   

zTemporalAttentionBlock.__init__c                 C   s  |}|j d |j d |j}}}| |}t|d}| |jddd}	t|rA| rA|	d }
| |
}t|d|d}|| S t|	d	 d
| j	d}t|	d d
| j	d}t|	d d
| j	d}|| j
 }t| jru| j|}| j|}td||}t|r|| }|d u r|d ur|d d d d d f |d d d d d f  }|dd}|| t|jj }n:t|r|  stj||f|tjd}tj||tjd}tt|dt|dt|d}|| t|jj }| jrtjtj||f|tjdd	d}|| t|jj }||jddd  }|jdd}td||}t|d}| |}t|d|d}| jrH|d	|  }|S || }|S )Nr&   rG  zb c f h w -> b (h w) f cr  r    r   zb (h w) f c -> b c f h wr   r   z... n (h d) -> ... h n dr   z!... h i d, ... h j d -> ... h i jr^   zb -> b 1 1 1 1zi j -> 1 1 1 i j)diagonalT)r   keepdimz!... h i j, ... h j d -> ... h i dz... h n d -> ... n (h d))rg   r_   r   r   r^  r   rY   rf   r   rw   r<   rk  rotate_queries_or_keysrI   r   rX  masked_fillr   r`   r   rb   rc   eyer   rj  trilamaxr  r   r  )rv   rV   pos_biasfocus_present_mask
video_maskrZ  r   heightr_   qkvr   r   r   r   r   r   ri   attend_all_maskattend_self_masksim_maskrg  r	   r	   r   r   .  sj   




,


zTemporalAttentionBlock.forward)r  rm   NFFrh  r   r	   r	   rx   r   ri    s    ri  c                       s<   e Zd ZdZ								d fdd	Zdd	d
Z  ZS )TemporalTransformerz
    Transformer block for image-like data.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    r   r   NFTc                    s  t    || _|
| _d| _| jrd  t ts g || _ tj	j
d|ddd| _|s;t	j|dddd| _nt	|| _| jrLt	tt| _t	 fd	d
t|D | _|sptt	j|dddd| _ntt	|| _| jrt	tt| _|| _d S )NFrm   r   Tr   r   r   r   c              
      s$   g | ]}t  | d qS ))r   r   r   r   r   r   r   r   r   r   r   r	   r   r     s    z0TemporalTransformer.__init__.<locals>.<listcomp>)ro   rp   multiply_zeroonly_self_attuse_adaptorr   r   r   rI   rs   r   r   Conv1dr   r   frames
adaptor_inr   r1   r   r   r   adaptor_outr   )rv   r   r   r   r   r   r   r   r   r   r  r~  rx   r}  r   rp     sD   




	
zTemporalTransformer.__init__c                 C   s  | j rd }t|ts|g}|j\}}}}}|}| |}| js+t|d }| |}| jr=t|d| j	d }| |}| j r_t|d }t
| jD ]\}	}
|
|}qLt|d|d }nFt|d|d }t
| jD ]7\}	}
t||	 d| j	d ||	< t|D ] }t||	 | d	|| | j	 | j	d
 }|
|| |d||< qqm| jr| |}t|d||d }| jst|d }| |}t|d|||d }| jrd| | }|S || }|S )Nzb c f h w -> (b h w) c fz(b f) c h w -> b (h w) f c)fzbhw c f -> bhw f cz(b hw) f c -> b hw f cr5  z(b hw) c f -> b hw f cz(b f) l con -> b f l conzf l con -> (f r) l con)rr  r   zb (h w) f c -> b f c h wr   zb hw f c -> (b hw) c fz(b h w) c f -> b c f h w)r   r   r   r   )r  r   r   rg   r   r   r   r   r   r  r/   r   r1   r   r   r~  )rv   rV   r   r   r   r  r   r   r   r>   r   rA   context_i_jr	   r	   r   r     sr   








zTemporalTransformer.forward)r   r   NFFTTFrX   r   r	   r	   rx   r   r|    s    4r|  c                       s:   e Zd Z						d
 fdd	Z			ddd	Z  ZS )TemporalAttentionMultiBlockr  rm   NFr   c                    s6   t    t fddt|D | _d S )Nc              	      s   g | ]}t  qS r	   )ri  )r
   rE   r   r   rw   rk  r  rj  r	   r   r     s    
z8TemporalAttentionMultiBlock.__init__.<locals>.<listcomp>)ro   rp   rs   r   r1   
att_layers)rv   r   rw   r   rk  r  rj  r3   rx   r  r   rp     s   

z$TemporalAttentionMultiBlock.__init__c                 C   s   | j D ]	}|||||}q|S rX   )r  )rv   rV   rt  ru  rv  layerr	   r	   r   r     s   
z#TemporalAttentionMultiBlock.forward)r  rm   NFFr   rh  r   r	   r	   rx   r   r    s    r  c                       ,   e Zd Z			d fdd	Zdd Z  ZS )	InitTemporalConvBlockNr   Fc                    s   t t|   |d u r|}|| _|| _|| _ttd|t	 t
|tj||ddd| _tj| jd j tj| jd j d S Nrm   )r  r   r   )r   r   r   r  r    )ro   r  rp   r?   r@   r  rs   r   r   r#  r   Conv3dr  rU  rV  rW  r   rv   r?   r@   r   r  rx   r	   r   rp     s   zInitTemporalConvBlock.__init__c                 C   s0   |}|  |}| jr|d|  }|S || }|S Nr   )r  r  rv   rV   rZ  r	   r	   r   r   ,  s   
zInitTemporalConvBlock.forwardNr   Fr   r	   r	   rx   r   r    s    r  c                       r  )	TemporalConvBlockNr   Fc                    s   t t|   |d u r|}|| _|| _|| _ttd|t	 tj
||ddd| _ttd|t	 t|tj
||ddd| _tj| jd j tj| jd j d S r  )ro   r  rp   r?   r@   r  rs   r   r   r#  r  conv1r   conv2rU  rV  rW  r   r  rx   r	   r   rp   8  s    zTemporalConvBlock.__init__c                 C   s:   |}|  |}| |}| jr|d|  }|S || }|S r  )r  r  r  r  r	   r	   r   r   P  s   

zTemporalConvBlock.forwardr  r   r	   r	   rx   r   r  6  s    r  c                       r  )	r.  Nr   Fc                    s  t t|   |d u r|}|| _|| _|| _ttd|t	 tj
||ddd| _ttd|t	 t|tj
||ddd| _ttd|t	 t|tj
||ddd| _ttd|t	 t|tj
||ddd| _tj| jd j tj| jd j d S r  )ro   r.  rp   r?   r@   r  rs   r   r   r#  r  r  r   r  conv3conv4rU  rV  rW  r   r  rx   r	   r   rp   ]  s0   zTemporalConvBlock_v2.__init__c                 C   sN   |}|  |}| |}| |}| |}| jr!|d|  }|S || }|S )Nr   )r  r  r  r  r  r  r	   r	   r   r   {  s   



zTemporalConvBlock_v2.forwardr  r   r	   r	   rx   r   r.  [  s    r.  c                       s   e Zd Zdddddddg dddd	g d
dddddddddddddgddddf fdd	Z															dddZ	dddZ  ZS )r      i      rl      )r   r&   r  r  Nr   r  )      ?g      ?g      ?Tr  r   Fr  textri   c           -         s

   d }|
r|
n d }
t t|   || _|| _|| _|| _ | _|| _|| _	|| _
|| _|| _|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _d}d} d}! fdddg|	 D }" fdd|	d g|	d d d  D }#g }$d	}%t |d
r|j!r|j!}&nd}&t"#t"$ |t"% t"$||| _&t"#t"$ddt"% t"$dd| _'d| jv r	t"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _*t+d|||d|d|&d| _,d| jv rMt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _-t+d|||d|d|&d| _.d| jv rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _/t+d|||d|d|&d| _0d| jv r|rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |ddddnd | _1t+d|||d|d|&d| _2d| jv rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _3t+d|||d|d|&d| _4d| jv rbt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _5t+d|||d|d|&d| _6d| jv rt"#t"j(d|d dddt"% t")dt"j(|d |d ddddt"% t"j(|d |dddd| _7t+d|||d|d|&d| _8t9|| _|rt:st;t<d|| _=t>|
dd| _?| jrt"#t"$ |t"% t"$||| _@t"jAB| j@d jC t"jAB| j@d jD t"E | _F|jGr	t"# | _Ht"Et"j(| j|  dddg}'nt"#t"j(| j| | jddd| _Ht"Et"j(| j dddg}'|rLt:r>|'ItJ |
|| ||!||d n|'ItK |
|| j=||d | jFI|' |$I  tLtM|"d d |"dd  D ]\}(\}}tN|D ]})t"EtO||||d|dg}*|%|v r|*ItP||| |d| j	ddd  | jrt:r|*ItJ||| || ||!||d n|*ItK||
|| j=|||d! |}| jFI|* |$I| |(tQ|	d kr|)|d krtR|dd|d"}+|$I| |%d# }%| jFI|+ qpqft"EtO|||d|d$tP||| |d| j	ddd g| _S| jr;t:r+| jSItJ||| || ||!||d n| jSItK||
|| j=|||d! | jSItO|||dd% t"E | _TtLtM|#d d |#dd  D ]\}(\}}tN|d D ]y})t"EtO||$U  |||d|d$g}*|%|v r|*ItP||| |ddddd  | jrt:r|*ItJ||| || ||!||d n|*ItK||
|| j=|||d! |}|(tQ|	d kr|)|krtV|dd#|d"},|%d#9 }%|*I|, | jTI|* qgq[t"#t"Wd|t"% t"j(|| jddd| _Xt"jAB| jXd jC d S )&Nr  rm   Fr   c                    r   r	   r	   r   r   r	   r   r     r   z,UNetSD_temporal.__init__.<locals>.<listcomp>c                    r   r	   r	   r   r   r	   r   r     r   r    r!   adapter_transformer_layersi   depthmapr  r  )rn   rn   r&   r?  皙?)rw   r   
dim_head_k
dim_head_vdropout_attemlp_dimdropout_ffnr   motioncannyri   sketchsingle_sketchlocal_image)rw   rr   r   )r   r   r   r   r~  )rk  r3   r  )r  r!  r  T)r   r   r   r   )rk  r  rj  r3   )r  r  r'   )r!  r  )r!  )Yro   r   rp   zero_yblack_image_featurer4   r?   r   y_dimr   hist_dim
concat_dimrO  r@   r7   r\  r]  r6   r8   r!  r3   temporal_attentionr   r  use_fps_conditionrj  training
inpaintingvideo_compositionsmisc_dropout
p_all_zero
p_all_keephasattrr  rs   r   r   r#  
time_embedpre_image_conditionr   AdaptiveAvgPool2ddepth_embeddingTransformer_v2depth_embedding_aftermotion_embeddingmotion_embedding_aftercanny_embeddingcanny_embedding_aftermasked_embeddingmask_embedding_aftersketch_embeddingsketch_embedding_aftersingle_sketch_embeddingsingle_sketch_embedding_afterlocal_image_embeddinglocal_image_embedding_afterDropPathUSE_TEMPORAL_TRANSFORMERr   r~   rk  rk   time_rel_pos_biasfps_embeddingrU  rV  rW  r   r   input_blocksresume	pre_imager.   r|  r  r/   r0   r1   r  r   r2   r(  middle_blockoutput_blockspopr  r   r   )-rv   r4   r?   r   r  r   r  r  r@   r7   r\  r]  r6   r8   r!  r   r3   r  r   r  r  rj  r  r  r  r  r  r  r  r  rO  use_linear_in_temporaltransformer_depthdisabled_sar9   r:   r;   r<   r  
init_blockr>   rA   r   rF  rE  rx   r   r   rp     sR   &








	


		
 
:




		
6zUNetSD_temporal.__init__r   r   c           %         s  | j s|	d u sJ d|j\ }}}}|j | _|dkr'd }d|| d < nt| fdd}| jrCtsC| j|jd |jd}nd }tj	 tj
d|j}tj	 tj
d|j}| jrt | jk  }t | jk  }t }d	||d| < d	|||||  < ||@  rJ t| j||d
}| | j|||}|d urt|d}| |}|jd }| t|d d}t|d |d}||| }|d urt|d}| |}|jd }| t|d d}t|d |d}||| }|d urXt|d}| |}|jd }| t|d d}t|d |d}t| jdrR| jj rR| jrRt | jj!k }|d d d d d d f }|"|# d}|| }n||| }|
d urt|
d}
| $|
}
|
jd }| %t|
d d}
t|
d |d}
|||
 }|d urt|d}| &|}|jd }| 't|d d}t|d |d}||| }|d urt|d}| (|}|jd }| )t|d d}t|d |d}||| }|	d urt|	d}	| *|	}	|	jd }| +t|	d d}	t|	d |d}	|||	 }tj,||gdd}t|d}| -|}t|d d}| j.r>|d ur>| /t0|| j1| 2t0|| j1 }n	| /t0|| j1}| d| j3} |d urb||}!tj,| |!gdd} n| j45 dd}!tj,| |!gdd} |d ur|| 6|}"tj,| |"gdd} |j7|dd}| j7|dd} t|d}g }#| j8D ]}$| 9|$||| |||}|#:| q| j;D ]}$| 9|$||| |||}q| j<D ]&}$tj,||#= gdd}| j9|$||| |||t>|#dkr|#d nd d}q| ?|}t|d d}|S )Nzinpainting is not supportedr   Fc                      s   t  fdS )Nra   )rj   r	   batchr_   prob_focus_presentr	   r   r   e  s    z)UNetSD_temporal.forward.<locals>.<lambda>r&   ra   r`   Tzerokeepr6  z(b f) c h w -> (b h w) f cr5  z(b h w) f c -> b c f h wr   r   p_zero_motion_aloner   r   r4  )repeatsr   r    )rM  )@r  rg   r_   r  r]   r  r  r  rI   rd   rc   rM   r  randr  sumr  randpermanyr   r  	new_zerosr  r   r  r  r  r  r  r  r  r4   r  p_zero_motionrp  r   r  r  r  r  r  r  r  r  rO   r  r  r  rW   r   r  r   r  r   r  repeat_interleaver  _forward_singler.   r  r  r  r2   r   )%rv   rV   r   yr   imager  r  r  maskedr  r  	histogramfpsrv  ru  r  mask_last_frame_numr   r  r   r   r  r  r  nzeronkeepindexr  concatmotion_drY  r   	y_contextimage_contextxsr   r	   r  r   r   D  sD  






































zUNetSD_temporal.forwardc	           
      C   s  t |tr| jrt|n|}| }||||}|S t |tr5| jr&t|n|}| }|||| j}|S t |trJ| jrAt|n|}|||}|S t |trl| jrVt|n|}t	|d| jd}|||}t	|d}|S t |t
r| jrxt|n|}|||}|S t |tr| jrt|n|}|||}|S t |tr|||}|S t |tr||}|S t |tr||}|S t |tr|||}|S t |tr| jrt|n|}t	|d| jd}|||||}t	|d}|S t |tr| jrt|n|}t	|d| jd}|||||}t	|d}|S t |tr0| jrt|n|}t	|d| jd}||}t	|d}|S t |trS| jr>t|n|}t	|d| jd}||}t	|d}|S t |tjrn|D ]}	| |	|||||||}q\|S ||}|S )Nr4  r5  r6  )r   rN  r   r   r   r  r  r   r|  r   r   r   r   r  r(  rC  ri  r  r  r  rs   r   r  )
rv   r  rV   rY  r   r  ru  rv  rM  r   r	   r	   r   r    s   
	?
:

6


0

,

(

&
$
"

 




zUNetSD_temporal._forward_single)NNNNNNNNNNNNNr   r   rX   )r   r   r   rp   r   r  r   r	   r	   rx   r   r     sh       @
 `c                       r   )PreNormattentionc                        t    t|| _|| _d S rX   ro   rp   rs   r   r   fnrv   r   r  rx   r	   r   rp   l     

zPreNormattention.__init__c                 K   s   | j | |fi || S rX   r  r   rv   rV   kwargsr	   r	   r   r   q     zPreNormattention.forwardr   r	   r	   rx   r   r  j      r  c                       r   )PreNormattention_qkvc                    r  rX   r  r  rx   r	   r   rp   w  r  zPreNormattention_qkv.__init__c                 K   s,   | j | || || |fi || S rX   r  )rv   r   r   r   r  r	   r	   r   r   |  s   ,zPreNormattention_qkv.forwardr   r	   r	   rx   r   r  u  r  r  c                       &   e Zd Zd fdd	Zdd Z  ZS )		Attentionrl   r   r   c                    s   t    || }|dko||k }|| _|d | _tjdd| _tj||d dd| _|r>t	t||t
|| _d S t | _d S )Nr   r   r    r   r  Fr   )ro   rp   rw   r<   rs   Softmaxattendr   r^  r   r   r)  r   rv   r   rw   r   r   r   project_outrx   r	   r   rp     s   



zAttention.__init__c           
         s   g |j | jR \}}} | |jddd}t fdd|\}}}td||| j }| |}td||}	t|	d}	| 	|	S )	Nr  r    r   c                    r   )Nb n (h d) -> b h n dr   r   r   r   r	   r   r     r   z#Attention.forward.<locals>.<lambda>b h i d, b h j d -> b h i jb h i j, b h j d -> b h i db h n d -> b n (h d))
rg   rw   r^  r   r   r   r<   r  r   r   )
rv   rV   rE   rx  r   r   r   dotsrg  r   r	   r   r   r     s   


zAttention.forwardrl   r   r   r   r	   r	   rx   r   r    s    r  c                       r  )	Attention_qkvrl   r   r   c                    s   t    || }|dko||k }|| _|d | _tjdd| _tj||dd| _tj||dd| _	tj||dd| _
|rNtt||t|| _d S t | _d S )Nr   r   r    r   Fr   )ro   rp   rw   r<   rs   r  r  r   r   r   r   r   r   r)  r   r	  rx   r	   r   rp     s    



zAttention_qkv.__init__c           
      C   s   g |j | jR \}}}}|j d }| |}| |}| |}t|d|d}t|d||d}t|d||d}td||| j }| |}td||}	t|	d}	| 	|	S )Nr   r  r   r  r  r  r  )
rg   rw   r   r   r   r   r   r<   r  r   )
rv   r   r   r   rE   r   bkr  rg  r   r	   r	   r   r     s   






zAttention_qkv.forwardr  r   r	   r	   rx   r   r    s    r  c                       r   )PostNormattentionc                    r  rX   r  r  rx   r	   r   rp     r  zPostNormattention.__init__c                 K   s   |  | j|fi || S rX   )r   r  r   r	   r	   r   r     r  zPostNormattention.forwardr   r	   r	   rx   r   r    r  r  c                       s6   e Zd Z								d
 fdd	Zdd	 Z  ZS )r  rl         r  r   c	           
         s`   t    tg | _|| _t|D ]}	| jtt|t	||||dt
|||dg qd S )N)rw   r   r   )r   )ro   rp   rs   r   layersr   r1   r.   r  r  r   )
rv   rw   r   r  r  r  r  r  r   rE   rx   r	   r   rp     s&   
	zTransformer_v2.__init__c                 C   sf   | j d d D ]\}}||}||| }q| jdkr1| j dd  D ]\}}||}||| }q"|S rB  )r  r   )rv   rV   rg  r   r	   r	   r   r     s   
zTransformer_v2.forward)rl   r  r  r  r  r  r  r   r   r	   r	   rx   r   r    s    r  c                       s8   e Zd ZdZ fddZdddddZdd	 Z  ZS )
r  zSDropPath but without rescaling and supports optional all-zero and/or all-keep.
    c                    s   t t|   || _d S rX   )ro   r  rp   r  )rv   r  rx   r	   r   rp     s   
zDropPath.__init__Nr  c          
         s  j st|dkr|d S |S |d }|d}t|jk  }|j|tjd}|d ur2d||< |d ur:d||< t	|d }|t
t|d |  }|d ur`tj|t	|d gdd}|| d |< t fdd|D }	t|dkr|	d S |	S )	Nr   r   r  Fr   r   c                 3   s     | ]}|  | V  qd S rX   )	broadcastr   
multiplierrv   r	   r   rI    s    z#DropPath.forward.<locals>.<genexpr>)r  r2   rH  rI   r  r  r  new_onesrc   r   r  rO   rL  )
rv   r  r  argsrV   r   r   ri   r  outputr	   r  r   r     s$   

zDropPath.forwardc                 C   s<   | d| dksJ | dfd|jd   }||S )Nr   )r   r   )rH  ndimr`  )rv   srcdstrg   r	   r	   r   r    s   
zDropPath.broadcast)r   r   r   r   rp   r   r  r   r	   r	   rx   r   r    s
    r  __main__)r4   )r?   r   r  r   r@   r7   r\  r]  r6   r8   r   r3   r   r  r  c                 c   s    | ]	\}}|  V  qd S rX   )numel)r
   r   r  r	   r	   r   rI  5  s    rI  i   zM parametersrX   )Mr}   os	functoolsr   rI   torch.nnrs   torch.nn.functional
functionalr  einopsr   fairscale.nn.checkpointr   rotary_embedding_torchr   r   __all__r  r   rG   rW   rY   r]   rj   Modulerk   r   environgetr   r   r   r   r   r   r  r  r(  rC  rN  r[  ri  r|  r  r  r  r.  r   r  r  r  r  r  r  r  r   configr4   unet_in_dimr)   
unet_y_dimunet_context_dimunet_out_dimr+   unet_num_headsunet_head_dimr*   r,   unet_dropoutr   r  modelprintr	  r  named_parametersr	   r	   r	   r   <module>   s   
 ,J9.	%}%17ot!%.     f&&
*