o
    }o™i  ã                	   @   s¬  d dl Z d dlmZ d dlm  mZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ G dd	„ d	ejƒZG d
d„ dejƒZedkrÔedddZedd„ e ¡ D ƒƒZ e!e ƒ e  "dddd¡Z#e  "ddd¡Z$e  %dd¡Z&e  %d¡Z'ee#e'e$e&ƒZ(e!e(j)ƒ edddZ*edd„ e* ¡ D ƒƒZ e!e ƒ e*e  +dddd¡e  +dddd¡e  %d¡e  +ddd¡e  %dd¡ƒZ(e!e(j)ƒ dS dS )é    N)ÚSelfAttentionPooling)ÚConditionalSequentialÚDBlockÚFusedCrossAttentionBlockÚResBlockÚStackedCrossAttentionBlockÚUBlock)ÚLearnedSinusoidalPosEmbÚUnLearnedSinusoidalPosEmb)Ú
Downsample)ÚUpsampleLearnable)ÚlinearÚnormalizationÚzero_modulec                       sr   e Zd ZdZdddg d¢dddddd	g d
¢dddddddddddf‡ fdd„	Z	ddd„Zdddœdd„Z‡  ZS )Ú	UNetModelaŠ	  
    The full UNet model with attention and timestep embedding used for Imagen Base and SR model.

    :param embed_dim: Dimension of embeddings. Also used to calculate the number of channels in ResBlock.
    :param image_size: Input image size. Used to calculate where to inject attention layers in UNet.
    :param channels: Input channel number, defaults to 3.
    :param text_embed_dim: Dimension of conditioned text embedding. Different text encoders and different model versions have different values, defaults to 512
    :param num_res_blocks: Number of ResBlock in each level of UNet, defaults to 3.
    :param channel_mult: Used with embed_dim to calculate the number of channels for each level of UNet, defaults to [1, 2, 3, 4]
    :param num_attn_heads: The number of heads in the attention layer, defaults to 4.
    :param per_head_channels: The number of channels per attention head, defaults to 64.
    :param cond_dim: Dimension of Conditioning projections, defaults to 512.
    :param attention_type: Type of attention layer, defaults to 'fused'.
    :param feature_pooling_type: Type of pooling, defaults to 'attention'.
    :param learned_sinu_pos_emb_dim: Dimension of learned time positional embedding. 0 for unlearned timestep embeddings. Defaults to 16
    :param attention_resolutions: List of resolutions to inject attention layers. Defaults to [8, 16, 32]
    :param dropout: The rate of dropout, defaults to 0.
    :param use_null_token: Whether to create a learned null token for attention, defaults to False.
    :param init_conv_kernel_size: Initial Conv kernel size, defaults to 3.
    :param gradient_checkpointing: Whether to use gradient checkpointing, defaults to False.
    :param scale_shift_norm: Whether to use scale shift norm, defaults to False.
    :param stable_attention: Whether to use numerically-stable attention calculation, defaults to True.
    :param flash_attention: Whether to use flash attention calculation, defaults to False.
    :param resblock_updown: Whether to use ResBlock or Downsample/Upsample, defaults to False.
    :param resample_with_conv: When resblock_updown=False, whether to use conv in addition to Pooling&ConvTranspose. Defaults to True.
    :param low_res_cond: Whether conditioned on low-resolution input, used for SR model. Defaults to False.
    :param noise_cond_aug: Whether to add noise conditioned augmentation with low-resolution input. Defaults to False.
    é   é   )é   é   r   é   r   é@   ÚfusedÚ	attentioné   )é   r   é    FTc           1         sà  t ƒ  ¡  |
dkrt}n|
dkrt}ntd |
¡ƒ‚|d }|dks$J ‚|dkrDt|ƒ}|d }t |t 	||¡t 
¡ t 	||¡¡| _nt|ƒ}t |t||ƒt 
¡ t||ƒ¡| _|dksc|dkscJ ‚|| _|dkr|t t|d	t |¡t 	||	¡¡| _t||	ƒ| _t t |	¡t 	|	|¡t 
¡ t 	||¡¡| _|rªt tjdd|	| jjjd
¡| _|| _g }t|ƒ}|| _|D ]}| |t|ƒ ¡ q¸|| _|| _ | j rú| jsÔJ dƒ‚|dkrát|ƒ}|d } nt|ƒ}|} t |t 	| |¡t 
¡ t 	||¡¡| _!|rd| n|}!||d  }"t"tj#|!|"||d dƒ| _$t%|tƒr%|gt&|ƒ }#n|}#d}$|"}%d}&t '| j$g¡| _(|%g}'t)|ƒD ]v\}(})|#|( }t*|ƒD ]6}*|)| }+t+|%|||+|$||ddg},|+}%|&|v rp|, ||%||||||	d¡ | j( t"|,Ž ¡ |' |%¡ qH|(t&|ƒd k}-|-s±| j( t"|ržt+|%|||%|$||ddd	nt,|%||$|%dƒ¡ |' |%¡ |&d9 }&q<t"t+|%|||$||dd||%||||||	dt+|%|||$||ddƒ| _-t 'g ¡| _.t/t)|ƒƒd d d… D ]p\}(})|#|( }t*|d ƒD ]`}.|' 0¡ }/||) }+t+|%|/ |||+|$||ddg},|+}%|&|v r"|, ||%d|||||	d¡ |.|k}0|(rK|0rK|, |r>t+|%|||%|$||ddd	nt1|%||$|%d¡ |&d }&| j. t"|,Ž ¡ qôqæt t2|%ƒt 
¡ t3tj#|"|||d dƒ¡| _4d S )NÚstackedr   zAttention {} not definedr   r   r   r   Úmean©Ú	input_dim©ÚdtypezVnoise conditioning augmentation should only be enabled when training with low-res condr   ©ÚpaddingT)ÚchannelsÚemb_channelsÚdropoutÚout_channelsÚdimsÚuse_checkpointÚuse_scale_shift_normÚlearnable_upsampling)r$   Ú	num_headsÚnum_head_channelsr)   Ústable_attentionÚflash_attentionÚcontext_dim)	r$   r%   r&   r'   r(   r)   r*   Údownr+   )r$   Úuse_convr(   r'   )r$   r%   r&   r(   r)   r*   r+   éÿÿÿÿ)	r$   r%   r&   r'   r(   r)   r*   Úupr+   )5ÚsuperÚ__init__r   r   Ú
ValueErrorÚformatr	   ÚnnÚ
SequentialÚLinearÚSiLUÚ
time_embedr
   r   Úfeature_pooling_typer   Ú	LayerNormÚattention_poolingÚtext_to_condÚto_text_non_attn_condÚ	ParameterÚtorchÚrandnÚweightr!   Únull_text_embeddingÚuse_null_tokenÚsortedÚ
image_sizeÚappendÚintÚlow_res_condÚnoise_cond_augÚlowres_time_embedr   ÚConv2dÚ	init_convÚ
isinstanceÚlenÚ
ModuleListÚinput_blocksÚ	enumerateÚranger   r   Úmiddle_blockÚoutput_blocksÚlistÚpopÚUpsampler   r   Úout)1ÚselfÚ	embed_dimrJ   r$   Útext_embed_dimÚnum_res_blocksÚchannel_multÚnum_attn_headsÚper_head_channelsÚcond_dimÚattention_typer>   Úlearned_sinu_pos_emb_dimÚattention_resolutionsr&   rH   Úinit_conv_kernel_sizeÚgradient_checkpointingÚscale_shift_normr.   r/   Úresblock_updownÚresample_with_convrM   rN   Úattention_fnÚtime_embed_dimÚsinu_pos_embÚsinu_pos_emb_input_dimÚattention_dsÚresÚlowres_sinu_pos_embÚlowres_sinu_pos_emb_dimÚin_channelsÚinit_dimÚres_blocks_listÚCONV_DIMÚchÚdsÚnum_input_block_channelsÚlevelÚmultÚ_r'   ÚlayersÚis_last_levelÚiÚichÚis_last_block©Ú	__class__© út/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/multimodal/modules/imagen/diffusionmodules/nets.pyr6   B   s¦  


üÿ
ý

ü ÿþ


üÿøÿ
ùÿõ÷óÿ
€ù
ù
ùê"øÿ
ùÿõ÷óÑ1
ýzUNetModel.__init__Nc                 C   s@  | j r|d usJ dƒ‚n|d u sJ dƒ‚| jr |d usJ dƒ‚n|d u s(J dƒ‚|d urL|j|jkrC|j\}}}}	tj|||	fdd}tj||gdd}|jd	 |j}
}|j|jksa|j|jkr„|j}|j	|d
}|j	|d
}|d urz|j	|d
}|d ur„|j	|d
}|  
|¡}| jr•|  |¡}||7 }|  |¡}| jr¿| j |
dd¡}tj||gdd}tj|t |
d¡ 	|¡gdd}n|}|}| jdkrÏ|jdd}n
| jdkrÙ|  |¡}|  |¡}||7 }|}g }| jD ]}|||||ƒ}| |¡ qé|  ||||¡}| jD ]}| ¡ }tj||gdd}|||||ƒ}q|  |¡S )Nzx_low_res cannot be Nonezx_low_res cannot be presentedúEtime_low_res cannot be None when training with noise conditioning augú time_low_res cannot be presentedÚbicubic©Úmoder   ©Údimr   r    r   éþÿÿÿr   )rM   rN   ÚshapeÚFÚinterpolaterD   ÚcatÚdevicer!   Útor=   rO   rA   rH   rG   ÚrepeatÚonesr>   r   r@   rB   rU   rK   rX   rY   r[   r]   )r^   ÚxÚtimeÚ
text_embedÚ	text_maskÚ	x_low_resÚtime_low_resr   Ú
new_heightÚ	new_widthÚ
batch_sizer•   r!   ÚtÚlowres_tÚ	text_condÚnull_contextÚcontext_embÚcontext_maskÚpooled_text_condÚtext_hiddensÚhÚhsÚmoduleÚh_prevr‡   r‡   rˆ   ÚforwardS  s`   


"






zUNetModel.forwardç      ð?©r›   Ú
cond_scalec                O   óJ   | j |d|i|¤Ž}|dkr|S | j |dt |¡i|¤Ž}||| |  S ©Nr›   r¯   ©r®   rD   Ú
zeros_like©r^   r›   r±   ÚargsÚkwargsÚlogitsÚnull_logitsr‡   r‡   rˆ   Úforward_with_cond_scale™  ó
   z!UNetModel.forward_with_cond_scale)NNNN©Ú__name__Ú
__module__Ú__qualname__Ú__doc__r6   r®   r»   Ú__classcell__r‡   r‡   r…   rˆ   r   $   s<    !ç  
ÿFr   c                       sp   e Zd ZdZddg d¢dddg d¢d	d
g d¢dddddddddf‡ fdd„	Z	ddd„Zdddœdd„Z‡  ZS )ÚEfficientUNetModela¦  
    The full Efficient UNet model with attention and timestep embedding used for Imagen SR model.

    :param embed_dim: Dimension of embeddings. Also used to calculate the number of channels in ResBlock.
    :param image_size: Input image size. Used to calculate where to inject attention layers in UNet.
    :param channels: Input channel number, defaults to 3.
    :param text_embed_dim: Dimension of conditioned text embedding. Different text encoders and different model versions have different values, defaults to 512
    :param channel_mult: Used with embed_dim to calculate the number of channels for each level of UNet, defaults to [1, 1, 2, 4, 8].
    :param num_attn_heads: The number of heads in the attention layer, defaults to 8.
    :param per_head_channels: The number of channels per attention head, defaults to 64.
    :param attention_type: Type of attention layer, defaults to 'fused'.
    :param atnn_enabled_at: Whether to enable attention at each level, defaults to [0, 0, 0, 0, 1].
    :param feature_pooling_type: Type of pooling, defaults to 'attention'.
    :param stride: Stride in ResBlock, defaults to 2.
    :param num_resblocks: Used with num_res_blocks to calculate the number of residual blocks at each level of Efficient-UNet. Defaults to [1, 2, 4, 8, 8].
    :param learned_sinu_pos_emb_dim: Dimension of learned time positional embedding. 0 for unlearned timestep embeddings. Defaults to 16
    :param use_null_token: Whether to create a learned null token for attention, defaults to False.
    :param init_conv_kernel_size: Initial Conv kernel size, defaults to 3.
    :param gradient_checkpointing: Whether to use gradient checkpointing, defaults to False.
    :param scale_shift_norm: Whether to use scale shift norm, defaults to False.
    :param stable_attention: Whether to use numerically-stable attention calculation, defaults to True.
    :param flash_attention: Whether to use flash attention calculation, defaults to False.
    :param skip_connection_scaling: Whether to use 1/sqrt(2) scaling for ResBlock skip connection, defaults to False.
    :param noise_cond_aug: Whether to add noise conditioned augmentation with low-resolution input. Defaults to False.
    r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   FTc           %         s¸  t ƒ  ¡  t|ƒ| _|| _|d }|dksJ ‚|dkr7t|ƒ}|d }t |t ||¡t 	¡ t ||¡¡| _
nt|ƒ}t |t||ƒt 	¡ t||ƒ¡| _
|| _| jrx|dkr_t|ƒ}|d }nt|ƒ}|}t |t ||¡t 	¡ t ||¡¡| _|}|
dks„|
dks„J ‚|
| _|
dkrt t|dt |¡t ||¡¡| _t||ƒ| _t t |¡t ||¡t 	¡ t ||¡¡| _|rËt tjdd|| jjjd¡| _|| _|d }||d  }tj||||d d	| _t ¡ | _t ¡ | _ |}t!|ƒD ]Y\}} || }!|t|ƒd k}"|	| r|nd }#t"|ƒ}$t#||t$| | ƒ||" ||!|#|||||||d
| j|$< t%t$| | ƒ||||" ||!|#|||||||d| j |$< t$| | ƒ}qôt |d | |d¡| _&d S )Nr   r   r   r   r   r   r    r   r"   )r$   r%   r'   r*   Ú	conv_downÚstrideÚnum_resblocksrf   r`   r,   r-   r)   r.   r/   Úskip_connection_scaling)r$   r%   r'   r*   Úconv_uprÅ   rÆ   rf   r`   r,   r-   r)   r.   r/   rÇ   )'r5   r6   rS   Ún_levelsrJ   r	   r9   r:   r;   r<   r=   r
   r   rN   rO   r>   r   r?   r@   rA   rB   rC   rD   rE   rF   r!   rG   rH   rP   rQ   Ú
ModuleDictÚDBlocksÚUBlocksrV   Ústrr   rL   r   r]   )%r^   r_   rJ   r$   r`   rb   rc   rd   rf   Úatnn_enabled_atr>   rÅ   rÆ   rg   rH   ri   rj   rk   r.   r/   rÇ   rN   ro   rp   rq   rt   ru   re   rv   rw   rz   r}   r~   Únum_resblockr   Úlevel_attention_typeÚ	level_keyr…   r‡   rˆ   r6   ¼  sÂ   
%


üÿ


ü
ý

ü 


ñ
ñzEfficientUNetModel.__init__Nc                 C   s@  | j r|d usJ dƒ‚n|d u sJ dƒ‚|j|jks |j|jkrC|j}|j|d}|j|d}|d ur9|j|d}|d urC|j|d}|jd |j}}	|  |¡}
|  |¡}|j|jkrm|j\}}}}tj|||fdd}t	j
||gdd}| j r‚|  |¡}|
|7 }
| jr§| j |dd¡}t	j
||gdd}t	j
|t	 |d¡ |	¡gdd}n|}|}| jd	kr·|jd
d}n
| jdkrÁ|  |¡}|  |¡}|
|7 }
|  |¡}tƒ }t| jƒD ]}t|ƒ}| j| ||
||ƒ}|| jd k rò|||< q×t| jd ddƒD ]}t|ƒ}|| jd k r|||  }| j| ||
||ƒ}qü|  |¡S )Nr‰   rŠ   r    r   r‹   rŒ   r   rŽ   r   r   r   r3   )rN   r!   r–   r‘   r•   r=   rA   r’   r“   rD   r”   rO   rH   rG   r—   r˜   r>   r   r@   rB   rQ   ÚdictrW   rÉ   rÍ   rË   rÌ   r]   )r^   r™   rš   r›   rœ   r   rž   r!   r¡   r•   r¢   r¤   r   rŸ   r    r£   r¥   r¦   r§   r¨   r©   Úfeatsr}   rÑ   r‡   r‡   rˆ   r®   T  s^   


"




€
zEfficientUNetModel.forwardr¯   r°   c                O   r²   r³   r´   r¶   r‡   r‡   rˆ   r»   ™  r¼   z*EfficientUNetModel.forward_with_cond_scale©Nr½   r‡   r‡   r…   rˆ   rÃ   ¡  s4    Þ 
ÿErÃ   Ú__main__r   r   )r_   rJ   c                 c   ó    | ]}|  ¡ V  qd S rÔ   ©Únumel©Ú.0Úpr‡   r‡   rˆ   Ú	<genexpr>¤  ó   € rÜ   r   r   éX   é€   é   c                 c   rÖ   rÔ   r×   rÙ   r‡   r‡   rˆ   rÜ   ±  rÝ   ),rD   Útorch.nnr9   Útorch.nn.functionalÚ
functionalr’   ÚEnemo.collections.multimodal.modules.imagen.diffusionmodules.attentionr   ÚBnemo.collections.multimodal.modules.imagen.diffusionmodules.blocksr   r   r   r   r   r   Ú@nemo.collections.multimodal.modules.imagen.diffusionmodules.embsr	   r
   ÚBnemo.collections.multimodal.modules.imagen.diffusionmodules.layersr   r   r\   r   r   r   ÚModuler   rÃ   r¾   ÚmodelÚsumÚ
parametersÚpytorch_total_paramsÚprintÚrandÚimage_batchr¤   r˜   rœ   rš   Úoutputr‘   Úmodel_srrE   r‡   r‡   r‡   rˆ   Ú<module>   sJ        


ûç