o
    Gi3                     @   s   d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 ddl
mZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZmZmZmZ e rQd dlmZ eeZeG dd deZ G dd deeeZ!dS )    )	dataclass)ListOptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixin)
BaseOutputis_torchvision_availablelogging   )
ModelMixin)CosmosEmbeddingCosmosLearnablePositionalEmbedCosmosPatchEmbedCosmosRotaryPosEmbedCosmosTransformerBlock)
transformsc                   @   s    e Zd ZU dZeej ed< dS )CosmosControlNetOutputz
    Output of [`CosmosControlNetModel`].

    Args:
        control_block_samples (`list[torch.Tensor]`):
            List of control block activations to be injected into transformer blocks.
    control_block_samplesN)__name__
__module____qualname____doc__r   torchTensor__annotations__ r   r   b/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/controlnets/controlnet_cosmos.pyr      s   
 r   c                &       s  e Zd ZdZdZg dZdgZdgZe						
														d8de	de	de	de	de	de	de
de	de	dee	e	e	f dee	e	e	f dee
e
e
f d edB d!e	dB d"e	d#ed$e	d%e	f$ fd&d'Zd(e
ee
 B d)ee
 fd*d+Z	,				d9d-ejd.ejd/ejd0eeej eeej eej f f d1ejd(e
ee
 B d2ejdB d3ejdB d4e	dB d5ed)eeeeej  f fd6d7Z  ZS ):CosmosControlNetModela(  
    ControlNet for Cosmos Transfer2.5.

    This model duplicates the shared embedding modules from the transformer (patch_embed, time_embed,
    learnable_pos_embed, img_context_proj) to enable proper CPU offloading. The forward() method computes everything
    internally from raw inputs.
    T)patch_embedpatch_embed_base
time_embedr   learnable_pos_embed                         @         r   r   r+      r2   g       @      ?r4   NFn_controlnet_blocksin_channelslatent_channelsmodel_channelsnum_attention_headsattention_head_dim	mlp_ratiotext_embed_dimadaln_lora_dim
patch_sizemax_size
rope_scaleextra_pos_embed_typeimg_context_dim_inimg_context_dim_outuse_crossattn_projectioncrossattn_proj_in_channelsencoder_hidden_states_channelsc                    s   t    t|||
dd| _t|||
dd| _t||| _d | _|dkr,t|||
d| _d | _	d urFdkrFt
t
j|ddt
 | _	d | _|rZt
t
j||ddt
 | _t||
|d| _t
 fdd	t|D | _d| _d S )
NF)bias	learnable)hidden_sizer?   r>   r   T)rI   r?   r>   r@   c                    s8   g | ]}t  d dduodk|dkdd
qS )rms_normFNr   T)
r9   r:   cross_attention_dimr;   r=   qk_normout_biasimg_contextbefore_proj
after_proj)r   ).0	block_idxr=   r:   rB   r;   r9   r<   r   r    
<listcomp>o   s    z2CosmosControlNetModel.__init__.<locals>.<listcomp>)super__init__r   r"   r#   r   r$   r%   r   img_context_projnn
SequentialLinearGELUcrossattn_projr   rope
ModuleListrangecontrol_blocksgradient_checkpointing)selfr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   	__class__rS   r    rV   5   s@   

zCosmosControlNetModel.__init__conditioning_scalereturnc                 C   sj   t |tr|}n|gt| j }t|t| jk r3tdt|t| j |t| j d t| j }|S )NzoReceived %d control scales, but control network defines %d blocks. Scales will be trimmed or repeated to match.)
isinstancelistlenr`   loggerwarning)rb   re   scalesr   r   r    _expand_conditioning_scale   s   
z0CosmosControlNetModel._expand_conditioning_scaler4   controls_latentslatentstimestepencoder_hidden_statescondition_maskpadding_maskattention_maskfpsreturn_dictc           )         s  |j \}}}}}|}| jjd }|j d |d k r8|d |j d  }tj|tj|||||f|j|jdgdd}|durFtj||gdd}ntj|t|ddddf gdd}t	j
j|t|j dd t	jjd}tj||d|d|ddgdd}|}|durtj||gdd}t	j
j|t|j dd t	jjd}tj||d|d|ddgdd}| j||	d}| jr| |nd}| |}|dd	}| jj\}}}|| || || | |}|dd	}|jdkr| ||\}}nF|jd
kr9|j \ }}}}|j  d|ddfksJ d|j  | }| ||\}} fdd||fD \}}ntd|j  t|trL|\} }!n|} d}!| jdur[| | } |!durk| jdurk| |!}!| jjdur~| jjdkr~| |!f}"n| }"|dur|dd}| |}#g }$t t!| j"|#D ]9\}%\}&}'t# r| j$r| %|&||"|||||d||%\}}(n|&||"|||||d||%d
\}}(|$&|(|'  q|
s|$fS t'|$dS )a'  
        Forward pass for the ControlNet.

        Args:
            controls_latents: Control signal latents [B, C, T, H, W]
            latents: Base latents from the noising process [B, C, T, H, W]
            timestep: Diffusion timestep tensor
            encoder_hidden_states: Tuple of (text_context, img_context) or text_context
            condition_mask: Conditioning mask [B, 1, T, H, W]
            conditioning_scale: Scale factor(s) for control outputs
            padding_mask: Padding mask [B, 1, H, W] or None
            attention_mask: Optional attention mask or None
            fps: Frames per second for RoPE or None
            return_dict: Whether to return a CosmosControlNetOutput or a tuple

        Returns:
            CosmosControlNetOutput or tuple of control tensors
        r0   )dtypedevice)dimN)interpolationr   )ru   r      z9Expected timestep to have shape [B, 1, T, 1, 1], but got c                 3   s8    | ]}|  d d ddddd dV  qdS )r0   r   N)viewexpandflatten)rQ   x
batch_sizepost_patch_heightpost_patch_num_framespost_patch_widthr   r    	<genexpr>   s    
z0CosmosControlNetModel.forward.<locals>.<genexpr>z@Expected timestep to have shape [B, 1, T, 1, 1] or [T], but got r   )
hidden_statesrq   embedded_timesteptembimage_rotary_embextra_pos_embrt   controlnet_residualro   rR   )r   )(shapeconfigr6   r   catzerosrw   rx   
zeros_liker   
functionalresizerh   InterpolationModeNEAREST	unsqueezerepeatr]   r%   r"   r   r>   r#   ndimr$   
ValueErrorrg   tupler\   rW   rB   rm   	enumeratezipr`   is_grad_enabledra   _gradient_checkpointing_funcappendr   ))rb   rn   ro   rp   rq   rr   re   rs   rt   ru   rv   BCTHWcontrol_hidden_statesvace_in_channelspad_Cpadding_mask_resizedbase_hidden_statesbase_padding_maskr   r   p_tp_hp_wr   r   _
num_framestimestep_flattext_contextrN   processed_encoder_hidden_statesrl   resultrR   blockscalecontrol_projr   r   r    forward   s   













zCosmosControlNetModel.forward)r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r1   r3   NNr)   Fr-   r-   )r4   NNNT)r   r   r   r    _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_keep_in_fp32_modulesr	   intfloatr   strboolrV   rh   r   rm   r   r   r   r   r   r   __classcell__r   r   rc   r    r!   '   s    	
L$
	
r!   )"dataclassesr   typingr   r   r   r   r   torch.nnrX   configuration_utilsr   r	   loadersr
   utilsr   r   r   modeling_utilsr   transformers.transformer_cosmosr   r   r   r   r   torchvisionr   
get_loggerr   rj   r   r!   r   r   r   r    <module>   s    	
