o
    }oiI!                     @   s   d Z ddlmZ ddlmZ ddlZddlZddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZ dd
lmZ ddlmZ G dd deZdS )z/A library for Causal Video Tokenizer inference.    )Path)OptionalN)	get_token)hf_hub_download)
DictConfig)tqdm)get_tokenizer_configload_jit_modelload_pytorch_modelnumpy2tensorpad_video_batchtensor2numpyunpad_video_batch)PretrainedModelInfo)ModelPTc                       s   e Zd ZdZdeddf fddZe								
d"ddZe	 dej
dej
fddZe	 dej
deej
 fddZe	 dej
dej
fddZ	d#dejdedejfddZdee fddZdee fddZedee fd d!Z  ZS )$CausalVideoTokenizerz:Causal Video tokenization with the NVIDIA Cosmos TokenizercfgreturnNc                    s$  t  | t|j}t|d | _t|d | _t|d | _tt	|j
| _d| _|jrit|j}| j|d< |jrCt| j|d| jnd | _|jrRt| j|d| jnd | _|jrdt| j|d| j| _d S d | _d S |jrst| j| jnd | _|jrt| j| jnd | _|jrt| j| jnd | _d S )	Nautoencoder.jitencoder.jitdecoder.jitcudadtypefullencdec)super__init__r   checkpoint_dirstr_full_model_path_enc_model_path_dec_model_pathgetattrtorchr   _dtype_deviceuse_pytorchr   tokenizer_typeload_full_modelr
   _full_modelload_enc_model
_enc_modelload_dec_model
_dec_modelr	   )selfr   
checkpointtokenizer_config	__class__ m/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/video_tokenizers/cosmos_tokenizer.pyr   &   s4   


zCausalVideoTokenizer.__init__Cosmos-Tokenizer-DV4x8x8TFbfloat16c              	   C   s|   d| | _ t }t| j d|d}t| j d|d}	t| j d|d}	|r'd}d}tt|j}
t|
||||||d}| |S )Nznvidia/r   )repo_idfilenametokenr   r   F)r   r   r+   r-   r)   r(   r'   )_hf_model_nameget_hf_tokenr   r   r   parentr   )clsr(   load_encoderload_decoderr)   r'   r   hf_tokenfull_model_path_ckpt_dirr   r4   r4   r5   from_pretrainedH   s@   
z$CausalVideoTokenizer.from_pretrainedinput_tensorc                 C   sN   | j dur|  |}t|tr|d }|S |}|S | |d }| |}|S )zReconstructs a batch of video tensors after embedding into a latent.

        Args:
            video: The input video Bx3xTxHxW layout, range [-1..1].
        Returns:
            The reconstructed video, layout Bx3xTxHxW, range [-1..1].
        Nr   )r*   
isinstancetupleencodedecode)r/   rF   output_tensoroutput_latentr4   r4   r5   
autoencode~   s   
	

zCausalVideoTokenizer.autoencodec                 C   s8   |j dks	J d| |}t|tjr|S |dd S )a	  Encodes a numpy video into a CausalVideo latent or code.

        Args:
            input_tensor: The input tensor Bx3xTxHxW layout, range [-1..1].
        Returns:
            For causal continuous video (CausalCV) tokenizer, the tuple contains:
                - The latent embedding, Bx16x(t)x(h)x(w), where the compression
                rate is (T/t x H/h x W/w), and channel dimension of 16.
            For causal discrete video (CausalDV) tokenizer, the tuple contains:
              1) The indices, Bx(t)x(h)x(w), from a codebook of size 64K, which
                is formed by FSQ levels of (8,8,8,5,5,5).
              2) The discrete code, Bx6x(t)x(h)x(w), where the compression rate
                is again (T/t x H/h x W/w), and channel dimension of 6.
           input video should be of 5D.N)ndimr,   rG   r$   Tensor)r/   rF   rL   r4   r4   r5   rI      s
   
zCausalVideoTokenizer.encodeinput_latentc                 C   s   |j dks	J d| |S )aA  Encodes a numpy video into a CausalVideo latent.

        Args:
            input_latent: The continuous latent Bx16xtxhxw for CausalCV,
                        or the discrete indices Bxtxhxw for CausalDV.
        Returns:
            The reconstructed tensor, layout [B,3,1+(T-1)*8,H*16,W*16] in range [-1..1].
           z@input latent should be of 5D for continuous and 4D for discrete.)rQ   r.   )r/   rS   r4   r4   r5   rJ      s   

zCausalVideoTokenizer.decode   videotemporal_windowc                 C   s   |j dks	J d|jd }g }ttd|d | d D ]:}|| |d | }}|dd||df }t|\}	}
t|	| j| jd}| |}t	|}t
||
}|| qtj|ddS )	a  Reconstructs video using a pre-trained CausalTokenizer autoencoder.
        Given a video of arbitrary length, the forward invokes the CausalVideoTokenizer
        in a sliding manner with a `temporal_window` size.

        Args:
            video: The input video BxTxHxWx3 layout, range [0..255].
            temporal_window: The length of the temporal window to process, default=25.
        Returns:
            The reconstructed video in range [0..255], layout BxTxHxWx3.
        rN   rO      r   N.)r   device)axis)rQ   shaper   ranger   r   r%   r&   rM   r   r   appendnpconcatenate)r/   rV   rW   
num_framesoutput_video_listidxstartendinput_videopadded_input_videocrop_regionrF   rK   padded_output_videooutput_videor4   r4   r5   forward   s   


zCausalVideoTokenizer.forwardtrain_data_configc                 C      d S Nr4   )r/   rk   r4   r4   r5   setup_training_data      z(CausalVideoTokenizer.setup_training_dataval_data_configc                 C   rl   rm   r4   )r/   rp   r4   r4   r5   setup_validation_data   ro   z*CausalVideoTokenizer.setup_validation_datac                 C   rl   rm   r4   )r>   r4   r4   r5   list_available_models   s   z*CausalVideoTokenizer.list_available_models)r6   TTFFr7   )rU   )__name__
__module____qualname____doc__r   r   classmethodrE   r$   no_gradrR   rM   rH   rI   rJ   r^   ndarrayintrj   r   rn   rq   r   rr   __classcell__r4   r4   r2   r5   r   #   s:    "5
!r   )rv   pathlibr   typingr   numpyr^   r$   huggingface_hubr   r<   r   	omegaconfr   r   .nemo.collections.common.video_tokenizers.utilsr   r	   r
   r   r   r   r   nemo.core.classes.commonr   nemo.core.classes.modelPTr   r   r4   r4   r4   r5   <module>   s   	$	