o
    }oi                     @   sd  d dl mZmZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d	d
lmZ d	dlmZ d	dlmZ d	dlmZ de dej!dej"fddZ#ej$dfdej%de&fddZ'dej$dfde deej! dej%de&fddZ(			d$dedee  deeeee f  de&dee& d ee
 d!e)fd"d#Z*dS )%    )ListOptionalUnionN)CommonInferenceParams)AbstractModelInferenceWrapper)InferenceWrapperConfig)Image)AutoProcessor)vlm   )LlavaInferenceWrapper)MllamaInferenceWrapper)	VLMEngine)VLMTextGenerationControllerpathtrainermodelc                 C   s   |  }|| |}|S )z)Setup trainer and restore model from path)	to_fabric
load_model)r   r   r   fabric r   W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/inference/base.py _setup_trainer_and_restore_model%   s   r   i  params_dtype&inference_batch_times_seqlen_thresholdc           	      C   s   | j }| jj }||}|  t|tjrt}|j	j
}nt|tjr,t}|jj
}ntd| ||t||||jd}|S )z&Set up inference wrapper for the modelzUnknown model config: )hidden_sizer   r   padded_vocab_size)configmodulecudatoeval
isinstancer
   MLlamaModelConfigr   language_model_configr   LlavaConfigr   language_transformer_config
ValueErrorr   
vocab_size)	r   	tokenizerr   r   r   mcore_modelwrapper_clsr   inference_wrapped_modelr   r   r   setup_inference_wrapper,   s*   



r-   c           
      C   sN   d}t |}|j}t }tj||d}t| ||d t||||}	|	|fS )zSet up model and tokenizerz(meta-llama/Llama-3.2-11B-Vision-Instruct)r)   )r   r   r   )r	   from_pretrainedr)   r
   MLlamaConfig11BInstructMLlamaModelr   r-   )
r   r   r   r   model_id	processorr)   r   r   r,   r   r   r   setup_model_and_tokenizerO   s   
r3      wrapped_modelpromptsimagesmax_batch_sizerandom_seedinference_paramsreturnc                 C   s>   t | ||d}t|||d}	|ptdd}
|	j|||
d}|S )ae  
    Generates text using a NeMo VLM model.
    Args:
        wrapped_model (AbstractModelInferenceWrapper): The model inference wrapper.
        tokenizer: tokenizer for the input text,
        image_processor: image processor for the input image,
        prompts (list[str]): The list of prompts to generate text for.
        images (list): The list of images to generate text for.
        max_batch_size (int, optional): The maximum batch size. Defaults to 4.
        random_seed (Optional[int], optional): The random seed. Defaults to None.
        inference_params (Optional["CommonInferenceParams"], optional): The inference parameters defined in
            Mcore's CommonInferenceParams. Defaults to None.

    Returns:
        list[Union["InferenceRequest", str]]: A list of generated text,
            either as a string or as an InferenceRequest object.
    )r,   r)   image_processor)text_generation_controllerr8   r9   2   )num_tokens_to_generate)r6   r7   common_inference_params)r   r   r   generate)r5   r)   r<   r6   r7   r8   r9   r:   r=   mcore_enginer@   resultsr   r   r   rA   e   s   rA   )r4   NN)+typingr   r   r   lightning.pytorchpytorchpltorchtorch.distributed/megatron.core.inference.common_inference_paramsr   Qmegatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapperr   Imegatron.core.inference.model_inference_wrappers.inference_wrapper_configr   	PIL.Imager   transformersr	   nemo.lightning	lightningnlnemo.collectionsr
   llava_inference_wrapperr   mllama_inference_wrapperr   
vlm_enginer   vlm_inference_controllerr   strTrainerLightningModuler   bfloat16dtypeintr-   r3   dictrA   r   r   r   r   <module>   sj   

%
	