o
    }oiT.                     @   s   d dl mZmZ d dlZdd Zdd Zdeded	efd
dZdd Z	dddZ
dededededeeeef  ded	efddZdd ZdS )    )ListTupleNc                 C   s*   | | }|| }|| }||r| S d S )zHGet image sequence length given image size, patch size, and class token.r    )img_himg_w	patch_dimadd_class_tokenclass_token_lennum_patches_per_dim_hnum_patches_per_dim_wnum_patchesr   r   _/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/llava_next/model/utils.pyget_image_sequence_length   s   r   c                 C   s   ddl }t|ttfs#t|tj|jfstdt| dd|	 }|\}}| j
dd \}}|| }|| }||kr]|| }	t||	 }
||
 d }| dd||| ddf }|S || }	t||	 }|| d }| dddd||| f }|S )a\  
    Unpads a PyTorch tensor of a padded and resized image.
    Args:
        tensor (`torch.Tensor`):
            The image tensor, assumed to be of shape (num_channels, height, width).
        original_size (`tuple`):
            The original size of the image (height, width).
    Returns:
        `torch.Tensor`: The unpadded image tensor.
    r   Nimage_size invalid type: z not valid z2should be either list, tuple, np.ndarray or tensor      )numpy
isinstancelisttupletorchTensorndarray	TypeErrortypetolistshapeint)tensororiginal_sizenporiginal_heightoriginal_widthcurrent_heightcurrent_widthoriginal_aspect_ratiocurrent_aspect_ratioscale_factor
new_heightpaddingunpadded_tensor	new_widthr   r   r   unpad_image   s.   r,   r   possible_resolutionsreturnc                 C   s   | \}}d}d}t d}|D ]=\}}t|| || }	t||	 t||	 }
}t|
| || }|| | }||ksC||krK||k rK|}|}||f}q|S )a  
    Selects the best resolution from a list of possible resolutions based on the original size.
    This is done by calculating the effective and wasted resolution for each possible resolution.
    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
    Args:
        original_size (tuple):
            The original size of the image in the format (height, width).
        possible_resolutions (list):
            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
    Returns:
        tuple: The best fit resolution in the format (height, width).
    Nr   inf)floatminr   )r   r-   r!   r"   best_fitmax_effective_resolutionmin_wasted_resolutionheightwidthscaledownscaled_widthdownscaled_heighteffective_resolutionwasted_resolutionr   r   r   select_best_resolutionG   s    r<   c                 C   st   ddl }t|tstdt| ttfs+t| tj|jfs'tdt|  d| 	 } t
| |\}}|| || fS )a-  
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
    Args:
        image_size (`tuple`):
            The size of the input image in the format (width, height).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.
    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    r   Nz2grid_pinpoints should be a list of tuples or listsr   z> not valid, should be either list, tuple, np.ndarray or tensor)r   r   r   r   r   r   r   r   r   r   r<   )
image_sizegrid_pinpoints
patch_sizer    r5   r6   r   r   r   get_anyres_image_grid_shapei   s   
r@   c                 C   s  ddl m} | }g }g }t| D ]\}}	|	jd dkr|	d }
|	dd }	|jj|jj  }}|dkr9|| }n
|dkrC|| d }||
jd krNtdt|| |j	|jj\}}|	
||||d}	|	d	dd
dd }	|	dd
d
d}	t|	|| }	|durtj|	|ddddf jg |	jdd dR  |	jfdd}	|	dd
dd}	tj|
|	fdd}	n|	d }	|durtj|	|d |	fdd}	||	 ||	d qtj|dd} tj|tj| jd}| |fS )a  
    Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
    Args:
        image_features (`List[torch.Tensor]` of length num_images,
        each of shape `(num_patches, image_length, embed_dim)`)
            List of image feature tensor, each contains all the visual feature of all patches.
        image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
            Actual image size of each images (H, W).
        vision_feature_select_strategy (`str`)
            The feature selection strategy used to select the vision feature from the vision backbone.
        image_newline (`torch.Tensor` of shape `(embed_dim)`)
            New line embedding vector.
    Returns:
        image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
        feature_lens (`List[int]`)
            token length of each image in image_features
    r   )LlavaNextConfigr   Ndefaultfullz<The number of patches is not consistent with the image size.   r      )dim)dtypedevice)transformersrA   	enumerater   vision_configr=   r?   
ValueErrorr@   image_grid_pinpointsviewpermute
contiguousflattenr,   r   catexpandtorH   	transposeappendsizer   longrI   )image_featuresimage_sizesvision_feature_select_strategyimage_newlinerA   confignew_image_featuresfeature_lens	image_idximage_featurebase_image_featurer5   r6   expected_num_patchesnum_patch_heightnum_patch_widthr   r   r   pack_image_features   sR   
2
rg   orig_height
orig_widthr5   r6   rN   r?   c                 C   sb   t | |g|\}}|| || }}	|| }
|| }t| ||
|||	\}}|
| }|| | }|S )z
    Calculate the number of image features after the preprocessing for images of any resolution.
    This is used to calculate the number of image tokens.
    )r<   get_unpadded_features)rh   ri   r5   r6   rN   r?   height_best_resolutionwidth_best_resolutionscale_heightscale_widthpatches_heightpatches_widthunpadded_featuresnewline_featuresbase_featuresnum_image_tokensr   r   r   get_number_of_features   s   ru   c                 C   s   || }|| }||  }|| }	||	kr'| | | }
||
 d }||d 8 }n|| |  }|| d }||d 8 }|| }|}||fS )a+  
    Get number of features for a given image with height/width. LLaVA-NeXT is different from LLaVA
    because it divided each image into patches depending on its resolution. Therefore we need to calculate how many
    patches an image is divided into and get the number of features from that.
    r   r   )r5   r6   ro   rp   rm   rn   r#   r$   r%   r&   r(   r)   r+   rq   rr   r   r   r   rj      s   rj   )N)typingr   r   r   r   r,   r   r   r<   r@   rg   r   ru   rj   r   r   r   r   <module>   s.   ("
%C
"