o
    پi^                     @   s`  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	Z
ddlZddlZddlmZ ddlmZmZ ddlmZ ddlmZ d	efd
dZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z dd Z!	 d/d!e"e# d"e#d	e$e"e# e"e# e"e# f fd#d$Z%d%ej&d&ej'j(d	ej&fd'd(Z)d&ej'j(d)ej&d*e"d+ed, fd-d.Z*dS )0a  
Utilities for multi-modal models.

This python file mainly contains utilities that were used in the
image processing logic of llava-next including operations such as
anyres and anyres_max

Currently supports the anyres and anyres_max operation for CLIP and
SigLip. For more information, you may refer to the paper or the blog

LLaVA-NeXT : https://llava-vl.github.io/blog/2024-01-30-llava-next/
LLaVA-Onevision : https://arxiv.org/pdf/2408.03326

    N)BytesIO)Literal)Image)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size) tensor_model_parallel_all_gather)flatten_nested_listreturnc                 C   s0   | d u rdS t | trtdd t| D S dS )NFc                 s   s    | ]}t |V  qd S N)has_valid_data).0item r   R/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/multimodal/mm_utils.py	<genexpr>7   s    z!has_valid_data.<locals>.<genexpr>T)
isinstancelistanyr   )datar   r   r   r   3   s
   
r   c                 C   s   | \}}d}d}t d}|D ]=\}}t|| || }	t||	 t||	 }
}t|
| || }|| | }||ksC||krK||k rK|}|}||f}q|S )a  
    Selects the best resolution from a list of possible resolutions based on the original size.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height).
    Nr   inf)floatminint)original_sizepossible_resolutionsoriginal_widthoriginal_heightbest_fitmax_effective_resolutionmin_wasted_resolutionwidthheightscaledownscaled_widthdownscaled_heighteffective_resolutionwasted_resolutionr   r   r   select_best_resolution;   s*   r'   c                 C   s   | j \}}|\}}|| }|| }||k r"|}tt|| |}	n|}	tt|| |}| ||	f}
td||fd}|| d }||	 d }||
||f |S )a1  
    Resize and pad an image to a target resolution while maintaining aspect ratio.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.

    Returns:
        PIL.Image.Image: The resized and padded image.
    RGB)r   r   r      )sizer   mathceilresizer   newpaste)imagetarget_resolutionr   r   target_widthtarget_heightscale_wscale_h	new_width
new_heightresized_image	new_imagepaste_xpaste_yr   r   r   resize_and_pad_imagec   s   
r<   c           	      C   s^   g }| j \}}td||D ]}td||D ]}|||| || f}| |}|| qq|S )a  
    Divides an image into patches of a specified size.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    r   )r*   rangecropappend)	r0   
patch_sizepatchesr    r!   ijboxpatchr   r   r   divide_to_patches   s   

rF   c                    s   t |trEd|v rE dv sJ dtd|}ttt|d ttt|d fddtd d d	 D } fd
d|D }t|t	u rN|}nt
|}t| |\}}|  |  fS )a  
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str): A string representation of a list of possible resolutions.
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    x   iP  i  i  i   1patch_size should be in [224, 336, 384, 448, 512]\((\d+)x(\d+)\)r   c                    0   g | ]}t d   d  d  D ]}||fqqS    r=   r   rB   rC   	range_endrange_startr   r   
<listcomp>       z/get_anyres_image_grid_shape.<locals>.<listcomp>rO   c                       g | ]} fd d|D qS )c                       g | ]}|  qS r   r   r   dimr@   r   r   rU          z:get_anyres_image_grid_shape.<locals>.<listcomp>.<listcomp>r   r   pairr[   r   r   rU          )r   strrefindalltuplemapr   r=   typer   astliteral_evalr'   )
image_sizegrid_pinpointsr@   matchesr   r    r!   r   )r@   rS   rT   r   get_anyres_image_grid_shape   s    

rk   c              
      sx  t |trdd|v rdzjd  W n ty' } zjd  W Y d}~nd}~ww  dv s0J dtd|}ttt|d ttt|d fd	d
t	d d d D } fdd
|D }t
|tu rm|}nt|}t| j|}t| |}djv rjd njd }djv rjd njd }	t||}
| |	|	f}|g|
 }fdd
|D }tj|ddS )a^  
    Process an image with variable resolutions.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object.
        grid_pinpoints (str): A string representation of a list of possible resolutions.

    Returns:
        np.array: An np array containing the processed image patches.
    rG   r   shortest_edgeNrH   rJ   rK   rL   c                    rM   rN   rP   rQ   rR   r   r   rU      rV   z(process_anyres_image.<locals>.<listcomp>rO   c                    rW   )c                    rX   r   r   rY   r[   r   r   rU      r\   z3process_anyres_image.<locals>.<listcomp>.<listcomp>r   r]   r[   r   r   rU      r_   	crop_sizer!   c                    s$   g | ]}  |d d d qS )r(   pixel_valuesr   )
preprocessconvert)r   image_patch)	processorr   r   rU          axis)r   r`   r*   	Exceptionra   rb   rc   rd   r   r=   re   r   rf   rg   r'   r<   __dict__rm   rF   r-   npstack)r0   rr   ri   erj   r   best_resolutionimage_paddedrm   rl   rA   image_original_resizeimage_patchesr   )r@   rr   rS   rT   r   process_anyres_image   sJ   









r   c                 C   s   t ttj| ddS )NT)validate)r   openr   pybase64	b64decode)r0   r   r   r   load_image_from_base64  s   r   c                 C   s   | j \}}||kr| S | jdkr| d} ||kr1t| j||f|}|| d|| d f |S t| j||f|}|| || d df |S )NLr(   r   r)   )r*   moderp   r   r.   r/   )pil_imgbackground_colorr    r!   resultr   r   r   expand2square  s   


r   c                 C   s   |\}}| j dd \}}|| }|| }||kr:|| }t|| }	||	 d }
| dd|
||
 ddf }|S || }t|| }|| d }
| dddd|
||
 f }|S )a  
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
    tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
    original_size (tuple): The original size of the image (height, width).

    Returns:
    torch.Tensor: The unpadded image tensor.
    rO   Nr)   )shaper   )tensorr   r   r   current_heightcurrent_widthoriginal_aspect_ratiocurrent_aspect_ratioscale_factorr7   paddingunpadded_tensorr6   r   r   r   unpad_image  s   r   c                 C   s   |\}}|| }||  }||kr*|| }t || }| | d }	| d|	  |f}
|
S | | }t || }|| d }	| |d|	  f}
|
S )z^
    Unpads a PyTorch tensor of a padded and resized image
    and returns the new shape.
    r)   r   )r   r   r   r   r   r   r   r   r7   r   	new_shaper6   r   r   r   unpad_image_shape:  s   r   c                    s   t |dd }g  |dkr-| D ]}t|tdd |jD }||d d } | qnd|v rC| D ]}t|||j} | q3n|| d S t fdd D r[t	j
 dd	  S )
Nimage_aspect_ratiopadc                 s   s    | ]	}t |d  V  qdS )   Nr   r   rG   r   r   r   r   X  s    z!process_images.<locals>.<genexpr>rn   r   anyresc                 3   s     | ]}|j  d  j kV  qdS )r   N)r   r   
new_imagesr   r   r   d  s    rt   )getattrr   rc   
image_meanro   r?   r   image_grid_pinpointsallrx   ry   )imagesimage_processor	model_cfgr   r0   r   r   r   process_imagesR  s*   r   r)   sizesnum_gpusc           
         s   t }|dkrg dg| dg| fS dd t|D }dg|  tt|fdddd}|D ]}tt| fddd	}|| |  |  | 7  < q0tt  }tt  }t|D ]}	|||	  |t ||	  q^|| fS )
aq  
    Generate load balancing assignment and metadata
    for distributing data across GPUs.
    The load is determined by the total image sizes,
    not the number of images.

    Args:
        sizes: The size of each image
        num_gpus: Number of GPUs to balance across

    Returns:
        shuffle_indices:
            Indices to reorder data for balanced loading
        gpu_sample_counts:
            Number of samples assigned to each GPU
        grouped_sizes_per_gpu:
            Total size assigned to each GPU

    Example:
        ```
        sizes = [1000, 100, 200, 50]
        num_gpus = 2
        ```

    r   c                 S   s   g | ]}t t  qS r   )r   r   )r   _r   r   r   rU         z0get_dp_encoder_lb_assignment.<locals>.<listcomp>c                        |  S r
   r   rB   )r   r   r   <lambda>      z.get_dp_encoder_lb_assignment.<locals>.<lambda>T)keyreversec                    r   r
   r   r   )	gpu_loadsr   r   r     r   )r   )lenr=   sortedr   r?   r   r   extend)
r   r   	n_samplesgpu_assignmentslarge_to_small_indicesidxmin_gpushuffle_indicesgpu_sample_countsgpu_idr   )r   r   r   get_dp_encoder_lb_assignmentj  s$   



r   image_inputvision_modelc                 C   s   | j d }t }|| d | }|| | }dd|  d   d|f }tjj| |}t }||| |d | df }	||	}
|
j	 }
t
|
dd}
|
d|df }
|
S )aW  Run a vision model with data parallelism (DP) sharding. The function
    will shard the input image tensor on the first dimension and run the vision
    model

    Args:
        image_input (torch.Tensor): Image input tensor.
        vision_model (torch.nn.Module): Vision model.
    Returns:
        torch.Tensor: Output image embeddings
    r   rO   )r   r)   .rZ   N)r   r   rZ   torchnn
functionalr   r   last_hidden_state
contiguousr   )r   r   
num_chunksmp_world_sizenum_chunks_per_ranknum_padded_chunksr   image_input_paddedrankimage_input_per_rankvision_embeddingsr   r   r   run_dp_sharded_vision_model  s   

r   rn   grid_thw_list	rope_type)rope_3drope_2dc          '         sH  ddl m}m}m} | }|dkr| tdS | }dd D }	dgt|	 t|	|\}
}}dgt|}|
|| ||d   }t	|dkr\t
 fdd|D }ntjdjd fjjd}|d	krz| jd | jd  n| j| j t| }fd
d|D }|d	kr|jd dkr| |t|}t|trtj
|dd}n2t| jdd}tjd|fjjd}n|jd dkr| |t|}ntjd| jfjjd}|jd }||k r!|| }|d	krtj||jd |jd f|j|jd}ntj||jd f|j|jd}tj
||gdd}n|}| j|dd}ttj  }t|D ]}|| }|||   }||||  q5fdd|	D }dgt	 }d}t|D ]6}|| } | dkr|
|||   }!|| }"d}#|!D ]}$||$ }%|"|#|#|%  ||$< |#|%7 }#q|| 7 }qdtj
|dd}&|&S )a  Run a vision model with data parallelism (DP) sharding.
    The function will shard the input image tensor on the
    first dimension and run the vision model.
    This function is used to run the vision model with mrope.

    Args:
        vision_model (torch.nn.Module): Vision model.
        pixel_values (torch.Tensor): Image/Video input tensor.
        grid_thw_list: List of grid dimensions for each image
        rope_type: Type of rope used in the vision model.
                   Different rope types have different dimension to do ViT.
                   "rope_3d" for 3D rope (e.g., Qwen2.5-VL)
                   "rope_2d" for 2D rope (e.g., Kimi-VL)
    Returns:
        torch.Tensor: Output image embeddings

    Example:
        ```
        vision_model.out_hidden_size = 64
        vision_model.spatial_merge_size = 2
        pixel_values.shape = (1350, channel)
        grid_thw_list = [[1, 10, 100], [1, 10, 10], [1, 10, 20], [1, 50]]
        tp_size = 2
        ```

    r   )get_attention_tp_groupget_attention_tp_rankget_attention_tp_sizerO   )grid_thwc                 S   s   g | ]}t |qS r   )r+   prod)r   r   r   r   r   rU     r   z5run_dp_sharded_mrope_vision_model.<locals>.<listcomp>c                    s$   g | ]} |  |d    qS rN   r   r   rB   )cum_patches_per_imagern   r   r   rU     rs   )devicedtyper   c                    s   g | ]} | qS r   r   r   )r   r   r   rU   6  r\   r   hidden_sizeNr)   )r   r   c                    s   g | ]}|  qS r   r   )r   r@   )embed_dim_reduction_factorr   r   rU   |  s    )sglang.srt.layers.dp_attentionr   r   r   r   r   	itertools
accumulater   r   catemptyr   r   r   merge_kernel_sizespatial_merge_sizemaxr   r   r   configout_hidden_size
all_gatherTensorr=   r?   )'r   rn   r   r   r   r   r   tp_sizetp_rank_localpatches_per_imageimage_to_tp_rankr   grouped_pixel_values_lencum_gpu_sample_countsimage_idxs_localpixel_values_localmax_len_per_ranklocal_grid_thw_listimage_embeds_localout_dimcurrent_lenpadding_sizer   image_embeds_local_paddedgathered_embedsrank_embeddingsr   	start_idxend_idxpatches_per_output_imageoriginal_order_embeddingscurrent_idxcountrank_images
rank_embedembed_startimg_idximg_patchesout_embeddingsr   )r   r   r   rn   r   !run_dp_sharded_mrope_vision_model  s   !











r  )r)   )+__doc__rf   r   r+   ra   ior   typingr   numpyrx   r   r   PILr   sglang.srt.distributedr   r   'sglang.srt.distributed.communication_opr   sglang.srt.utilsr   boolr   r'   r<   rF   rk   r   r   r   r   r   r   r   r   rc   r   r   r   Moduler   r  r   r   r   r   <module>   sb   (#(C
F
"