o
    iz                     @   s|  U d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1m2Z2 ee3Z4ee5e5dB f Z6	 eG dd dZ7G dd dZ8dddddede5dedB de9e5 dB dej:f
d d!Z;ed"ej<dej<fd#d$Z=ed"e9ej< de9ej< fd%d$Z=ed"e9ej< ej<B d&ed' dej<fd(d$Z=ed)d*d"e9ej< ej<B d&e>de9ej< ej<B fd+d$Z=d)d*d"e9ej< ej<B d&e>de9ej< ej<B fd,d$Z=d-e(dej<fd.d/Z?d-e(de5fd0d1Z@d2ej<d3eAde9e9eA  fd4d5ZBd6ej<d7e(d8ej<dej<fd9d:ZCd;ej<d<e9eA dej<fd=d>ZDG d?d@ d@ej:ZEeddAdBej:dCeFej: eGeFej: dDf B dB fdEdFZHeddAdBej:dGeej:gej:f dCeFej: eGeFej: dDf B dB fdHdIZIG dJdK dKeZJG dLdM dMejjKZLd aMd aNdNeAddfdOdPZOdBejj:dejj:fdQdRZPdSeAdTeJde5deGeAeAejjQf fdUdVZRi ZSeTeAe9e5 f eUdW< dXejj:de9e5 fdYdZZVd[e5dXejj:de>fd\d]ZWd^e9e5 d_eAfd`daZXde5d[e5de5fdbdcZYdede dB fdddeZZddge5dheAdeAfdidjZ[	kddlej<dme\dej<fdndoZ]dpej<dqeAdreAdeGej<ej<f fdsdtZ^d"ej<dej<fdudvZ_d"ej<dej<fdwdxZ`d"ej<dej<fdydzZae1dxe`eaejbjcfd{ dXej:d[e5ddfd|d}Zdd~eAdSeAdeAfddZedS )    N)CallableIterableMapping)contextmanager)	dataclassfield)AnyLiteralProtocoloverload)functional_call)(register_module_module_registration_hook)PretrainedConfig)
VllmConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)init_logger)QuantizationConfig).support_quantized_model_reload_from_hp_weights)default_weight_loader)supports_any_eagle)NestedTensorsIntermediateTensors)cdiv)is_pin_memory_availableis_uva_available)direct_register_custom_op$get_accelerator_view_from_cpu_tensorc                   @   s   e Zd ZU dZeedZeed< eedZ	eed< eedZ
eed< ddd	Zd
ededB fddZdeeeejf  deeeejf  fddZdee dee fddZdeeef deeef fddZdS )WeightsMapperzBMaps the name of each weight if they match the following patterns.)default_factoryorig_to_new_substrorig_to_new_prefixorig_to_new_suffixotherreturnc                 C   s2   t i | j|ji | j|ji | j|jdS )z7Combine two `WeightsMapper`s by merging their mappings.)r!   r"   r#   )r   r!   r"   r#   )selfr$    r'   V/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/utils.py__or__8   s
   zWeightsMapper.__or__keyNc                 C   s   | j  D ]\}}||v r|d u r d S |||d}q| j D ]\}}||r8|d u r1 d S |||d}q!| j D ]\}}||rW|d u rN d S |||d}q>|S )N   )	r!   itemsreplacer"   
startswithr#   endswithjoinrsplit)r&   r*   substrnew_keyprefixsuffixr'   r'   r(   	_map_name@   s&   

zWeightsMapper._map_nameweightsc                        fdd|D S )Nc                 3   s.    | ]\}} |  d ur |fV  qd S Nr6   ).0namedataout_namer&   r'   r(   	<genexpr>[   s    z&WeightsMapper.apply.<locals>.<genexpr>r'   )r&   r7   r'   r>   r(   applyX   s   zWeightsMapper.applyvaluesc                    r8   )Nc                    s"   g | ]} |  d ur qS r9   r:   )r;   r<   r>   r'   r(   
<listcomp>b   s
    z,WeightsMapper.apply_list.<locals>.<listcomp>r'   r&   rB   r'   r>   r(   
apply_lista   s   zWeightsMapper.apply_listc                    s    fdd|  D S )Nc                    s(   i | ]\}} |  d ur |qS r9   r:   )r;   r<   valuer>   r'   r(   
<dictcomp>i   s
    z,WeightsMapper.apply_dict.<locals>.<dictcomp>)r,   rD   r'   r>   r(   
apply_dicth   s   zWeightsMapper.apply_dict)r$   r   r%   r   )__name__
__module____qualname____doc__r   dictr!   WeightsMapping__annotations__r"   r#   r)   strr6   r   tupletorchTensorrA   listrE   r   rH   r'   r'   r'   r(   r   0   s   
 

	&r   c                       s  e Zd ZdZg dZddddddejdee dB dee dB dee dB d	ee dB d
df fddZ	de
eeejf  d
e
eee
eeejf  f  fddZdeded
efddZded
efddZded
efddZdedejde
eeejf  d
e
e fddZdejdeeejf fddZdedejde
eeejf  d
e
e fd d!Zedd"de
eeejf  d#edB d
ee fd$d%Z  ZS )&AutoWeightsLoadera"  
    Helper class to load weights into a [`torch.nn.Module`][]. It is able
    to automatically detect child modules and parameters while iterating over
    the weights only once.

    The weight loading logic for individual modules can be overridden
    by defining a `load_weights` method.

    Similarly, the weight loading logic for individual parameters can be
    overridden by defining a `weight_loader` method.

    Detailed weight loading information can be viewed by setting the
    environment variable `VLLM_LOGGING_LEVEL=DEBUG`.
    )zrotary_pos_emb.inv_freqzrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedN)skip_prefixesskip_substrsignore_unexpected_prefixesignore_unexpected_suffixesmodulerV   rW   rX   rY   r%   c                   sL   t    || _|pg | _|pg | _|pg | _|pg | _|  j| j7  _d S r9   )super__init__rZ   rV   rW   rX   rY   ROTARY_EMBEDS_UNUSED_WEIGHTS)r&   rZ   rV   rW   rX   rY   	__class__r'   r(   r\      s   
	



zAutoWeightsLoader.__init__r7   c                 c   sD    dd |D }t j|dd dD ]\}}|dd |D fV  qd S )Nc                 s   s$    | ]\}}| d d|fV  qdS ).r+   N)split)r;   weight_nameweight_datar'   r'   r(   r@      s
    
z4AutoWeightsLoader._groupby_prefix.<locals>.<genexpr>c                 S   s   | d d S Nr   r'   xr'   r'   r(   <lambda>       z3AutoWeightsLoader._groupby_prefix.<locals>.<lambda>)r*   c                 s   s0    | ]\}}t |d krdn|d  |fV  qdS )r+    N)len)r;   partsweights_datar'   r'   r(   r@      s
    
)	itertoolsgroupby)r&   r7   weights_by_partsr4   groupr'   r'   r(   _groupby_prefix   s   z!AutoWeightsLoader._groupby_prefixr4   restc                 C   s&   |dkr|S |dkr|S d ||fS )Nri   r`   )r0   )r&   r4   rr   r'   r'   r(   _get_qualname   s
   zAutoWeightsLoader._get_qualnamequalnamec                    s0   t  fdd| jD pt  fdd| jD S )Nc                 3       | ]}  |V  qd S r9   r.   r;   prt   r'   r(   r@          z.AutoWeightsLoader._can_skip.<locals>.<genexpr>c                 3   s    | ]}| v V  qd S r9   r'   )r;   r2   ry   r'   r(   r@      s    
)anyrV   rW   )r&   rt   r'   ry   r(   	_can_skip   s   $zAutoWeightsLoader._can_skipc                    s8    fdd| j D } fdd| jD }t|pt|S )Nc                 3   ru   r9   rv   rw   ry   r'   r(   r@      rz   z;AutoWeightsLoader._can_ignore_unexpected.<locals>.<genexpr>c                 3   ru   r9   )r/   )r;   sry   r'   r(   r@      rz   )rX   rY   r{   )r&   rt   iupiusr'   ry   r(   _can_ignore_unexpected   s   z(AutoWeightsLoader._can_ignore_unexpectedbase_prefixparamc                 c   s    |D ]F\}}|  ||}| |rtd| q|dkr3| |r)td| qtd|d|t|dt}||| td||j |V  qd S )NzSkipping weight %sri   zIgnoring weight %sz Attempted to load nested weight z into a single parameter weight_loaderzLoaded weight %s with shape %s)	rs   r|   loggerdebugr   
ValueErrorgetattrr   shape)r&   r   r   r7   rb   rc   weight_qualnamer   r'   r'   r(   _load_param   s(   


zAutoWeightsLoader._load_paramchild_paramsc              	   C   sL   t |tjtjtjtjtjtjtjfr"|	 }dD ]
}|| ||< qdS dS )z
        Add tensor names that are not in the model params that may be in the
        safetensors, e.g., batch normalization stats.
        )running_meanrunning_varnum_batches_trackedN)

isinstancennBatchNorm1dBatchNorm2dBatchNorm3dLazyBatchNorm1dLazyBatchNorm2dLazyBatchNorm3dSyncBatchNorm
state_dict)r&   rZ   r   module_state_dict	stat_namer'   r'   r(   _add_loadable_non_param_tensors   s    z1AutoWeightsLoader._add_loadable_non_param_tensorsc                 #   s   t |ttfr
d S |jkr5t|dd }t|r5||}|d u r(td| nt fdd|E d H  t	|
 }t	|jdd}|| |D ]\}}	 |}
||v rw|
d rjtd|
 qN|
|| |	E d H  qN||v r|
rtd	|
 qN|
|| |	E d H  qN|
d }|
}|s|rtd
|
 qN|
d }|
}|s|rtd|
 qN fdd|jddD }d|
dj  d  d|  d| 
}t|d S )Nload_weightsz1Unable to collect loaded parameters for module %sc                    s     | S r9   )rs   re   r   r&   r'   r(   rg     rh   z0AutoWeightsLoader._load_module.<locals>.<lambda>F)recurser`   zSkipping module %szSkipping param %szSkipping missing %szIgnoring missing %sc                    s   h | ]\}} | qS r'   r'   )r;   k_)r   r'   r(   	<setcomp>=  s    z1AutoWeightsLoader._load_module.<locals>.<setcomp>Tz&There is no module or parameter named z in z(. The available parameters belonging to z (z) are: )r   StageMissingLayerPPMissingLayerrZ   r   callabler   warningmaprM   named_childrennamed_parametersr   rq   rs   r|   r   _load_moduler   r   	_get_namer   )r&   r   rZ   r7   module_load_weightsloaded_paramschild_modulesr   child_prefixchild_weightsr4   can_skip_modulecan_skip_paramcan_ignore_modulecan_ignore_paramdesc_param_keysmsgr'   r   r(   r      sv   








zAutoWeightsLoader._load_module)mapperr   c                   s<   |d ur	| |} fdd|D }t d j|}|S )Nc                 3   s&    | ]\}}  |s||fV  qd S r9   )r|   )r;   r<   weightr&   r'   r(   r@   R  s    
z1AutoWeightsLoader.load_weights.<locals>.<genexpr>ri   )rA   setr   rZ   )r&   r7   r   autoloaded_weightsr'   r   r(   r   H  s   

zAutoWeightsLoader.load_weights)rI   rJ   rK   rL   r]   r   ModulerT   rP   r\   r   rQ   rR   rS   rq   rs   boolr|   r   	Parameterr   rM   r   r   r   r   r   r   __classcell__r'   r'   r^   r(   rU   p   sx    





 

OrU   ri   )r4   	hf_configarchitecturesvllm_configr4   r   r   r%   c                C   sF   ddl m} |du r|dur| jj}|dur| j||d} || |dS )z
    Helper function to initialize an inner model registered to vLLM,
    based on the arguments passed to the outer vLLM model.
    r   )initialize_modelN)r   )r   r4   )&vllm.model_executor.model_loader.utilsr   model_configr   with_hf_config)r   r4   r   r   r   r'   r'   r(   init_vllm_registered_modelZ  s   r   rf   c                 C      d S r9   r'   re   r'   r'   r(   
flatten_bnq     r   c                 C   r   r9   r'   re   r'   r'   r(   r   u  r   concatTc                C   r   r9   r'   rf   r   r'   r'   r(   r   y     F)r   c                C   r   r9   r'   r   r'   r'   r(   r     r   c                C   s4   t | tjr| ddS |rt| S dd | D S )z
    Flatten the `B` and `N` dimensions of batched multimodal inputs.

    The input tensor should have shape `(B, N, ...)`.
    r   r+   c                 S   s   g | ]	}|D ]}|qqS r'   r'   )r;   x_bx_nr'   r'   r(   rC     s    zflatten_bn.<locals>.<listcomp>)r   rR   rS   flattencatr   r'   r'   r(   r     s
   


embeddingsc                 C   s0   t | tjr| ddS ttdd | D S )z`
    Recursively flattens and concatenates NestedTensors on all but the last
    dimension.
    r   c                 s       | ]}t |V  qd S r9   )_flatten_embeddings)r;   tr'   r'   r(   r@         z&_flatten_embeddings.<locals>.<genexpr>)r   rR   rS   r   r   rQ   r   r'   r'   r(   r     s   r   c                 C   s>   t | tjrddd | jdd D S ddd | D S )	ze
    Constructs a debugging representation of the number of embeddings in the
    NestedTensors.
    z x c                 S   s   g | ]}t |qS r'   )rP   )r;   dimr'   r'   r(   rC     s    z/_embedding_count_expression.<locals>.<listcomp>Nz + c                 s   r   r9   )_embedding_count_expression)r;   innerr'   r'   r(   r@     r   z._embedding_count_expression.<locals>.<genexpr>)r   rR   rS   r0   r   r   r'   r'   r(   r     s   r   lstintervalc                 C   sB   dd t t| | d D }| D ]}|| }|| | q|S )Nc                 S   s   g | ]}g qS r'   r'   r;   r   r'   r'   r(   rC     s    z*split_list_into_ranges.<locals>.<listcomp>r+   )rangemaxappend)r   r   rangesnumindexr'   r'   r(   split_list_into_ranges  s
   r   inputs_embedsmultimodal_embeddingsis_multimodalc           	      C   s   t |dkr| S t|}| j}z| |d|j|d W | S  tyQ } z&t |}|  }||krHt	|}t
d| d| d| d|t
d|d	}~ww )
z
    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
    positions in `inputs_embeds` corresponding to placeholder tokens in
    `input_ids`.

    Note:
        This updates `inputs_embeds` in place.
    r   r   )dtypezAttempted to assign z = z multimodal tokens to z placeholdersz%Error during masked scatter operationN)rj   r   r   masked_scatter_	unsqueezetoRuntimeErrorsumitemr   r   )	r   r   r   mm_embeds_flatinput_dtypeenum_actual_tokensnum_expected_tokensexprr'   r'   r(   _merge_multimodal_embeddings  s0   
r   elementstest_elements_listc                 C   s(   t j|t dj| jdd}t | |S )N)
pin_memoryT)devicenon_blocking)rR   tensorr   r   r   isin)r   r   test_elementsr'   r'   r(   	isin_list  s   r   c                       sV   e Zd ZddedejdB ddf fddZdefdd	Zd
d ZdefddZ	  Z
S )r   N
stage_namerZ   r%   c                    s   t    || _|| jd< d S NrZ   )r[   r\   r   __dict__)r&   r   rZ   r^   r'   r(   r\     s   
zStageMissingLayer.__init__r<   c                 C   s   t | jd |S r  )r   r  )r&   r<   r'   r'   r(   __getattr__   s   zStageMissingLayer.__getattr__c                 O   s   t |  d)Nz should not be called)r   r&   argskwargsr'   r'   r(   __call__     zStageMissingLayer.__call__c                 C   s   d| j S )Nzstage_name=)r   r   r'   r'   r(   
extra_repr  s   zStageMissingLayer.extra_reprr9   )rI   rJ   rK   rP   r   r   r\   r  r  r	  r   r'   r'   r^   r(   r     s
    "	r   )targetsrZ   r
  .c                #   s    t t   |du r4dtjdtdtjf fdd}t|  V  W d   dS 1 s-w   Y  dS  V   D ]\}}t||rI | q;dS )aA  
    Within this context, collect all direct child assignments to `module`,
    returning a list of children names that is internally updated until the
    context is exited.

    If `targets` is set, instead collect descendents of `module`
    that are an instance of `targets`, even if they aren't direct children.
    Nmodule_r<   	submodulec                    s   | u r  | d S d S r9   )r   r  r<   r  children_namesrZ   r'   r(   hook  s   zcollect_children.<locals>.hook)rT   rP   r   r   r   named_modulesr   r   )rZ   r
  r  r<   r  r'   r  r(   collect_children
  s   
 
"

r  placeholderc             	   #   s    du rLdt jdtdt jf fdd}t|) td dV  W d   n1 s-w   Y  W d   dS W d   dS 1 sEw   Y  dS dt jdtdt jffdd}t| dV  W d   dS 1 sow   Y  dS )	al  
    Within this context, prevent weight initialization from using device memory and
    replace direct child assignments to `module` with the result of `placeholder()`.

    If `targets` is set, instead prevent weight initialization and
    replace assignments where the child is an instance of `targets`,
    even if they aren't direct children of `module`.
    Nr  r<   r  c                    s   |  u r|S |S r9   r'   r  )rZ   r  r'   r(   r  ;  s   zno_init_weights.<locals>.hookmetac                    s4   t | r
|d t |r|d  |S |S )Nr  )r   r   r  )r  r
  r'   r(   r  E  s   



)r   r   rP   r   rR   r   )rZ   r  r
  r  r'   )rZ   r  r
  r(   no_init_weights*  s    P 
"r  c                   @   s"   e Zd ZdedejjfddZdS )LayerFnr4   r%   c                 C   r   r9   r'   )r&   r4   r'   r'   r(   r  U  s    zLayerFn.__call__N)rI   rJ   rK   rP   rR   r   r   r  r'   r'   r'   r(   r  T  s    r  c                       s(   e Zd ZdZ fddZdd Z  ZS )r   zN
    A placeholder layer for missing layers in a pipeline parallel model.
    c                    s   t    d S r9   )r[   r\   r  r^   r'   r(   r\   ]  r  zPPMissingLayer.__init__c                 O   s   |r|d S t t| S )z>Return the first arg from args or the first value from kwargs.r   )nextiterrB   r  r'   r'   r(   forward`  s   zPPMissingLayer.forward)rI   rJ   rK   rL   r\   r  r   r'   r'   r^   r(   r   X  s    r   	max_bytesc                 C   s   da | ad S rd   )_CPU_OFFLOAD_BYTES_CPU_OFFLOAD_MAX_BYTES)r  r'   r'   r(   set_cpu_offload_max_bytesi  s   r  c              	      s  t  d  }d u rS |j  tdkrS ttkrS t }t }|s+J dd}d} D ]?}ttkr; n8tj|j	
 |j	 |j	j|j	jd|d}||j	 |s\||_	n||_t||_	t|j	 |j	  7 ad}q3|r|sj fdd_S )Ncpuz3V1 CPU offloading requires uva (pin memory) supportTF)sizestrider   layoutr   r   c                     s:   _  fdd  D }t|| |d}_ |S )Nc                    s    i | ]\}}||j  d dqS )T)r   )r   )r;   r   v)r   r'   r(   rG     s    z9maybe_offload_to_cpu.<locals>.forward.<locals>.<dictcomp>)r  r  )r  r   r,   r   )r  r  device_stateoutputr   r  rZ   original_forwardr'   r(   r    s   

z%maybe_offload_to_cpu.<locals>.forward)r  
parametersr   rR   r  r  r   r   empty_stridedr=   r  r   r   r!  copy__vllm_offloaded_cpu_datar   numelelement_sizer  )rZ   paramsr   uva_availableuva_offloadingoffloaded_parametersrx   cpu_datar'   r%  r(   maybe_offload_to_cpuo  sF   
r2  num_hidden_layerslayer_fnc                    s   ddl m} ddlm} || | j| j\}}tjdd t	|D  fddt	||D  dd t	|| D  }|||fS )zgMake a list of layers with the given layer function, taking
    pipeline parallelism into account.
    r   )get_pp_group)get_pp_indicesc                 S      g | ]}t  qS r'   r   r   r'   r'   r(   rC         zmake_layers.<locals>.<listcomp>c                    s$   g | ]}t   d | dqS )r`   )r4   )r2  )r;   idxr4  r4   r'   r(   rC     s    c                 S   r7  r'   r8  r   r'   r'   r(   rC     r9  )
vllm.distributed.parallel_stater5  vllm.distributed.utilsr6  rank_in_group
world_sizerR   r   
ModuleListr   )r3  r4  r4   r5  r6  start_layer	end_layermodulesr'   r;  r(   make_layers  s   
rD   _model_to_pp_missing_layer_namesmodelc                 C   sV   t | }|tv rt| S g }|  D ]\}}t|ttfr$||d  q|t|< |S )zAGet the names of the missing layers in a pipeline parallel model.r`   )idrE  r  r   r   r   r   )rF  model_idmissing_layer_namesr<   rZ   r'   r'   r(   get_pp_missing_layer_names  s   rJ  r<   c                    s,   t |ttfr	dS t fddt|D S )z=Check if a parameter is missing in a pipeline parallel model.Tc                 3   ru   r9   rv   )r;   missing_layer_namer<   r'   r(   r@     s
    
z*is_pp_missing_parameter.<locals>.<genexpr>)r   r   r   r{   rJ  )r<   rF  r'   rL  r(   is_pp_missing_parameter  s
   rM  keyshidden_sizec                    s(   dt dtjdtjdtf fdd}|S )N
batch_sizer   r   r%   c                    s   t  fddD S )Nc                    s"   i | ]}|t j fd qS )r   r   )rR   zeros)r;   r*   )rP  r   r   rO  r'   r(   rG     s    zdmake_empty_intermediate_tensors_factory.<locals>.make_empty_intermediate_tensors.<locals>.<dictcomp>r   )rP  r   r   rO  rN  )rP  r   r   r(   make_empty_intermediate_tensors  s
   zPmake_empty_intermediate_tensors_factory.<locals>.make_empty_intermediate_tensors)intrR   r   r   r   )rN  rO  rT  r'   rS  r(   'make_empty_intermediate_tensors_factory  s   rV  c                 C   s   | s|S |  d| S )a  Add a prefix to a name if the prefix is non-empty.

    Args:
        prefix: The prefix to add. If empty, no prefix will be added.
        name: The name to potentially prefix.

    Returns:
        The string "prefix.name" if prefix was non-empty, otherwise just "name".
    r`   r'   )r4   r<   r'   r'   r(   maybe_prefix  s   
rW  c                 C   s"   | j j}| j}|rt||S dS )ae  Get quantization config for Draft models.

    Draft models should use their own quantization config instead of the verifier/target
    model's config. This helper retrieves the draft model's quantization config.

    Args:
        vllm_config: The vLLM configuration object.

    Returns:
        The draft model's config if available, None otherwise.
    N)speculative_configdraft_model_configload_configr   get_quantization_config)r   rY  draft_load_configr'   r'   r(   get_draft_quant_config	  s   r]  r+   
layer_namenum_attn_modulec              	   C   s   |  d}g }|D ]}z	|t| W q	 ty   Y q	w |dks&d| vr8t|dks4J d|  d|d S t|dksFJ d|  dt|dkrX|d | |d  }|S |d }|S )	z
    Extract the layer index from the module name.
    Examples:
    - "encoder.layers.0" -> 0
    - "encoder.layers.1.self_attn" -> 1
    - "2.self_attn" -> 2
    - "model.encoder.layers.0.sub.1" -> ValueError if num_attn_module == 1
    r`   r+   attnzlayer name z  should only contain one integerr      z! should contain most two integers)ra   r   rU  r   rj   )r^  r_  subnamesint_valssubnamelayer_indexr'   r'   r(   extract_layer_index!  s.   
	

rf    tensorsoffsetc                 C   s@   |    s|   rt| jj| }tj| | |d} | S )N)minr   )isinfr{   isnanrR   finfor   r   clamp)rh  ri  clamp_valuer'   r'   r(   cast_overflow_tensorsC  s   rp  rB   topkr   c                 C   s(   |dkrt j| |ddS t j| ||dS )a!  
    Optimized topk implementation that uses torch.max for k=1 case.

    This function provides better performance for the common case of k=1
    by using torch.max instead of the more general torch.topk.

    Args:
        values: Input tensor to find top-k values from
        topk: Number of top values to return (k). Must be > 0.
        dim: Dimension along which to compute topk

    Returns:
        Tuple of (values, indices) where values are the top-k values
        and indices are their corresponding indices in the input tensor
    r+   T)r   keepdim)r   )rR   r   rq  )rB   rq  r   r'   r'   r(   	fast_topkM  s   rs  c                 C   s   t jj| S r9   )rR   opsvllmsequence_parallel_chunk_implre   r'   r'   r(   sequence_parallel_chunkk  r  rw  c           	      C   sp   t  }t }| d}|| }|dkr#|| }tj| ddd|f}n| }|jd | }|| }t|d||S rd   )	r   r   r  r   
functionalpadr   rR   narrow)	rf   tp_sizetp_rankseq_len	remainderpad_lenychunkstartr'   r'   r(   rv  o  s   
rv  c                 C   s@   t  }t| d|}t| j}||d< tj|| j| jd}|S )Nr   rQ  )	r   r   r  rT   r   rR   emptyr   r   )rf   r{  r}  r   outr'   r'   r(   !sequence_parallel_chunk_impl_fake  s   
r  )op_nameop_func	fake_impltagsc                 C   s0   t | sdS d|v rd| _d|v rd| _dS dS )a%  
    Update EAGLE model flags based on loaded weight name.
    This should be called during weight loading to detect if a model
    has its own lm_head or embed_tokens weight.
    Args:
        model: The model instance (must support EAGLE)
        name: The name of the weight to process
    Nlm_headTembed_tokens)r   has_own_lm_headhas_own_embed_tokens)rF  r<   r'   r'   r(   process_eagle_weight  s   
r  feature_layer_indexc                 C   s   | dk r
||  d S | S )a  Given a signed vision feature layer, get the number of hidden layers
       needed to leverage it.

    Args:
        feature_layer_index: Index of a required layer in the visual encoder.
        num_hidden_layers: The total number of hidden layers in the visual encoder.
    r   r+   r'   )r  r3  r'   r'   r(   get_layer_index  s   r  )r+   )rg  )frm   collections.abcr   r   r   
contextlibr   dataclassesr   r   typingr   r	   r
   r   rR   torch.nnr   
torch.funcr   torch.nn.modules.moduler   transformersr   vllm.configr   vllm.distributedr   r   vllm.loggerr   3vllm.model_executor.layers.quantization.base_configr   'vllm.model_executor.model_loader.reloadr   -vllm.model_executor.model_loader.weight_utilsr   %vllm.model_executor.models.interfacesr   vllm.multimodalr   vllm.sequencer   vllm.utils.math_utilsr   vllm.utils.platform_utilsr   r   vllm.utils.torch_utilsr   r   rI   r   rP   rN   r   rU   rT   r   r   rS   r   r   r   r   rU  r   r   r   r   typerQ   r  r  r  Identityr   r  r  r  r2  r@  rD  rE  rM   rO   rJ  rM  rV  rW  r]  rf  floatrp  rs  rw  rv  r  Tagneeds_fixed_stride_orderr  r  r'   r'   r'   r(   <module>   sP  
? n

 

 
-
)C

$


	
