o
    
۾i[                     @   s  d dl mZmZmZ d dlmZmZ d dlZd dlm	Z	 d dl
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z: ddl;m<Z< ddl;m=Z> ddl?m@Z@mAZAmBZB ddlCmDZDmEZEmFZF ddlGmHZHmIZImJZJmKZK G dd de9ZLG dd  d e>eBZMG d!d" d"e	jNZOG d#d$ d$e	jNZPG d%d& d&eZQG d'd( d(e	jNZRG d)d* d*eDZSG d+d, d,eFeBZTG d-d. d.e3ZUG d/d0 d0e1eU ZVG d1d2 d2e2eU ZWe)jXeWeUeVd3G d4d5 d5e	jNeAZYdS )6    )IterableMappingSequence)	AnnotatedLiteralN)
AriaConfigAriaTextConfigBatchFeature)AriaCrossAttention)AriaProcessor)
VllmConfig)BaseDummyOptions)get_tensor_model_parallel_rank)
get_act_fn)SharedFusedMoE)ColumnParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHead)default_weight_loadermaybe_remap_kv_scale_name)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItems)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)TensorSchemaTensorShape   )Idefics2VisionConfig)Idefics2VisionTransformer)MultiModalEmbeddingsSupportsMultiModalSupportsQuant)LlamaDecoderLayerLlamaMLP
LlamaModel)AutoWeightsLoaderWeightsMapperis_pp_missing_parametermaybe_prefixc                   @   sX   e Zd ZU dZed ed< eeje	ddddf ed< eejdB e	dddf ed	< dS )
AriaImagePixelInputsz
    Dimensions:
        - b: Batch size
        - n: Number of images
        - c: Number of channels
        - h: Height of each image
        - w: Width of each image
    pixel_valuestypebn   hwN
pixel_mask)
__name__
__module____qualname____doc__r   __annotations__r   torchTensorr$    rA   rA   S/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/aria.pyr2   8   s   
 	
r2   c                	       sh   e Zd Zdg diZ		ddededB deddf fd	d
Zdee	ee
jf  dee fddZ  ZS )AriaVisionTransformerqkv_projq_projk_projv_projN configquant_configprefixreturnc                    s    t  j|||d t | _d S )NrK   rL   )super__init__nnIdentitypost_layernormselfrJ   rK   rL   	__class__rA   rB   rP   R   s   zAriaVisionTransformer.__init__weightsc                 C   s   g d}t |  }t }|D ]>\}}d|v rq|D ]\}}}	||vr$q|||}|| }
|
j}||
||	  n|| }
t|
dt}||
| || q|S )N))rD   rF   q)rD   rG   k)rD   rH   vrS   weight_loader)dictnamed_parameterssetreplacer\   getattrr   add)rU   rX   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr\   rA   rA   rB   load_weights^   s&   
z"AriaVisionTransformer.load_weights)NrI   )r:   r;   r<   packed_modules_mappingr&   r   strrP   r   tupler?   r@   r_   rl   __classcell__rA   rA   rV   rB   rC   O   s    ,rC   c                       sL   e Zd Z	ddededededdf
 fdd	Zd
ejdejfddZ  Z	S )AriaProjectorMLPrI   in_featureshidden_features
output_dimrL   rM   Nc                    sH   t    t||d| dd| _t||d| dd| _td| _d S )NFz
.linear_in)biasrL   z.linear_outgelu_new)rO   rP   r   	linear_inr   
linear_outr   act)rU   rr   rs   rt   rL   rV   rA   rB   rP   }   s   
zAriaProjectorMLP.__init__hidden_statesc                 C   s*   |  |\}}| |}| |\}}|S N)rw   ry   rx   )rU   rz   _rA   rA   rB   forward   s   
zAriaProjectorMLP.forwardrI   )
r:   r;   r<   intrn   rP   r?   r@   r}   rp   rA   rA   rV   rB   rq   |   s    rq   c                       sT   e Zd ZdZddededdf fddZ	dd	ejd
ejdB dejfddZ	  Z
S )AriaProjectora  
    A projection module with one cross attention layer and one FFN layer, which
    projects ViT's outputs into MoE's inputs.

    Args:
        config: [AriaConfig](https://huggingface.co/docs/transformers/main/model_doc/aria#transformers.AriaConfig)
            containing projector configuration parameters.

    Outputs:
        A tensor with the shape of (batch_size, query_number, output_dim)
    rI   rJ   rL   rM   Nc                    s   t    |j| _|jj| _|jj| _|jj| _	|j
j| _|j
j| _tt|j| j| _t|| _t| j| _t| j| j| j| dd| _d S )Nz.feed_forwardrL   )rO   rP   projector_patch_to_query_dictpatch_to_query_dictvision_confighidden_sizerr   num_attention_heads	num_headskv_dimtext_configrs   rt   rQ   	Parameterr?   empty'max_value_projector_patch_to_query_dictqueryr
   
cross_attn	LayerNorm
layer_normrq   feed_forward)rU   rJ   rL   rV   rA   rB   rP      s(   






zAriaProjector.__init__x	attn_maskc           	      C   s   |j d |j d }}|| jvrtd| d| j  d| j| }| jd | d|dd}|d urJ|| jd}|d	d|
dd}| j|||d}| | |}|S )Nr   r%   zNumber of patches z: not found in patch_to_query_dict amongst possible values .)r   )shaper   KeyErrorkeysr   	unsqueezerepeatrepeat_interleaver   expandsizer   r   r   )	rU   r   r   
batch_sizenum_patches	query_numqueriesattention_outoutrA   rA   rB   r}      s   

zAriaProjector.forwardr~   r{   )r:   r;   r<   r=   r   rn   rP   r?   r@   r}   rp   rA   rA   rV   rB   r      s    r   c                   @   s*   e Zd ZdejdejdeddfddZdS )AriaFusedMoErk   rg   rj   rM   Nc                 C   s   t  }|dkrJ| jdkr>|jddd\}}|j| jdd| }|j| jdd| }tj||gdddd}	|j|	 d S |j|dd d S |dkru| jdkri|j| jdd| }
|j|
dd d S |j|dd d S d S )Nw13r%      r   dimw2)r   tp_sizechunkr?   cat	transposedatacopy_)rU   rk   rg   rj   tp_rankupgateup_current_rankgate_current_rankup_and_gatedown_current_rankrA   rA   rB   r\      s&   

zAriaFusedMoE.weight_loader)	r:   r;   r<   rQ   r   r?   r@   rn   r\   rA   rA   rA   rB   r      s    r   c                	       sP   e Zd ZdZ	ddededB deddf fdd	Zd
ej	dej	fddZ
  ZS )AriaTextMoELayera  
    Mixture of Experts (MoE) Layer for the AriaMoE model.

    This layer implements the MoE mechanism, which routes input tokens to
    different experts based on a routing algorithm, processes them through the
    experts, and then combines the outputs.
    rI   rJ   rK   NrL   rM   c              
      s|   t    || _tt| jj| jjf| _	t
|j|j|j d||jd| _t| j|j|j|j|j|d| dd| _d S )Nsilu)rK   ru   Tz.experts)shared_expertsnum_expertstop_kr   intermediate_sizerK   reduce_resultsrL   )rO   rP   rJ   rQ   r   r?   r   moe_num_expertsr   router_weightr,   r   moe_num_shared_expertsmlp_biasr   r   moe_topkexpertsrT   rV   rA   rB   rP     s,   

zAriaTextMoELayer.__init__rz   c                 C   s<   t jj|| j}| ||}| jdur|d |d  S |S )a  
        Forward pass of the MoE Layer.

        Args:
            hidden_states: Input tensor of shape
                (batch_size, sequence_length, hidden_size).

        Returns:
            torch.Tensor: Output tensor after passing through the MoE layer.
        Nr   r%   )r?   rQ   
functionallinearr   r   r   )rU   rz   router_outputsparse_expert_outputrA   rA   rB   r}   "  s
   
zAriaTextMoELayer.forwardr~   )r:   r;   r<   r=   r   r   rn   rP   r?   r@   r}   rp   rA   rA   rV   rB   r      s     r   c                       s0   e Zd ZdZd	dededdf fddZ  ZS )
AriaTextDecoderLayerz
    Custom Decoder Layer for the AriaMoE model which modifies the standard
    `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of
    Experts (MoE) Layer.
    rI   vllm_configrL   rM   Nc                    s6   t  || |jj}|j}t||| dd| _d S )Nz.mlprN   )rO   rP   model_config	hf_configrK   r   mlp)rU   r   rL   rJ   rK   rV   rA   rB   rP   ?  s   zAriaTextDecoderLayer.__init__r~   )r:   r;   r<   r=   r   rn   rP   rp   rA   rA   rV   rB   r   8  s    $r   c                       sn   e Zd ZdZg dddgdgdgdZdd	d
edef fddZdee	ee
jf  dee fddZ  ZS )AriaTextModelz
    Custom LlamaModel for the AriaMoE model which modifies the standard
    LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`.
    rE   	gate_projup_projexperts.fc1.weightexperts.fc2.weight)rD   gate_up_projexperts.w13_weightexperts.w2_weightrI   r   r   rL   c                   s   t  j||td d S )N)r   rL   
layer_type)rO   rP   r   )rU   r   rL   rV   rA   rB   rP   W  s   
zAriaTextModel.__init__rX   rM   c                 C   sb  g d}t |  }t }|D ]\}}d|v rqd|v s d|v r!q| jd urO| j| }rO|| }t|dt}	| dkr@|n|d }|	|| || q|D ].\}
}}||vr[qQ|	||
}|
drk||vrkqQt|| rqqQ|| }|j}	|	|||  n)|
dr||vrqt||}|d u rqt|| rq|| }t|dt}	|	|| || q|S )N))	.qkv_projz.q_projrY   )r   z.k_projrZ   )r   z.v_projr[   ).gate_up_projz
.gate_projr   )r   z.up_projr%   )r   r   r   )r   r   r   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedr\   r   z.bias)r]   r^   r_   rK   get_cache_scalera   r   r   rb   r`   endswithr0   r\   r   )rU   rX   rc   rd   re   rf   rg   
scale_namerk   r\   rh   ri   rj   rA   rA   rB   rl   ^  sT   







zAriaTextModel.load_weights)r:   r;   r<   r=   rm   r   rn   rP   r   ro   r?   r@   r_   rl   rp   rA   rA   rV   rB   r   J  s    ,r   c                   @   sR   e Zd Zdd Zdd ZdefddZdeee	d	B f fd
dZ
de	fddZd	S )AriaProcessingInfoc                 C   s   | j tS r{   )ctxget_hf_configr   rU   rA   rA   rB   r     s   z AriaProcessingInfo.get_hf_configc                 C   s
   |   jS r{   )r   r   r   rA   rA   rB   get_vision_config  s   
z$AriaProcessingInfo.get_vision_configkwargsc                 K   s   | j jtfi |S r{   )r   get_hf_processorr   )rU   r   rA   rA   rB   r     s   z#AriaProcessingInfo.get_hf_processorrM   Nc                 C   s   dd iS )NimagerA   r   rA   rA   rB   get_supported_mm_limits  s   z*AriaProcessingInfo.get_supported_mm_limitsc                 C   s   |   }t|j S r{   )r   maxr   values)rU   r   rA   rA   rB   get_num_image_tokens  s   z'AriaProcessingInfo.get_num_image_tokens)r:   r;   r<   r   r   objectr   r   rn   r   r   r   rA   rA   rA   rB   r     s    r   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )AriaDummyInputsBuilder	mm_countsrM   c                 C   s&   | dd}| j }|jj}|| S )Nr   r   )getinfor   	tokenizerimage_token)rU   r   
num_images	processorr   rA   rA   rB   get_dummy_text  s   
z%AriaDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sD   | j  }|j}|dd}|r|dnd }d| j||||diS )Nr   r   )widthheightr   	overrides)r   r   
image_sizer   _get_dummy_images)rU   r   r   r   r   max_image_sizer   image_overridesrA   rA   rB   get_dummy_mm_data  s   
z(AriaDummyInputsBuilder.get_dummy_mm_datar{   )
r:   r;   r<   r   rn   r   r   r   r   r   rA   rA   rA   rB   r     s    
r   c                	   @   sX   e Zd Zdedeeef deeef fddZde	deeef de
dee fdd	Zd
S )AriaMultiModalProcessor	hf_inputshf_processor_mm_kwargsrM   c                 C   s   t tdtddS )Nr   )r3   r9   )r]   r   batched)rU   r   r   rA   rA   rB   _get_mm_fields_config  s   z-AriaMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                 C   s2   | j  }|j}| j  }td|g|g| dgS )Nr   )modalitytargetreplacement)r   r   image_token_indexr   r    )rU   r   r   r  r   image_token_idnum_image_tokensrA   rA   rB   _get_prompt_updates  s   

z+AriaMultiModalProcessor._get_prompt_updatesN)r:   r;   r<   r	   r   rn   r   r   r   r   r   r   r!   r  rA   rA   rA   rB   r     s"    




r   )r   dummy_inputsc                       sB  e Zd ZdZedddddddd	id
ZededededB fddZ		d-de
def fddZdededB fddZdejdB dejdB fddZdedeejejf fddZdedefdd Z		d.d!ejdB d"ejd#edB d$ejdB dedejeB fd%d&Zd'ejdejdB fd(d)Zd*eeeejf  fd+d,Z  ZS )/AriaForConditionalGenerationz
    Aria model for conditional generation tasks.

    This model combines a vision tower, a multi-modal projector, and a language
    model to perform tasks that involve both image and text inputs.
    zlanguage_model.model.zvision_tower.zmulti_modal_projector.language_modellm_head)zmodel.language_model.zmodel.vision_tower.zmodel.multi_modal_projector.language_model.modelzlanguage_model.lm_headzrouter.weightr   )orig_to_new_prefixorig_to_new_suffixr  irM   Nc                 C   s   | drdS td)Nr   z#<|fim_prefix|><|img|><|fim_suffix|>z Only image modality is supported)
startswith
ValueError)clsr  r  rA   rA   rB   get_placeholder_str  s   
z0AriaForConditionalGeneration.get_placeholder_strrI   r   rL   c                    s   t    |jj}|j}|| _| |d t|j|| dd| _	t
|t|dd| _W d    n1 s6w   Y  | |6 t||jt|dd| _t|jj|jj|t|dd| _t|d	d
}t|jj|d| _W d    d S 1 syw   Y  d S )Nr   z.vision_towerrN   multi_modal_projectorr   r  )r   rL   r  logit_scaleg      ?)scale)rO   rP   r   r   rK   rJ   _mark_tower_modelrC   r   vision_towerr   r1   r  _mark_language_modelr   with_hf_configr   r  r   
vocab_sizer   r  ra   r   logits_processor)rU   r   rL   rJ   rK   r  rV   rA   rB   rP     s<   





"z%AriaForConditionalGeneration.__init__r   c                 K   s2   | dd }| dd }|d u rd S td||dS )Nr3   r9   )r4   r3   r9   )popr2   )rU   r   r3   r9   rA   rA   rB   _parse_and_validate_image_input;  s   z<AriaForConditionalGeneration._parse_and_validate_image_inputr9   c                 C   sT   |d u rd S |j d| jjj| jjjdj d| jjj| jjjd}|jdddk S )Nr%   )	dimensionr   stepr   )r   r   r   )unfoldr  rJ   
patch_sizesumbool)rU   r9   patches_subgridrA   rA   rB   _create_patch_attention_maskJ  s   	z9AriaForConditionalGeneration._create_patch_attention_maskimage_inputc                 C   sT   |d }|d }|  |}| j||d}d }|d ur$|d}t|}| ||S )Nr3   r9   )r3   patch_attention_maskr%   )r(  r  flattenr?   logical_notr  )rU   r)  r3   r9   r*  image_outputsimage_attn_maskflattened_maskrA   rA   rB   _process_image_input\  s   


z1AriaForConditionalGeneration._process_image_inputc                 K   s*   | j di |}|d u rg S | |}|S )NrA   )r  r0  )rU   r   r)  multimodal_embeddingsrA   rA   rB   embed_multimodalo  s
   
z-AriaForConditionalGeneration.embed_multimodal	input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s"   |d urd }| j ||||d}|S )N)r6  )r  )rU   r3  r4  r5  r6  r   rz   rA   rA   rB   r}   v  s   z$AriaForConditionalGeneration.forwardrz   c                 C   s   |  | j|}|S r{   )r  r  )rU   rz   logitsrA   rA   rB   compute_logits  s   z+AriaForConditionalGeneration.compute_logitsrX   c                 C   s   t | }|j|| jd d S )N)mapper)r.   rl   hf_to_vllm_mapper)rU   rX   loaderrA   rA   rB   rl     s   z)AriaForConditionalGeneration.load_weightsr~   )NN)r:   r;   r<   r=   r/   r:  classmethodrn   r   r  r   rP   r   r2   r  r?   r@   r(  ro   r0  r(   r2  r"   r}   r8  r   rl   rp   rA   rA   rV   rB   r
    sr    
	'




$r
  )Zcollections.abcr   r   r   typingr   r   r?   torch.nnrQ   transformersr   r   r	   &transformers.models.aria.modeling_ariar
   (transformers.models.aria.processing_ariar   vllm.configr   vllm.config.multimodalr   vllm.distributedr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.fused_moer   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   vllm.multimodal.processingr   r   r   r    r!   vllm.sequencer"   vllm.utils.tensor_schemar#   r$   idefics2_vision_modelr&   r'   Idefics3VisionTransformer
interfacesr(   r)   r*   llamar+   r,   r-   utilsr.   r/   r0   r1   r2   rC   Modulerq   r   r   r   r   r   r   r   r   register_processorr
  rA   rA   rA   rB   <module>   sZ   -D ?V 