o
    
۾i                  	   @   s  d Z ddlZddlmZmZ ddlmZmZ ddlZ	ddl
Z
ddlm  mZ ddl
mZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZEmFZFmGZG ddddddZHG dd  d e:ZIG d!d" d"e0ZJG d#d$ d$e/eJ ZKG d%d& d&e.eJ ZLG d'd( d(ejMZNG d)d* d*ejMZOG d+d, d,ejMZPG d-d. d.ejMZQG d/d0 d0ejMZRG d1d2 d2ejMZSG d3d4 d4ejMZTe$jUeKeJeLd5G d6d7 d7ejMeAeBe@eCZVdS )8z(Inference-only IBM Granite speech model.    N)IterableMapping)	AnnotatedLiteral)nn)BatchFeaturePretrainedConfig)CacheConfigModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)
PromptTypeTokensPrompt)ColumnParallelLinearRowParallelLinear)QuantizationConfig)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)AudioProcessorItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)cached_processor_from_config)TensorSchemaTensorShape   )Blip2QFormerModel)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsTranscription)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefixEnglishFrenchGerman
PortugueseSpanish)enfrdeptesc                   @   s`   e Zd ZU dZeejedddf ed< 	 eejeddf ed< 	 ee	e
 edf ed< d	S )
GraniteSpeechAudioInputsa(  
    Audio input features for Granite Speech model.

    Dimensions:
        - b: Batch size
        - fi: Number of input features from the Mel spectrogram.
        - fo: Number of output features, i.e. the embedding size.
        - 160: Fixed feature dimension for Mel spectrogram features
    bfi   input_featuresfoinput_features_maskaudio_embed_sizesN)__name__
__module____qualname____doc__r   torchTensorr$   __annotations__listint rJ   rJ   ]/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/granite_speech.pyr9   Z   s   
 
r9   c                   @   s>   e Zd Zdd ZdeeedB f fddZdd Zd	d
 Z	dS )%GraniteSpeechMultiModalProcessingInfoc                 C   s    |   j}t|jd |  dS )Nsample_rate)	target_srexpected_hidden_size)get_hf_processoraudio_processorr   melspec_kwargs_get_expected_hidden_size)selffeature_extractorrJ   rJ   rK   get_data_parserp   s
   
z5GraniteSpeechMultiModalProcessingInfo.get_data_parserreturnNc                 C   s   ddiS )Naudior%   rJ   rT   rJ   rJ   rK   get_supported_mm_limitsx   s   z=GraniteSpeechMultiModalProcessingInfo.get_supported_mm_limitsc                 C      dS )Ni  rJ   rY   rJ   rJ   rK   get_max_audio_tokens      z:GraniteSpeechMultiModalProcessingInfo.get_max_audio_tokensc                 C   r[   )Ni z rJ   rY   rJ   rJ   rK   get_max_audio_len   r]   z7GraniteSpeechMultiModalProcessingInfo.get_max_audio_len)
rA   rB   rC   rV   r   strrI   rZ   r\   r^   rJ   rJ   rJ   rK   rL   o   s
    rL   c                
       s   e Zd Zdedeeef deeef fddZde	deeef de
dee fdd	Zd
edeeef deeef deeef def
 fddZ  ZS ) GraniteSpeechMultiModalProcessor	hf_inputshf_processor_mm_kwargsrW   c                 C   s   t tdtddS )NrX   )r=   r@   )dictr   batched)rT   ra   rb   rJ   rJ   rK   _get_mm_fields_config   s   z6GraniteSpeechMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc           	         sf   | j jdi |}| j  }|j| }t|dd}||  dtf fdd}td g|dgS )	Naudio_token	<|audio|>item_idxc                    s:    dt}|| }|jd }|gd } g| S )NrX   r   )	get_itemsr   getshape_get_num_audio_features)rj   audiosrX   audio_lengthnum_projector_featuresaudio_token_idrU   rf   rJ   rK   get_replacement   s   


zMGraniteSpeechMultiModalProcessor._get_prompt_updates.<locals>.get_replacementrX   )modalitytargetreplacementrJ   )inforP   get_tokenizerrQ   	get_vocabgetattrrI   r   )	rT   rf   rb   rg   	processor	tokenizervocabrh   ru   rJ   rs   rK   _get_prompt_updates   s   

z4GraniteSpeechMultiModalProcessor._get_prompt_updatespromptmm_data	mm_kwargs
tok_kwargsc                    sb   t |}|dg }|r||d< t j||||d}d|v r/| j j}|d |kd|d< |S )Nrp   rX   )r   r   r   r   	input_idsrk   r@   )rc   popsuper_call_hf_processorry   get_hf_configaudio_token_indexsum)rT   r   r   r   r   rp   processed_outputsr   	__class__rJ   rK   r      s    
z3GraniteSpeechMultiModalProcessor._call_hf_processor)rA   rB   rC   r   r   r_   objectr   re   r   r   rH   r   r   r   __classcell__rJ   rJ   r   rK   r`      s8    





 


r`   c                	   @   sX   e Zd Z	d
dedeeef deeef dB defddZdeeef defdd	Z	dS )GraniteSpeechDummyInputsBuilderNseq_len	mm_counts
mm_optionsrW   c                 C   s8   | dd}|r| dnd }d| j| j ||diS )NrX   r   )length
num_audios	overrides)rm   _get_dummy_audiosry   r^   )rT   r   r   r   r   audio_overridesrJ   rJ   rK   get_dummy_mm_data   s   z1GraniteSpeechDummyInputsBuilder.get_dummy_mm_datac                 C   s*   | dd}| j }t|dd}|| S )NrX   r   rh   ri   )rm   ry   rP   r|   )rT   r   r   hf_processorrh   rJ   rJ   rK   get_dummy_text   s   
z.GraniteSpeechDummyInputsBuilder.get_dummy_textN)
rA   rB   rC   rI   r   r_   r   r   r   r   rJ   rJ   rJ   rK   r      s    

r   c                	       sN   e Zd Z		ddedededB def fddZd	ej	d
ej	fddZ
  ZS )GraniteSpeechEncoderProjectorN configcache_configquant_configprefixc                    s   t    |jj| _|j| _|j| _|j|j | _tt	
d| j|jj| _t|j||| dd| _t|jj|jj| _d S )Nr%   z.qformer)r   r   r   )r   __init__projector_confighidden_sizedownsample_ratewindow_sizenum_queriesr   	ParameterrE   zerosqueryr&   qformerLineartext_configlinear)rT   r   r   r   r   r   rJ   rK   r      s"   


z&GraniteSpeechEncoderProjector.__init__hidden_statesrW   c           	      C   s   |  \}}}t|| j }|| j | }tj|ddd|fdd}||| | j|}| j| j	j
|d}| |||| j | j d}|S )Nr   constant)query_embedsencoder_hidden_statesrk   )sizemathceilr   r   
functionalpadviewr   r   datar   r   )	rT   r   
batch_sizer   dimnblocksr   last_hidden_state
query_projrJ   rJ   rK   forward  s"   z%GraniteSpeechEncoderProjector.forwardNr   )rA   rB   rC   r   r	   r   r_   r   rE   rF   r   r   rJ   rJ   r   rK   r      s    r   c                       sN   e Zd ZdZ		ddededB def fddZd	ej	d
ej	fddZ
  ZS )!GraniteSpeechConformerFeedForwardz0Feedforward module for conformer encoder blocks.Nr   r   r   r   c                    sj   t    t|j| _t|j|j|j || dd| _t	 | _
t|j|j |j|| dd| _d S )Nz.up_proj)
input_sizeoutput_sizer   r   z
.down_proj)r   r   r   	LayerNorm
hidden_dimpre_normr   feedforward_multup_projSiLUsilur   	down_proj)rT   r   r   r   r   rJ   rK   r   +  s   



z*GraniteSpeechConformerFeedForward.__init__r   rW   c                 C   s4   |  |}| |\}}| |}| |\}}|S r   )r   r   r   r   )rT   r   _rJ   rJ   rK   r   C  s
   

z)GraniteSpeechConformerFeedForward.forwardr   )rA   rB   rC   rD   r   r   r_   r   rE   rF   r   r   rJ   rJ   r   rK   r   (  s    r   c                       H   e Zd ZdZddedef fddZdejdejd	ejfd
dZ	  Z
S )GraniteSpeechConformerAttentionzAttention for conformer blocks using Shaw's relative positional
    embeddings. See the following [paper](https://arxiv.org/pdf/1803.02155)
    for more details.
    r   r   r   c                    s   t    |j|j }|j| _|j| _|j| _|j| _| jd | _t|j	| _
tj|j	|dd| _tj|j	|d dd| _t||j	| _td| j d | j| _| jdks^| j| jkrktd| j d| j d	d S )
Ng      Fbias   r%   r   z/Context size should be > 0 and <= max_pos_emb (z), got .)r   r   dim_head	num_headsmax_pos_embcontext_sizescaler   r   r   r   r   to_qto_kvto_out	Embeddingrel_pos_emb
ValueErrorrT   r   r   	inner_dimr   rJ   rK   r   Q  s*   
z(GraniteSpeechConformerAttention.__init__r   attention_distsrW   c                 C   s  |  |}|j\}}}t|| j }|| j }|dkr+tjj|ddd| j| f}| 	|}| 
|jddd\}	}
|||| j| jddd}|	||| j| jddd}	|
||| j| jddd}
||j}| |}|g dt|j }tj|d| dd| j }|dkrtj| j| jt|jd}d|d |d |f< t|jj }|d d dd d f || tjjtjjjj  t!j"||	|
|| jd	}W d    n1 sw   Y  |dd||jd
 d}| #|d d d |d d f S )Nr   r   rk   r      )r%   r%   r%   )dtypedevice)	attn_maskr   r%   )$r   rn   r   r   r   rE   r   r   r   r   r   chunkreshaper   	transposetor   r   r   rH   r   	unsqueezer   onesboolfinfor   maxmasked_fill_	attentionsdpa_kernel
SDPBackendMATHFscaled_dot_product_attentionr   )rT   r   r   bsznum_featuresr   
num_blocks	remainderquery_states
key_statesvalue_statesdistr   rel_pos_emb_expandedpos_attnmask
mask_valueoutrJ   rJ   rK   r   g  sj   



 z'GraniteSpeechConformerAttention.forwardr   rA   rB   rC   rD   r   r_   r   rE   rF   r   r   rJ   rJ   r   rK   r   K  s    r   c                	       sJ   e Zd ZdZddedededef fddZd	ejd
ejfddZ	  Z
S )%GraniteSpeechConformerDepthWiseConv1dz,Wrapper for padded 1D pointwise convolution.r   chan_inchan_outkernel_sizer   c                    sF   t    |d }|d d }||| f| _tj||||dd| _d S )Nr   r%   F)groupsr   )r   r   paddingr   Conv1dconv)rT   r	  r
  r  r   r   
pad_offsetr   rJ   rK   r     s   

z.GraniteSpeechConformerDepthWiseConv1d.__init__r   rW   c                 C   s   t || j}| |S r   )r   r   r  r  rT   r   rJ   rJ   rK   r     s   
z-GraniteSpeechConformerDepthWiseConv1d.forwardr  )rA   rB   rC   rD   rI   r_   r   rE   rF   r   r   rJ   rJ   r   rK   r    s     r  c                       sB   e Zd ZdZddedef fddZdejdejfd	d
Z	  Z
S ) GraniteSpeechConformerConvModulezZConformer conv module consisting of several 1D/depthwise 1D
    convolutional layers.
    r   r   r   c                    s   t    |j|j }t|j| _t|j|d d| _tj	dd| _
t|||j| dd| _t | _t|| _t||jd| _d S )Nr   r%   r   z.depth_conv)r  r   )r   r   r   conv_expansion_factorr   r   normr  up_convGLUglur  conv_kernel_size
depth_convr   r   BatchNorm1d
batch_norm	down_convr   r   rJ   rK   r     s   

z)GraniteSpeechConformerConvModule.__init__r   rW   c                 C   sZ   |  |}| |ddd}| |}| |}| | |}| |ddd}|S )Nr   r   r%   )r  r  permuter  r  r   r  r  r  rJ   rJ   rK   r     s   


z(GraniteSpeechConformerConvModule.forwardr  r  rJ   rJ   r   rK   r    s    r  c                       r   )GraniteSpeechConformerBlockz^Conformer block, consisting largely of linear layers,
    attention, and convolutional layers.r   r   r   c                    sl   t    t|| dd| _t|| dd| _t|| dd| _t|| dd| _t	
|j| _d S )Nz.ff1r   z.attnz.convz.ff2)r   r   r   ff1r   attnr  r  ff2r   r   r   	post_norm)rT   r   r   r   rJ   rK   r     s   
z$GraniteSpeechConformerBlock.__init__r   r   rW   c                 C   sR   d|  | | }| j||d| }| || }d| | | }| |}|S )Ng      ?r   )r   r!  r  r"  r#  )rT   r   r   rJ   rJ   rK   r     s   
z#GraniteSpeechConformerBlock.forwardr  r  rJ   rJ   r   rK   r    s    r  c                       sF   e Zd ZdZ	ddedededB f fddZdej	fd	d
Z
  ZS )GraniteSpeechCTCEncoderzECTC Encoder comprising conformer blocks and additional linear layers.Nr   r   r   c                    s   t     | _t j}|dd|dd }t| j  j j | _	t
j j jdd| _t
 fddt jD | _t j jd| dd| _t j jd| d	d| _t
jdd
| _ j| _d S )Nrk   r%   Tr   c                    s"   g | ]}t   d | dqS )z.layers.r  )r  ).0idxr   r   rJ   rK   
<listcomp>  s    z4GraniteSpeechCTCEncoder.__init__.<locals>.<listcomp>z.out)r   r   r   r   r   z.out_midr   )r   r   r   rE   aranger   r   clampr   r   r   r   	input_dimr   input_linear
ModuleListrange
num_layerslayersr   
output_dimr  r   out_midSoftmaxsoftmax)rT   r   r   r   seqrelpos_distr   r(  rK   r     s<   

z GraniteSpeechCTCEncoder.__init__r   c                 C   sx   |  |}t| jddD ]-\}}||| jd}|| jd kr9| }| |\}}| |}| |\}}||7 }q|S )Nr%   )startr$  r   )	r-  	enumerater1  r   r0  cloner  r5  r3  )rT   r   r'  layerhidden_states_midr   rJ   rJ   rK   r   "  s   

zGraniteSpeechCTCEncoder.forwardr   )rA   rB   rC   rD   r   r_   r   r   rE   rF   r   r   rJ   rJ   r   rK   r%    s    .r%  )ry   dummy_inputsc                       s  e Zd ZeZg dddgdZededededB fd	d
Z	ddde
def fddZdededB fddZdejdejfddZdeej dejfddZdedeej fddZdedefddZ	dCddd d!ejd"edB d#ejdB d$edejf
 fd%d&Z		dDd!ejdB d'ejd(edB d)ejdB dedejeB fd*d+Zd,ejdejdB fd-d.Zd/eeeejf  dee fd0d1Zde fd2d3Z!ed4e"j#d5e$d6e%d7edB d8e&d9 d:ed;edB de'fd<d=Z(ed>e)d6e%d5e$dedB fd?d@Z*ed5e$d8ede%fdAdBZ+  Z,S )E%GraniteSpeechForConditionalGeneration)q_projk_projv_proj	gate_projr   )qkv_projgate_up_projrv   irW   Nc                 C   s   | drdS td)NrX   ri   z Only audio modality is supported)
startswithr   )clsrv   rE  rJ   rJ   rK   get_placeholder_strJ  s   
z9GraniteSpeechForConditionalGeneration.get_placeholder_strr   r  vllm_configr   c                   s   t    |jj}|j}|j}|| _|| _|| _| | t||j	t
|dd| _W d    n1 s4w   Y  | |d  t|j|| dd| _t|||| dd| _W d    n1 sbw   Y  | jj| _d S )Nlanguage_model)rI  	hf_configr   rX   z.encoder)r   r   r   z
.projector)r   r   r   r   )r   r   model_configrK  r   r   r   _mark_language_modelr-   r   r.   rJ  _mark_tower_modelr%  encoder_configencoderr   	projectormake_empty_intermediate_tensors)rT   rI  r   r   r   r   r   rJ   rK   r   Q  s:   


z.GraniteSpeechForConditionalGeneration.__init__kwargsc                 K   s  | dd }| dd }| dd }|d u rd S |d u r!| |}t|tjtfs2tdt| |d urEt|tjsEtdt| t|tjrpt|j	dkrW|
d}t|j	dkrftd	|j	 || jjjj}nd
d |D }| || jjjj}t|||  dS )Nr=   r?   r@   z2Incorrect type of audio input features. Got type: z7Incorrect type of audio input features mask. Got type:    r%   r   z6Squeezed input features should be 3D but are of shape c                 S   s"   g | ]}|j d kr|jddqS )r   r   r   )ndimr   )r&  featrJ   rJ   rK   r)    s    zYGraniteSpeechForConditionalGeneration._parse_and_validate_audio_input.<locals>.<listcomp>)r=   r?   r@   )r   _build_input_features_mask
isinstancerE   rF   rH   r   typelenrn   squeezer   rP  r-  weightr   _pad_and_stack_input_featuresr9   flattentolist)rT   rS  r=   r?   r@   rJ   rJ   rK   _parse_and_validate_audio_inputw  sX   



zEGraniteSpeechForConditionalGeneration._parse_and_validate_audio_inputr@   c                 C   s:   t | }t j||jddd}||ddk }|S )a  Calculate the input features mask, which will generally be used
        to mask the padded features for all entries in the batch except
        for those with the most audio features.

        Args:
            audio_embed_sizes: torch.Tensor
                Tensor of num features in each seq in the batch.
        Returns:
            torch.Tensor: Mask of shape (bsz, num_features) to be applied to
            the audio features prior to splitting the audio embeddings.
        )r   r%   rk   )rE   r   itemr*  r   r   )rT   r@   most_audio_featuresmask_indicesr?   rJ   rJ   rK   rW    s   z@GraniteSpeechForConditionalGeneration._build_input_features_maskr=   c                    sP   dd |D   fdd D }dd t ||D }tj|dd|d }|S )a  Given a list of input features of varying length, pad them to the
        same length and stack them into a torch.Tensor.

        NOTE: Usually, padding is done in the input processor/feature extractor
        and zero padded prior to the computation of the Mel features; the
        resulting values are only constant within a batch and generally nonzero
        (i.e., slightly negative nums); we should validate that this is okay
        since we don't use a feature attention mask, but the more important
        thing is that we apply the input_features_mask with variable len
        batches.

        Args:
            input_features: list[torch.Tensor]
                3D Input features to be coerced into a tensor.
        Returns:
            torch.Tensor: Tensor of shape [bsz, num_features, 160], where
            num_features is the max number of features of any entry in the
            batch.
        c                 S   s   g | ]}|j d  qS )r%   )rn   )r&  featsrJ   rJ   rK   r)    s    zWGraniteSpeechForConditionalGeneration._pad_and_stack_input_features.<locals>.<listcomp>c                    s   g | ]}t  | qS rJ   )r   )r&  r   	feat_lensrJ   rK   r)    s    c                 S   s,   g | ]\}}t jj|d d d |d d fqS )r   )rE   r   r   r   )r&  rd  r   rJ   rJ   rK   r)    s    r   r   )ziprE   catr   )rT   r=   r  paddedstacked_featuresrJ   re  rK   r]    s   zCGraniteSpeechForConditionalGeneration._pad_and_stack_input_featuresaudio_inputc                 C   s4   |  |d }| |}||d  }t||d S )an  Compute the audio features to be merged into the LLM embeddings.

        Args:
            audio_input: GraniteSpeechAudioInputs
                Audio inputs object containing Mel features, an input features
                mask, and the (flattened) number of audio tokens per instance.
        Returns:
            tuple[torch.Tensor]: List of length bsz.
        r=   r?   r@   )rP  rQ  rE   split)rT   rk  encoder_embedsprojected_embedsmasked_embedsrJ   rJ   rK   _process_audio_input  s   
z:GraniteSpeechForConditionalGeneration._process_audio_inputc                 K   s*   | j di |}|du rg S | |}|S )z9Compute the audio embeddings if audio inputs are present.NrJ   )r`  rp  )rT   rS  rk  audio_featuresrJ   rJ   rK   embed_multimodal
  s
   
z6GraniteSpeechForConditionalGeneration.embed_multimodalT)is_multimodalhandle_oov_mm_tokenr   multimodal_embeddingsrs  rt  c                   s0   |d u s|d u rt  |S t  j||||dS )N)ru  rs  rt  )r   embed_input_ids)rT   r   ru  rs  rt  r   rJ   rK   rv    s   
z5GraniteSpeechForConditionalGeneration.embed_input_ids	positionsintermediate_tensorsinputs_embedsc                 K   s    |d urd }|  ||||}|S r   )rJ  )rT   r   rw  rx  ry  rS  model_outputrJ   rJ   rK   r   *  s   z-GraniteSpeechForConditionalGeneration.forwardr   c                 C   s   | j |S r   )rJ  compute_logitsr  rJ   rJ   rK   r{  :  s   z4GraniteSpeechForConditionalGeneration.compute_logitsweightsc                 C   s   t | }||S r   )r,   load_weights)rT   r|  loaderrJ   rJ   rK   r}  @  s   
z2GraniteSpeechForConditionalGeneration.load_weightsc                 C   s   t jddddS )z+Get the module prefix in multimodal models.rJ  rQ  rP  )rJ  	connectortower_model)r   from_string_fieldrY   rJ   rJ   rK   get_mm_mappingG  s
   z4GraniteSpeechForConditionalGeneration.get_mm_mappingrX   rL  
stt_configlanguage	task_type)
transcribe	translaterequest_promptto_languagec                 C   s   |  dd}|dkr| j||}	| d|	 }
n|dkr#| d}
ntd| t|}td|
d	g}|j|d
dd}||}t|d|idS )z@Get the generation prompt to be used for transcription requests.rX   r   r  ztranslate the speech to r  z4can you transcribe the speech into a written format?zUnsupported task type user)rolecontentFT)tokenizeadd_generation_prompt)prompt_token_idsmulti_modal_data)	rH  supported_languagesrm   r   r!   rc   apply_chat_templateencoder   )rG  rX   rL  r  r  r  r  r  	audio_tokfull_lang_name_touser_promptr~   chatr   r  rJ   rJ   rK   get_generation_promptP  s(   
z;GraniteSpeechForConditionalGeneration.get_generation_promptaudio_duration_sc                 C   s`   t |}|jjd }|jj}|jj}|| }||j }	|	| d }
|
d }t|| }|| S )z<Get the number of audio tokens for an audio duration in sec.
hop_lengthr%   r   )r"   rQ   rR   projector_window_sizeprojector_downsample_raterM   r   r   )rG  r  r  rL  r}   r  proj_win_sizeds_rateeffective_window_size
raw_length
mel_lengthencoder_lengthr   rJ   rJ   rK   get_num_audio_tokensy  s   
z:GraniteSpeechForConditionalGeneration.get_num_audio_tokensc                 C   s   t  S )z"Get the stt config for this model.)r   )rG  rL  r  rJ   rJ   rK   get_speech_to_text_config  s   z?GraniteSpeechForConditionalGeneration.get_speech_to_text_configr   )NN)-rA   rB   rC   ISO639_1_SUPPORTED_LANGSr  packed_modules_mappingclassmethodr_   rI   rH  r   r   r   r9   r`  rE   rF   rW  rH   r]  tuplerp  r'   rr  r   rv  r    r   r{  r   setr}  r   r  npndarrayr
   r   r   r   r  floatr  r  r   rJ   rJ   r   rK   r>  0  s    &
C

#




		(r>  )WrD   r   collections.abcr   r   typingr   r   numpyr  rE   torch.nn.functionalr   r   r   transformersr   r   vllm.configr	   r
   r   r   vllm.config.multimodalr   vllm.inputs.datar   r   !vllm.model_executor.layers.linearr   r   'vllm.model_executor.layers.quantizationr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   r   vllm.multimodal.processingr   r   r   r   r   vllm.sequencer    vllm.tokenizersr!   !vllm.transformers_utils.processorr"   vllm.utils.tensor_schemar#   r$   blip2r&   
interfacesr'   r(   r)   r*   r+   utilsr,   r-   r.   r  r9   rL   r`   r   Moduler   r   r   r  r  r  r%  register_processorr>  rJ   rJ   rJ   rK   <module>   sv   


M8#Z ?

