o
    
۾iH                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlm  m	Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* ddl+m,Z, eeddZ-		d-dej.de/e0e0f de1de2dej.f
ddZ3G dd  d ej4Z5ej6d!e d"e0de7e  fd#d$Z8G d%d& d&eZ9G d'd( d(ej:Z;G d)d* d*ej:Z<G d+d, d,ej:Z=dS ).    N)replace)partial)nn)CacheConfig
VllmConfig)$get_tensor_model_parallel_world_size)	Attention)RMSNorm)QKVParallelLinearRowParallelLinear)QuantizationConfig)get_rope)
MistralMLP)WhisperPosEmbedType)AttentionBackendAttentionMetadataAttentionTypeCommonAttentionMetadata)subclass_attention_backend_with_overrides)FlashAttentionBackend)get_attn_backend)AttentionSpec   )make_layersgh㈵>)epsconstant        xpaddingsmodevaluereturnc                 C   s   | j d }|\}}|dkr|dksJ ||f|dkrKt||}d}||kr4|| d }t| d|f} t| |||}	|	j d | }
|	dd|
f S t| |||S )zTiny wrapper around F.pad, just to allow for
    reflect padding on small input.
    If this is the case, we insert extra 0 padding
    to the right before the reflection happen.
    r   reflectr   .N)shapemaxFpad)r   r   r   r    lengthpadding_leftpadding_rightmax_pad	extra_padpaddedend r/   ]/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/whisper_causal.py_pad1d)   s   

r1   c                       s\   e Zd Z			ddededededed	ed
df fddZdejd
ejf fddZ  Z	S )WhisperCausalConv1dr   r   Tin_channelsout_channelskernel_sizestridepaddingbiasr!   Nc                    sN   t  j||||||d | jd | _|d | jd  d | _| j| j | _d S )N)r6   r7   r8   r   r   )super__init__r6   _stridedilation_effective_kernel_size_padding_total)selfr3   r4   r5   r6   r7   r8   	__class__r/   r0   r:   E   s   	zWhisperCausalConv1d.__init__r   c                    sn   |j d | j | j | j d }t|d | j | j| j  }||j d  }t|| j|fdd}t |S )Nr"   r   r   )r   )	r$   r=   r>   r;   mathceilr1   r9   forward)r?   r   n_framestarget_lengthextra_paddingr@   r/   r0   rD   Z   s   
zWhisperCausalConv1d.forward)r   r   T)
__name__
__module____qualname__intboolr:   torchTensorrD   __classcell__r/   r/   r@   r0   r2   D   s(    "r2   underlying_attn_backendblock_pool_sizec                    s   d}  } }G fddd| G fddd|tts*t dt| fddfd	dfd
dddd}|S )N'WhisperCausalAttentionWithBlockPooling_c                	       sZ   e Zd Zdedee dedejf fddZ		dde
d	ed
edef fddZ  ZS )zjcreate_whisper_attention_backend_with_block_pooling.<locals>.WhisperCausalAttentionWithBlockPoolingBuilderkv_cache_speclayer_namesvllm_configdevicec                    sB   |j  dks	J t||j |j  d}t |||| d S )Nr   )
block_sizenum_kv_heads)rX   r   rW   r9   r:   )r?   rS   rT   rU   rV   rA   rQ   r/   r0   r:   o   s   zscreate_whisper_attention_backend_with_block_pooling.<locals>.WhisperCausalAttentionWithBlockPoolingBuilder.__init__Fcommon_prefix_lencommon_attn_metadata
fast_buildr!   c                    s   t |}| j9  _| j9  _| j9  _| j9  _| j9  _| j9  _| j9  _| j	9  _	|j
}|9 }|d tj|jd  jdd|_
t |||S )Nr   )rV   r"   )min)copydeepcopyquery_start_locquery_start_loc_cpuseq_lens_seq_lens_cpu_num_computed_tokens_cpunum_actual_tokensmax_query_lenmax_seq_lenslot_mapping	unsqueezerM   arangerV   flattenclampr9   build)r?   rZ   r[   r\   new_common_attn_metadataoriginal_slot_mappingrY   r/   r0   rm   ~   s*   
zpcreate_whisper_attention_backend_with_block_pooling.<locals>.WhisperCausalAttentionWithBlockPoolingBuilder.build)F)rH   rI   rJ   r   liststrr   rM   rV   r:   rK   r   rL   r   rm   rO   r/   rQ   r@   r0   -WhisperCausalAttentionWithBlockPoolingBuildern   s(    rs   c                       sp   e Zd Z			ddejjdejdejdejdejdedejdB d	ejdB d
ejdB dejf fddZ  Z	S )zgcreate_whisper_attention_backend_with_block_pooling.<locals>.WhisperCausalAttentionWithBlockPoolingImplNlayerquerykeyr    kv_cacheattn_metadataoutputoutput_scaleoutput_block_scaler!   c
           
         sX   j s|d ur|jd u r|d ur|d ur| |||||j t |||||||||		S N) forward_includes_kv_cache_updatekv_sharing_target_layer_namedo_kv_cache_updaterh   r9   rD   )
r?   rt   ru   rv   r    rw   rx   ry   rz   r{   )rA   rP   r/   r0   rD      s(   
zocreate_whisper_attention_backend_with_block_pooling.<locals>.WhisperCausalAttentionWithBlockPoolingImpl.forward)NNN)
rH   rI   rJ   rM   r   ModulerN   r   rD   rO   r/   )rP   r@   r0   *WhisperCausalAttentionWithBlockPoolingImpl   s2    		
r   zR is not yet supported.Contributions to support more backends are much appreciated.c                          S r|   r/   r/   )rs   r/   r0   <lambda>       zEcreate_whisper_attention_backend_with_block_pooling.<locals>.<lambda>c                      r   r|   r/   r/   )r   r/   r0   r      r   c                    s   d| |  |  |fS )N   r/   )
num_blocksrW   rX   	head_sizecache_dtype_strrr   r/   r0   r      s   T)get_builder_clsget_impl_clsget_kv_cache_shaper}   )name_prefixattention_backend_cls	overrides)r   r   
issubclassr   NotImplementedErrorr   )rP   rQ   prefixunderlying_builderunderlying_implattn_backendr/   )rs   r   rQ   rP   r0   3create_whisper_attention_backend_with_block_poolingf   s&   1
$


r   c                       s   e Zd ZdZdddddddejdddfdededededB d	ee dB d
e	dB de
dB dedB dedB dedededB dedee dB ddf fddZdef fddZ  ZS )&WhisperCausalAttentionWithBlockPoolingz#Attention layer with block pooling.N r   	num_headsr   scalerX   alibi_slopescache_configquant_configlogits_soft_capper_layer_sliding_windowr   	attn_typer~   rQ   r   r!   c                    sz   || _ t }|d ur|j}|j}nd}d}t|||||d}t||}t jd|||||||||	|
|||d| d S )Nauto   )r   )r   r   r   rX   r   r   r   r   r   r   r   r~   r   r/   )	rQ   rM   get_default_dtypecache_dtyperW   r   r   r9   r:   )r?   r   r   r   rX   r   r   r   r   r   r   r   r~   rQ   r   extra_impl_argsdtypekv_cache_dtyperW   rP   r@   r/   r0   r:      sD   
z/WhisperCausalAttentionWithBlockPooling.__init__rU   c                    s2   t  |}t|tsJ t|| j|j d}|S )N)rX   )r9   get_kv_cache_spec
isinstancer   r   rQ   rX   )r?   rU   rS   r@   r/   r0   r     s   
z8WhisperCausalAttentionWithBlockPooling.get_kv_cache_spec)rH   rI   rJ   __doc__r   DECODERrK   floatrp   r   r   rq   typer   r:   r   r   rO   r/   r/   r@   r0   r      sZ    
	

8r   c                       s   e Zd Zdejdddddfdedededed	ed
ededB dededB dedB de	f fddZ
deddfddZ			dded	ededB de	ddf
ddZ	ddejdejdB fddZ  ZS )WhisperCausalAttentionTNr   r   	embed_dimr   head_dimmax_position_embeddingsr8   r   r   rQ   r   r   r   c                    sH  t    || _t }|| _| j| dksJ | j| | _| j|kr,| j| dks+J n	|| j dks5J td| j| | _|| _| j| j | _	| j| j | _
|| _| jd | _| j|||
|d t| j| j |||
| dd| _|dks~J d| dt| j| j| j| j|	|
| d	tj||d

| _|d usJ d| | d S )Nr   r   g      ࿩r   z	.out_proj)
input_sizeoutput_sizer8   r   r   z6Causal attention only supports block_pool_size>1, not .z.attn)rX   r   r   r   r   r   rQ   z7rope can only used in combination with a sliding window)r9   r:   r   r   total_num_headsr   r%   rX   r   q_sizekv_sizer   scaling	_init_qkvr   out_projr   r   r   attn_init_rotary_emb)r?   r   r   r   r   r8   r   r   rQ   r   r   r   tp_sizer@   r/   r0   r:   )  sT   





zWhisperCausalAttention.__init__r!   c                 C   s   t | j|dddid| _d S )NF
rope_thetag    .A)max_positionis_neox_stylerope_parameters)r   r   
rotary_emb)r?   r   r/   r/   r0   r   j  s   z'WhisperCausalAttention._init_rotary_embc              	   C   s(   t || j| j| j||| dd| _d S )Nz	.qkv_proj)hidden_sizer   r   total_num_kv_headsr8   r   r   )r
   r   r   qkv_proj)r?   r   r8   r   r   r/   r/   r0   r   r  s   z WhisperCausalAttention._init_qkvhidden_states	positionsc           
      C   sl   |  |\}}|j| j| j| jgdd\}}}|d usJ | |||\}}| |||}| |\}	}|	S )Nr"   )dim)r   splitr   r   r   r   r   )
r?   r   r   qkv_qkvattn_outputry   r/   r/   r0   rD     s    zWhisperCausalAttention.forward)TNr   r|   )rH   rI   rJ   r   r   rK   rL   r   r   rq   r:   r   r   rM   rN   rD   rO   r/   r/   r@   r0   r   (  sh    	
A
r   c                       sJ   e Zd Zdddedef fddZ	ddejd	ejdB fd
dZ  Z	S )WhisperCausalEncoderLayerr   r   rU   r   c                   s   t    |jj}t|dd }|j}|dksJ |j}|j}|j| _	| j	|j
 | _t| j	|j
|j|j||||| dd	| _t| j	| _t|j|jd|dd| dd	| _t| j	| _d S )
Nsliding_windowr   z
.self_attn)	r   r   r   r   rQ   r   r   r   r   siluTFz.mlp)r   intermediate_size
hidden_actr   r8   gate_up_proj_biasr   )r9   r:   model_config	hf_configgetattrrQ   r   r   d_modelr   encoder_attention_headsr   r   encoder_head_dimr   	self_attnCausalRMSNormself_attn_layer_normr   encoder_ffn_dimmlpfinal_layer_norm)r?   rU   r   configr   rQ   r   r   r@   r/   r0   r:     s>   
	z"WhisperCausalEncoderLayer.__init__Nr   r   c                 C   sH   |}|  |}| j||d}|| }|}| |}| |}|| }|S )N)r   r   )r   r   r   r   )r?   r   r   residualr/   r/   r0   rD     s   


z!WhisperCausalEncoderLayer.forwardr|   )
rH   rI   rJ   r   rq   r:   rM   rN   rD   rO   r/   r/   r@   r0   r     s    'r   c                       sh   e Zd Zdddedef fddZdejeej B dejfd	d
Z	dejdejdejfddZ
  ZS )WhisperCausalEncoderr   r   rU   r   c                   s   t     jj}|j}t|jtjksJ |jsJ |j	| _	|j
| _
|jr+t|nd| _t| j	|dd| _t||ddd| _| jjd | jjd  | _t|j fdd| d	d
\| _| _| _t|j| _d S )Ng      ?   )r5   r   )r6   r5   r   c                    s   t  |  ddS )N.layers)rU   r   )r   r   rU   r/   r0   r     s    
z/WhisperCausalEncoder.__init__.<locals>.<lambda>r   r   )r9   r:   r   r   r   r   	pos_embedROPE	is_causalnum_mel_binsmax_source_positionsscale_embeddingrB   sqrtembed_scaler2   conv1conv2r6   total_strider   encoder_layersstart_layer	end_layerlayersr   
layer_norm)r?   rU   r   r   r   r@   r   r0   r:     s"   


zWhisperCausalEncoder.__init__input_featuresr!   c                 C   s^   g }|D ]#}t j| |}t j| |}|dd|j}|| qt	
|}|S )Nr"   )r   
functionalgelur   r   	transposetor   appendrM   cat)r?   r   r   featuresembedsr/   r/   r0   forward_conv  s   
z!WhisperCausalEncoder.forward_convr   r   c                 C   s$   | j D ]}|||}q| |}|S r|   )r   r   )r?   r   r   encoder_layerr/   r/   r0   rD     s   

zWhisperCausalEncoder.forward)rH   rI   rJ   r   rq   r:   rM   rN   rp   r  rD   rO   r/   r/   r@   r0   r     s    
r   )r   r   )>r^   	functoolsrB   dataclassesr   r   rM   torch.nn.functionalr   r   r&   vllm.configr   r   vllm.distributedr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr	   !vllm.model_executor.layers.linearr
   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   "vllm.model_executor.models.mistralr   "vllm.model_executor.models.whisperr   vllm.v1.attention.backendr   r   r   r   r   %vllm.v1.attention.backends.flash_attnr   vllm.v1.attention.selectorr   vllm.v1.kv_cache_interfacer   utilsr   r   rN   tuplerK   rq   r   r1   Conv1dr2   	lru_cacher   r   r   r   r   r   r   r/   r/   r/   r0   <module>   sb   

"|Em6