o
    
۾i
P                     @   s  U d Z ddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 ee/B Zee8d< de9dej:fddZ;G dd de	j<Z=G d d! d!e	j<Z>G d"d# d#e	j<Z?eG d$d% d%e	j<Z@G d&d' d'e	j<e1ZAdS )(zPyTorch Falcon model.    N)Iterable)islice)	TypeAlias)nn)	LayerNorm)FalconConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)
get_act_fn)	Attention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors)RWConfig   )
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixr   total_num_headsreturnc                 C   s   dt t |  }tjddt |d     tjd}tjdd| tjd}t||}|| kritjddt d| d     tjd}t	|| | }tjddd|  dtjd}tj
|t||gdd}|S )N      )dtyper   r   dim)mathfloorlog2torchtensorfloat32arangeint32powmincat)r#   closest_power_of_2basepowersslopes
extra_basenum_remaining_headsextra_powers r<   U/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/falcon.py_get_alibi_slopesG   s$    r>   c                	       Z   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	fddZ
  ZS )FalconAttentionN configcache_configquant_configprefixc              
      st  t    |j| _t }|j| _| j| dksJ | j| | _| j| j | _| j| j | jks1J |j| _|j	| _	| jrA|j
| _n| j	rHd| _n| j| _| j|kr[| j| dksZJ n	|| j dksdJ td| j| | _
t| j| j| j| j|jd|| dd| _| j| j | _| j
| j | _dt| j | _|jp|j | _t| j| j|jd|| j| dd| _|j| _|j| _| jr| jrJ d	| jrt|d
d}t| j||jd| _ t!| j| j| j| j
|| dd| _"d S | jr%t# }|| j }|d | j }	t$| j| j }
|
||	 % }
t!| j| j| j| j
|
|| dd| _"d S t!| j| j| j| j
||| dd| _"d S )Nr   r   Tz.query_key_valuebiasskip_bias_addrD   rE   g      ?z.dense)rG   rH   rD   reduce_resultsrE   z(Rotary and alibi are mutually exclusive.max_position_embeddingsi    )max_positionrope_parametersz.attn)num_kv_headsrD   rE   )rM   alibi_slopesrD   rE   )scalerM   rC   rD   rE   )&super__init__hidden_sizer   num_attention_headsr#   	num_headshead_dimnew_decoder_architecturemulti_queryrM   total_num_kv_headsmaxr   rG   query_key_valueq_sizekv_sizer*   sqrtinv_norm_factorparallel_attnreduce_row_parallel_resultsr   denserotary
use_rotaryalibi	use_alibigetattrr   rL   
rotary_embr   attnr   r>   tolist)selfrB   rC   rD   rE   tp_sizerJ   tp_rank
head_starthead_endrN   	__class__r<   r=   rQ   _   s   







zFalconAttention.__init__	positionshidden_statesr$   c           	      C   sz   |  |\}}|d ur||7 }|j| j| j| jgdd\}}}| jr+| |||\}}| |||}| |\}}||fS )Nr(   )rZ   splitr[   r\   rc   rg   rh   ra   )	rj   rq   rr   qkvrG   qkvattn_outputr<   r<   r=   forward   s    zFalconAttention.forwardNNrA   __name__
__module____qualname__r   r	   r   strrQ   r-   Tensorrz   __classcell__r<   r<   ro   r=   r@   ^   s(    rr@   c                       sJ   e Zd Z		ddededB def fddZdejd	ejfd
dZ	  Z
S )	FalconMLPNrA   rB   rD   rE   c              	      sv   t    |j}t|d| |jd|| dd| _td| _|jp#|j	 | _
td| ||jd| j
|| dd| _d S )N   Tz.dense_h_to_4hrF   geluz.dense_4h_to_h)rG   rH   rI   rD   rE   )rP   rQ   rR   r   rG   dense_h_to_4hr   actrV   r_   r`   r   dense_4h_to_h)rj   rB   rD   rE   rR   ro   r<   r=   rQ      s,   


zFalconMLP.__init__xr$   c                 C   s>   |  |\}}|d ur||7 }| |}| |\}}||fS N)r   r   r   )rj   r   rG   r<   r<   r=   rz     s   
zFalconMLP.forward)NrA   )r}   r~   r   r   r   r   rQ   r-   r   rz   r   r<   r<   ro   r=   r      s    r   c                	       r?   )FalconDecoderLayerNrA   rB   rC   rD   rE   c                    s   t    |j}|j| _t|||| dd| _t||| dd| _|| _	t
|ds.d |_|jd u r9|jr9d|_|jsMt||jd| _t||jd| _n|jdkrct||jd| _t||jd| _nt||jd| _|jpp|j | _d S )Nz.self_attentionrE   z.mlpnum_ln_in_parallel_attnr%   eps)rP   rQ   rR   rS   rT   r@   self_attentionr   mlprB   hasattrr   rV   r_   r   layer_norm_epsilonpost_attention_layernorminput_layernormln_attnln_mlpr`   )rj   rB   rC   rD   rE   rR   ro   r<   r=   rQ     s2   




zFalconDecoderLayer.__init__rq   rr   r$   c                 C   s  |}| j jdkr| |}| |}n| |}| j||d\}}| jr,|d ur,||7 }| j js@| j jr7|}n	||7 }| 	|}| j jrP| j jrP| j jdkrP|}| 
|\}}	| jrb|	d urb||	7 }| js}||7 }t|}|d uru||7 }|	d ur}||	7 }|| }
|
S )Nr%   )rq   rr   r   )rB   r   r   r   r   r   r`   rV   r_   r   r   r   )rj   rq   rr   residualattention_layernorm_outmlp_layernorm_outattention_outputattention_bias
mlp_outputmlp_biasoutputr<   r<   r=   rz   6  sD   



zFalconDecoderLayer.forwardr{   r|   r<   r<   ro   r=   r     s(    *r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )FalconModelrA   r   vllm_configrE   c                   s   t    |jj|j |j| _j| _j	| _
j| _tj| j| _tj fdd| dd\| _| _| _t| jjd| _tdgj| _d S )Nc                    s   t  | dS )Nr   )r   r   rC   rB   rD   r<   r=   <lambda>  s    z&FalconModel.__init__.<locals>.<lambda>z.hr   r   rr   )rP   rQ   model_config	hf_configrC   rD   rB   rR   	embed_dimrS   rT   rd   re   r   
vocab_sizeword_embeddingsr!   num_hidden_layersstart_layer	end_layerhr   r   ln_fr    make_empty_intermediate_tensors)rj   r   rE   ro   r   r=   rQ   o  s*   
	
zFalconModel.__init__	input_idsr$   c                 C   s
   |  |S r   )r   rj   r   r<   r<   r=   embed_input_ids  s   
zFalconModel.embed_input_idsNrq   intermediate_tensorsinputs_embedsc                 C   sn   t  jr|d ur|}n
| |}n|d }t| j| j| jD ]}|||}qt  js0td|iS | 	|}|S )Nrr   )
r   is_first_rankr   r   r   r   r   is_last_rankr   r   )rj   r   rq   r   r   rr   layerr<   r<   r=   rz     s   
zFalconModel.forwardweightsc                 C   s  | j j}| j jr| j j}n	| j jrd}n|}|| }t| jdd}t }|D ]\}}|dr5||vr5q't	|| r;q'|| }	d|v rt
|	dd }
|j}|
d ur||d |
 ||d df ||
d d   }||
d d	|jg |d |
 d||
d d  R  }||
d |djg |d |
 d||
d d  R  }||
d |d djg |d |
 d||
d d  R  }tj|||g|
d
}t
|	dt}||	| || q'|S )Nr   F)remove_duplicatez.biasrZ   
output_dimr%   rs   r   r(   weight_loader)rB   rS   rV   rM   rW   dictnamed_parameterssetendswithr   rf   shapeviewnarrowreshaper-   r4   r   add)rj   r   r#   rX   num_query_heads_per_kv_headparams_dictloaded_paramsnameloaded_weightparamr   loaded_weight_shapewqwkwvr   r<   r<   r=   load_weights  s   








zFalconModel.load_weightsr   )r}   r~   r   r
   r   rQ   r-   r   r   r   rz   r   tupler   r   r   r<   r<   ro   r=   r   m  s     !
,r   c                       s   e Zd ZddgiZdddedef fddZdejd	ejfd
dZ			ddej
dejdedB dejdB d	ejf
ddZdejd	ejdB fddZdeeeejf  d	ee fddZ  ZS )FalconForCausalLMrZ   rA   r   r   rE   c                   s   t    |jj}|j}|| _|| _t|t|dd| _|j	d ur$|j	nd| _	| j	r0| jj
| _nt|j|j|t|dd| _t|j| _| jj| _d S )Ntransformer)r   rE   Tlm_head)rD   rE   )rP   rQ   r   r   rD   rB   r   r"   r   tie_word_embeddingsr   r   r   r   rR   r   logits_processorr   )rj   r   rE   rB   rD   ro   r<   r=   rQ     s.   


zFalconForCausalLM.__init__r   r$   c                 C   s   | j |S r   )r   r   r   r<   r<   r=   r     s   z!FalconForCausalLM.embed_input_idsNrq   r   r   c                 C   s   |  ||||}|S r   )r   )rj   r   rq   r   r   rr   r<   r<   r=   rz     s   zFalconForCausalLM.forwardrr   c                 C   s   |  | j|}|S r   )r   r   )rj   rr   logitsr<   r<   r=   compute_logits  s   z FalconForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   rB   r   r   )rj   r   loaderr<   r<   r=   r     s
   
zFalconForCausalLM.load_weights)NN)r}   r~   r   packed_modules_mappingr
   r   rQ   r-   r   r   
LongTensorr   rz   r   r   r   r   r   r   r<   r<   ro   r=   r     s0    

,r   )B__doc__r*   collections.abcr   	itertoolsr   typingr   r-   r   torch.nnr   transformersr   HF_FalconConfigvllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   r   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   vllm.transformers_utils.configsr   
interfacesr   utilsr   r   r    r!   r"   __annotations__intr   r>   Moduler@   r   r   r   r   r<   r<   r<   r=   <module>   sB    *br