o
    	۷i7                     @   s  d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ e%e&Z'dZ(G dd de Z)G dd deZ*G dd deZ+G dd deZ,G dd deZ-G dd deZ.G dd  d eZ/G d!d" d"eZ0G d#d$ d$eZ1G d%d& d&eZ2g d'Z3dS )(zPyTorch Qwen3 model.    )CallableOptionalN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )GemmaMLP)LlamaAttention)
Qwen2DecoderLayerQwen2ForCausalLMQwen2ForQuestionAnsweringQwen2ForSequenceClassificationQwen2ForTokenClassification
Qwen2ModelQwen2PreTrainedModelQwen2RMSNormapply_rotary_pos_embeager_attention_forward   )Qwen3ConfigzQwen/Qwen3-8Bc                   @      e Zd ZdS )Qwen3RMSNormN__name__
__module____qualname__ r"   r"   ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/qwen3/modular_qwen3.pyr   4       r   c                   @   r   )Qwen3MLPNr   r"   r"   r"   r#   r%   8   r$   r%   c                       s   e Zd Zdedef fddZedddd				dd
ejde	ejejf de
ej de
e de
ej dee de	eje
ej f fddZ  ZS )Qwen3Attentionconfig	layer_idxc                    sV   t  || t| j|jd| _t| j|jd| _|j| dkr&|j| _d S d | _d S )N)epssliding_attention)	super__init__r   head_dimrms_norm_epsq_normk_normlayer_typessliding_window)selfr'   r(   	__class__r"   r#   r,   =   s   $zQwen3Attention.__init__past_key_valuepast_key_valuesz4.58)new_nameversionNhidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc                 K   s4  |j d d }g |d| jR }| | ||dd}	| | ||dd}
| ||dd}|\}}t	|	|
||\}	}
|d ur]|||d}|
|
|| j|\}
}t}| jjdkrkt| jj }|| |	|
||f| jswdn| j| j| jd|\}}|jg |dR   }| |}||fS )Nr   r   )sincosr=   eagerg        )dropoutscalingr2   )shaper-   r/   q_projview	transposer0   k_projv_projr   updater(   r   r'   _attn_implementationr   trainingattention_dropoutrE   r2   reshape
contiguouso_proj)r3   r:   r;   r<   r7   r=   r>   input_shapehidden_shapequery_states
key_statesvalue_statesrB   rA   cache_kwargsattention_interfaceattn_outputattn_weightsr"   r"   r#   forwardC   s:   
	

zQwen3Attention.forward)NN)r   r    r!   r   intr,   r   torchTensortupler   r   
LongTensorr	   r   r\   __classcell__r"   r"   r4   r#   r&   <   s(    r&   c                   @   r   )Qwen3DecoderLayerNr   r"   r"   r"   r#   rc   q   r$   rc   c                   @   r   )Qwen3PreTrainedModelNr   r"   r"   r"   r#   rd   u   r$   rd   c                   @   r   )
Qwen3ModelNr   r"   r"   r"   r#   re   y   r$   re   c                       s*   e Zd Zdee def fddZ  ZS )Qwen3ForCausalLMsuper_kwargsr?   c                    s   t  jdi |S )a^  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```Nr"   )r+   r\   )r3   rg   r4   r"   r#   r\   ~   s   zQwen3ForCausalLM.forward)r   r    r!   r	   r
   r   r\   rb   r"   r"   r4   r#   rf   }   s    rf   c                   @   r   )Qwen3ForSequenceClassificationNr   r"   r"   r"   r#   rh      r$   rh   c                   @   r   )Qwen3ForTokenClassificationNr   r"   r"   r"   r#   ri      r$   ri   c                   @   r   )Qwen3ForQuestionAnsweringNr   r"   r"   r"   r#   rj      r$   rj   )rf   rj   rd   re   rh   ri   )4__doc__typingr   r   r^   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr	   utilsr
   r   utils.deprecationr   gemma.modeling_gemmar   llama.modeling_llamar   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   r   r   configuration_qwen3r   
get_loggerr   logger_CHECKPOINT_FOR_DOCr   r%   r&   rc   rd   re   rf   rh   ri   rj   __all__r"   r"   r"   r#   <module>   s6   0
5