"""PyTorch Qwen3 model."""

from collections.abc import Callable

import torch

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ..gemma.modeling_gemma import GemmaMLP
from ..llama.modeling_llama import LlamaAttention
from ..qwen2.modeling_qwen2 import (
    Qwen2ForCausalLM,
    Qwen2ForQuestionAnswering,
    Qwen2ForSequenceClassification,
    Qwen2ForTokenClassification,
    Qwen2RMSNorm,
    Qwen2RotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_qwen3 import Qwen3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"


class Qwen3RMSNorm(Qwen2RMSNorm):
    pass


class Qwen3MLP(GemmaMLP):
    pass


class Qwen3RotaryEmbedding(Qwen2RotaryEmbedding):
    pass


class Qwen3Attention(LlamaAttention):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None
        super().__init__(config, layer_idx)
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # thus post q_norm does not need reshape
        self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None,
        past_key_values: Cache | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        # Project, split heads, and apply the per-head q/k RMSNorm before RoPE
        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Qwen3ForCausalLM(Qwen2ForCausalLM):
    def forward(self, **super_kwargs: Unpack[TransformersKwargs]) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class Qwen3ForSequenceClassification(Qwen2ForSequenceClassification):
    pass


class Qwen3ForTokenClassification(Qwen2ForTokenClassification):
    pass


class Qwen3ForQuestionAnswering(Qwen2ForQuestionAnswering):
    pass


__all__ = [
    "Qwen3ForCausalLM",
    "Qwen3ForQuestionAnswering",
    "Qwen3PreTrainedModel",  # noqa: F822 (generated by the modular converter)
    "Qwen3Model",  # noqa: F822 (generated by the modular converter)
    "Qwen3ForSequenceClassification",
    "Qwen3ForTokenClassification",
]