o
    ei#                     @   s  d dl Z d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) G dd de%Z*G dd de!Z+dd Z,G dd de'Z-G dd deZ.G dd deZ/eG dd  d e Z0eG d!d" d"eZ1eG d#d$ d$eZ2g d%Z3dS )&    N)Callable   )initialization)CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring   )CLIPMLP)Gemma2ForCausalLM)LlamaDecoderLayer
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)Llama4TextL2Norm)Qwen3Attention   )NanoChatConfigc                   @      e Zd ZdS )NanoChatRMSNormN__name__
__module____qualname__ r"   r"   k/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/nanochat/modular_nanochat.pyr   +       r   c                   @   r   )NanoChatRotaryEmbeddingNr   r"   r"   r"   r#   r%   /   r$   r%   c                 C   sH   | dd| j d d f }| d| j d d df }tj|| fddS )zJRotates half the hidden dims of the input with flipped signs for NanoChat..Nr   )dim)shapetorchcat)xx1x2r"   r"   r#   rotate_half3   s   r.   c                       s   e Zd Zdedef fddZ				ddejdeejejf dB dejdB d	e	dB d
ej
dB dee deejejdB f fddZ  ZS )NanoChatAttentionconfig	layer_idxc                    s6   t  || | `| `t|jd| _t|jd| _d S N)eps)super__init__sliding_window
layer_typer   rms_norm_epsq_normk_normselfr0   r1   	__class__r"   r#   r5   ;   s
   zNanoChatAttention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc                 K   s,  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
| |	}	| 	|
}
|d ura|||d}|
|
|| j|\}
}t| jjt}|| |	|
||f| jsudn| j| jd|\}}|jg |dR   }| |}||fS )Nr&   r   r   )sincosrC           )dropoutscaling)r(   head_dimq_projview	transposek_projv_projr   r9   r:   updater1   r
   get_interfacer0   _attn_implementationr   trainingattention_dropoutrJ   reshape
contiguouso_proj)r<   r?   r@   rA   rB   rC   rD   input_shapehidden_shapequery_states
key_statesvalue_statesrG   rF   cache_kwargsattention_interfaceattn_outputattn_weightsr"   r"   r#   forwardC   s<   	



zNanoChatAttention.forward)NNNN)r   r    r!   r   intr5   r)   Tensortupler   
LongTensorr   r   rb   __classcell__r"   r"   r=   r#   r/   :   s*    r/   c                       s   e Zd Z fddZ  ZS )NanoChatMLPc                    s<   t  | tj|j|jdd| _tj|j|jdd| _d S )NF)bias)r4   r5   nnLinearhidden_sizeintermediate_sizefc1fc2r<   r0   r=   r"   r#   r5   t   s   zNanoChatMLP.__init__)r   r    r!   r5   rg   r"   r"   r=   r#   rh   s   s    rh   c                       s&   e Zd Zdedef fddZ  ZS )NanoChatDecoderLayerr0   r1   c                    s*   t    t|jd| _t|jd| _d S r2   )r4   r5   r   r8   input_layernormpost_attention_layernormr;   r=   r"   r#   r5   {   s   
zNanoChatDecoderLayer.__init__)r   r    r!   r   rc   r5   rg   r"   r"   r=   r#   rq   z   s    rq   c                   @   s    e Zd ZdejddfddZdS )NanoChatPreTrainedModelmodulerE   Nc                 C   sH   t | | t|tr"tj|jjd| jj	t
d| jj  d d S d S )NrH   r   )meanstd)r   _init_weights
isinstancer/   initnormal_rX   weightr0   initializer_rangemathsqrtnum_hidden_layers)r<   ru   r"   r"   r#   rx      s   

z%NanoChatPreTrainedModel._init_weights)r   r    r!   rj   Modulerx   r"   r"   r"   r#   rt      s    rt   c                       s   e Zd Zdef fddZ							ddejdB dejdB dejdB dedB d	ej	dB d
ejdB de
dB dee defddZ  ZS )NanoChatModelr0   c                    s   t  | t|jd| _d S r2   )r4   r5   r   r8   normrp   r=   r"   r#   r5      s   zNanoChatModel.__init__N	input_idsrA   position_idsrB   inputs_embedsrC   	use_cacherD   rE   c              	   K   s  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r<|d ur-| nd}	tj|jd |jd|	 }|d u rE|	d}t
| j|||||d}
|}| j||d}| |}| jd | jj D ]}||f|
||||d|}qg| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embeds)r0   r   r   )device)r0   r   rA   rC   rB   r   )r   )rA   r@   r   rB   rC   )last_hidden_staterB   )
ValueErrorembed_tokensr   r0   get_seq_lengthr)   aranger(   r   	unsqueezer   
rotary_embr   layersr   r   )r<   r   rA   r   rB   r   rC   r   rD   past_seen_tokenscausal_maskr?   r@   decoder_layerr"   r"   r#   rb      sP   

	


zNanoChatModel.forward)NNNNNNN)r   r    r!   r   r5   r)   rf   rd   r   FloatTensorboolr   r   r   rb   rg   r"   r"   r=   r#   r      s8    	
r   c                       s*   e Zd ZddiZdef fddZ  ZS )NanoChatForCausalLMlm_headcolwise_gather_outputrE   c                    s   t  jdi | dS )ak  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, AutoModelForCausalLM

        >>> model = AutoModelForCausalLM.from_pretrained("karpathy/nanochat-d32")

        >>> tokenizer = AutoTokenizer.from_pretrained("karpathy/nanochat-d32")

        >>> conversation = [
                {"role": "user", "content": "What is the capital of France?"},
            ]

        >>> inputs = tokenizer.apply_chat_template(
                conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
            ).to(device)

        >>> with torch.no_grad():
        >>>     outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)

        >>> generated_tokens = outputs[0, inputs["input_ids"].shape[1] :]
        >>> output = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        ```Nr"   )r4   rb   )r<   super_kwargsr=   r"   r#   rb      s   zNanoChatForCausalLM.forward)r   r    r!   _tp_planr	   rb   rg   r"   r"   r=   r#   r      s    r   )rt   r   r   )4r~   collections.abcr   r)   torch.nnrj    r   rz   cache_utilsr   r   masking_utilsr   modeling_outputsr   r	   modeling_utilsr
   r   processing_utilsr   utilsr   r   clip.modeling_clipr   gemma2.modeling_gemma2r   llama.modeling_llamar   r   r   r   r   r   llama4.modeling_llama4r   qwen3.modeling_qwen3r   configuration_nanochatr   r   r%   r.   r/   rh   rq   rt   r   r   __all__r"   r"   r"   r#   <module>   s<    9B