o
    wiy                     @   s  d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ eeZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%g dZ&dS )zPyTorch BitNet model.    )CallableOptionalN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )GemmaMLP)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaRMSNormapply_rotary_pos_embeager_attention_forward   )BitNetConfigc                   @      e Zd ZdS )BitNetRMSNormN__name__
__module____qualname__ r   r   f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/bitnet/modular_bitnet.pyr   *       r   c                       s*   e Zd Zdef fddZdd Z  ZS )	BitNetMLPconfigc                    s"   t  | t|j|jd| _d S N)eps)super__init__r   intermediate_sizerms_norm_epsffn_sub_norm)selfr    	__class__r   r   r$   /   s   zBitNetMLP.__init__c              	   C   s*   |  | | | || | }|S )N)	down_projr'   act_fn	gate_projup_proj)r(   xr+   r   r   r   forward3   s   &zBitNetMLP.forward)r   r   r   r   r$   r0   __classcell__r   r   r)   r   r   .   s    r   c                       s   e Zd Zdedef fddZ		ddejdeejejf de	ej d	e	e
 d
e	ej dee deeje	ej e	eej  f fddZ  ZS )BitNetAttentionr    	layer_idxc                    s$   t  || t|j|jd| _d S r!   )r#   r$   r   hidden_sizer&   attn_sub_norm)r(   r    r3   r)   r   r   r$   9   s   zBitNetAttention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionkwargsreturnc                 K   s.  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| jd|\}}|jg |dR   }| |}| |}||fS )Nr   r   )sincosr:   eagerg        )dropoutscaling)shapehead_dimq_projview	transposek_projv_projr   updater3   r   r    _attn_implementationr   trainingattention_dropoutrB   reshape
contiguousr5   o_proj)r(   r6   r7   r8   r9   r:   r;   input_shapehidden_shapequery_states
key_statesvalue_statesr?   r>   cache_kwargsattention_interfaceattn_outputattn_weightsr   r   r   r0   =   s:   	


zBitNetAttention.forward)NN)r   r   r   r   intr$   torchTensortupler   r   
LongTensorr	   r   r0   r1   r   r   r)   r   r2   8   s&    	r2   c                   @   r   )BitNetDecoderLayerNr   r   r   r   r   r_   k   r   r_   c                   @   r   )BitNetModelNr   r   r   r   r   r`   o   r   r`   c                       s0   e Zd ZdgZdZdZdef fddZ  ZS )BitNetForCausalLMzlm_head.weightNr<   c                    s   t  jdi |S )a$  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BitNetForCausalLM

        >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")

        >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: '
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=100)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?"
        ```Nr   )r#   r0   )r(   super_kwargsr)   r   r   r0   x   s   zBitNetForCausalLM.forward)	r   r   r   _tied_weights_keys_tp_plan_pp_planr   r0   r1   r   r   r)   r   ra   s   s    ra   )ra   r`   BitNetPreTrainedModel)'__doc__typingr   r   r[   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr	   utilsr
   gemma.modeling_gemmar   llama.modeling_llamar   r   r   r   r   r   r   configuration_bitnetr   
get_loggerr   loggerr   r   r2   r_   r`   ra   __all__r   r   r   r   <module>   s(   $	

3"