o
    ei                     @   s  d dl mZ d dlmZmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, G dd dej-Z.dd Z/eddGddZ0dej1de2dej1fd d!Z3	"dHd#ej-d$ej1d%ej1d&ej1d'ej1dB d(e4d)e4d*e!e# fd+d,Z5G d-d. d.ej-Z6G d/d0 d0ej-Z7G d1d2 d2ej-Z8G d3d4 d4ej-Z9G d5d6 d6ej-Z:G d7d8 d8ej-Z;G d9d: d:eZ<G d;d< d<eZ=e$G d=d> d>e=Z>		?	dId@ej1e?ej1 B dB dAe2dB d'ej1dB dej1e2B fdBdCZ@G dDdE dEe=eZAg dFZBdS )J    )Callable)AnyOptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_func_from_hub)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )
DbrxConfigc                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )DbrxRotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr    rope_parametersr!   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr    devicerope_init_fnr   	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dbrx/modeling_dbrx.pyr&   /   s   


zDbrxRotaryEmbedding.__init__r0   ztorch.deviceseq_lenreturnztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimN      ?r      dtype)r0   r=   )	r*   getattrhidden_sizenum_attention_headstorcharangeint64tofloat)r    r0   r6   basedimattention_factorr   r4   r4   r5   r+   ?   s   
&z3DbrxRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r   mpscpuF)device_typeenabledr;   rG   r<   )r   rE   expandshaperD   r0   
isinstancetypestrr   	transposerA   catcosr,   sinr=   )
r/   xposition_idsinv_freq_expandedposition_ids_expandedrL   freqsembrV   rW   r4   r4   r5   forward]   s   0&zDbrxRotaryEmbedding.forwardNNNN)__name__
__module____qualname__rA   Tensor__annotations__r   r&   staticmethodr   inttuplerE   r+   no_gradr   r^   __classcell__r4   r4   r2   r5   r   ,   s&   
 

r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrI   r;   rN   )rP   rA   rU   )rX   x1x2r4   r4   r5   rotate_halfm   s   rm   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerm   )qkrV   rW   unsqueeze_dimq_embedk_embedr4   r4   r5   apply_rotary_pos_embt   s
   

ru   hidden_statesn_repr7   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rP   rO   reshape)rv   rw   batchnum_key_value_headsslenr9   r4   r4   r5   	repeat_kv   s
   0r|           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr;   r   rI   rG   r=   ptrainingr   )r|   num_key_value_groupsrA   matmulrT   r   
functionalsoftmaxfloat32rD   r=   r   r   
contiguous)r~   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr4   r4   r5   eager_attention_forward   s   
r   c                       s~   e Zd ZdZ	ddedB f fddZ				ddejdejdB dejdB d	e	dB d
ejdB de
ejejf fddZ  ZS )DbrxAttentionzYModular DBRX attention component that can be reused across different model architectures.N	layer_idxc                    s   t    || _|j| _|j| _| j| j | _|j| _	|| _
|j}|j| _|j| _|j| _| j| j | _| jd | _|j| _d| _tj| j| jd| j | j  dd| _tj| j| jdd| _d S )Ng      Tr;   Fbias)r%   r&   r    d_modelr?   n_heads	num_headsr9   max_seq_lenr'   r   attn_config
attn_pdropattention_dropoutclip_qkv
kv_n_headsrz   r   r   r8   	is_causalr   LinearWqkvout_proj)r/   r    r   r   r   r2   r4   r5   r&      s&   
zDbrxAttention.__init__rv   r   position_embeddingspast_key_valuescache_positionr7   c                 K   sb  |j d d }g |d| jR }| |}	| jd ur| j nd }
|	j|
| jd}	|	j| j| j| j | j| j gdd\}}}||	dd}||	dd}||	dd}|\}}t
||||\}}|d ur||||d}|||| j|\}}t| jjt}|| ||||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )	NrI   )minmaxr;   rN   r   )rW   rV   r   r}   )r   r   )rP   r9   r   r   clampsplitr?   rz   viewrT   ru   updater   r   get_interfacer    _attn_implementationr   r   r   r   rx   r   r   )r/   rv   r   r   r   r   r   input_shapehidden_shape
qkv_statesmin_valquery_statesr   r   rV   rW   cache_kwargsattention_interfacer   r   r4   r4   r5   r^      sL   	


	

zDbrxAttention.forwardr_   NNNN)ra   rb   rc   __doc__rg   r&   rA   rd   
LongTensorr	   rh   r^   rj   r4   r4   r2   r5   r      s.    r   c                
       sD   e Zd Z fddZdejdejdejdejdejf
dd	Z  ZS )
DbrxExpertGLUc                    s   t    |j| _|j| _|j| _tt| j| j | j| _	tt| j| j | j| _
tt| j| j | j| _|jdd}t| | _d S )Nnamesilu)r%   r&   r?   ffn_hidden_sizemoe_num_expertsr   	ParameterrA   emptyw1v1w2
ffn_act_fngetr   activation_fn)r/   r    act_fn_namer2   r4   r5   r&     s   
zDbrxExpertGLU.__init__rX   	expert_w1	expert_v1	expert_w2r7   c           	      C   s8   | |}| |}| |}|| }| | }|S r_   )r   r   t)	r/   rX   r   r   r   	gate_projup_projintermediate_states	down_projr4   r4   r5   r^     s   


zDbrxExpertGLU.forwardra   rb   rc   r&   rA   rd   r^   rj   r4   r4   r2   r5   r     s    r   c                       s>   e Zd Z fddZdejdejdejdejfddZ  ZS )	DbrxExpertsc                    s0   t    t|| _|j| _|j| _|j| _d S r_   )r%   r&   r   mlpr?   r   r   num_expertsr/   r    r2   r4   r5   r&   %  s
   

zDbrxExperts.__init__rv   top_k_indextop_k_weightsr7   c              	   C   sp  |j d }|d| j}tj||j|jd}t % tjj	j
|| jd}|ddd}t|jddd }W d    n1 sBw   Y  d| j| jf}|D ]]}	|	d }	t  t||	 \}
}W d    n1 snw   Y  | jj||	 }| jj||	 }| jj||	 }| || |||}|d| j|||
d f  }|d|| qP||d| j}|S )	Nr   rI   )r=   r0   )num_classesr;   r   )rI   rN   )rP   rx   r   rA   
zeros_liker=   r0   ri   r   r   one_hotr   permutegreatersumnonzeror?   wherer   r   r   r   r   
index_add_)r/   rv   r   r   
batch_sizenext_statesexpert_mask
expert_hitsplit_expert_shape
expert_idxidx	token_idxr   r   r   statesr4   r4   r5   r^   ,  s,   


zDbrxExperts.forwardr   r4   r4   r2   r5   r   $  s    r   c                       s@   e Zd Z fddZdejdeejejejf fddZ  Z	S )
DbrxRouterc                    s4   t    |j| _|j| _tj| j|jdd| _d S NFr   )	r%   r&   r   r?   moe_jitter_epsr   r   r   layerr   r2   r4   r5   r&   L  s   
zDbrxRouter.__init__rv   r7   c                 C   sR   | j r| jd ur|t|d| j d| j 9 }|d|jd }| |}|S )Nr:   rI   )r   r   rA   
empty_likeuniform_r   rP   r   )r/   rv   router_logitsr4   r4   r5   r^   R  s   
zDbrxRouter.forward)
ra   rb   rc   r&   rA   rd   rh   r   r^   rj   r4   r4   r2   r5   r   K  s    ,r   c                       sH   e Zd ZdZ fddZdd Zdejdeejejf fdd	Z	  Z
S )
DbrxFFNz0Modular DBRX MLP/FFN component with MoE support.c                    s:   t    t|j| _t|j| _|jj| _|jj| _	d S r_   )
r%   r&   r   
ffn_configrouterr   expertsmoe_normalize_expert_weights	moe_top_ktop_k)r/   r    r   r2   r4   r5   r&   _  s
   

zDbrxFFN.__init__c                 C   sV   t jjj|d|jd}t j|| jdd\}}| jd ur'|t j|| jddd }||fS )Nr   r   rI   rN   T)r   rG   keepdim)	rA   r   r   r   r=   topkr   r   norm)r/   r   router_top_valuerouter_indicesr4   r4   r5   route_tokens_to_expertsg  s   

zDbrxFFN.route_tokens_to_expertsrv   r7   c                 C   s*   |  |}| |\}}| |||}|S r_   )r   r   r   )r/   rv   r   r   r   outputr4   r4   r5   r^   p  s   
zDbrxFFN.forward)ra   rb   rc   r   r&   r   rA   rd   rh   r^   rj   r4   r4   r2   r5   r   \  s
    (	r   c                       sz   e Zd ZddededB f fddZ			ddejdejdejdB d	e	dB d
ejdB de
deejejf fddZ  ZS )DbrxNormAttentionNormNr    r   c                    sN   t    || _|j| _tj|jdd| _t||d| _	tj|jdd| _
d S )NFr   r    r   )r%   r&   r   resid_pdropr   	LayerNormr   norm_1r   attnnorm_2r/   r    r   r2   r4   r5   r&   x  s   
zDbrxNormAttentionNorm.__init__rv   r   r   r   r   r   r7   c           	      K   sr   |}|  ||j}| jd|||||d|\}}tjj|| j| jd}|| }|}| 	||j}||fS N)rv   r   r   r   r   r   r4   )
r  rD   r=   r  r   r   r   r  r   r  )	r/   rv   r   r   r   r   r   residual_states_r4   r4   r5   r^     s    	
	zDbrxNormAttentionNorm.forwardr_   r`   )ra   rb   rc   r   rg   r&   rA   rd   r   r	   r   rh   r^   rj   r4   r4   r2   r5   r   w  s(    r   c                       sj   e Zd Zdedef fddZ				ddejdejdB dejdB d	e	dB d
ejdB de
fddZ  ZS )	DbrxBlockr    r   c                    s>   t    |j| _|j| _|| _t||d| _t|d| _	d S )Nr  r    )
r%   r&   r   r?   r  r   r   norm_attn_normr   ffnr  r2   r4   r5   r&     s   
zDbrxBlock.__init__Nrv   r   r   r   r   r   c                 K   sL   | j d|||||d|\}}| |}tjj|| j| jd}|| }|S r  )r  r  r   r   r   r  r   )r/   rv   r   r   r   r   r   resid_statesr4   r4   r5   r^     s   	

	zDbrxBlock.forwardr   )ra   rb   rc   r   rg   r&   rA   rd   r   r	   r   r^   rj   r4   r4   r2   r5   r    s&    r  c                       sh   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZe dejf fd	d
Z  ZS )DbrxPreTrainedModelr    transformerTr  r   F)rv   
attentionsr~   c                    s\   t  | | jj}t|tr,tj|jd|d tj|j	d|d tj|j
d|d d S d S )Nr}   )meanstd)r%   _init_weightsr    initializer_rangerQ   r   initnormal_r   r   r   )r/   r~   r  r2   r4   r5   r    s   
z!DbrxPreTrainedModel._init_weights)ra   rb   rc   r   re   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flex_attn_supports_attention_backend_supports_flash_attn_supports_sdpa_can_compile_fullgraphr  r   _can_record_outputsrA   ri   r   Moduler  rj   r4   r4   r2   r5   r    s    
 r  c                       s   e Zd ZdZdef fddZdejfddZdejfd	d
Z	e
ee							ddejdB dejdB dejdB dedB dejdB dedB dejdB dee defddZ  ZS )	DbrxModela  Transformer decoder consisting of *config.num_hidden_layers*. Each layer is a [`DbrxBlock`] layer.

    Args:
        config ([`DbrxConfig`]): Model configuration class with all parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    r    c                    s   t     j| _ j| _ j| _t | _t	 j j
| j| _t fddt jD | _tj j
dd| _d| _|   d S )Nc                    s   g | ]}t  |qS r4   )r  ).0r   r  r4   r5   
<listcomp>      z&DbrxModel.__init__.<locals>.<listcomp>Fr   )r%   r&   pad_token_idpadding_idx
vocab_size	emb_pdropr   
rotary_embr   	Embeddingr   wte
ModuleListrangen_layersblocksr  norm_fgradient_checkpointing	post_initr   r2   r  r5   r&     s   
 zDbrxModel.__init__r7   c                 C      | j S r_   r.  r/   r4   r4   r5   get_input_embeddings     zDbrxModel.get_input_embeddingsr   c                 C   
   || _ d S r_   r7  r/   r   r4   r4   r5   set_input_embeddings     
zDbrxModel.set_input_embeddingsN	input_idsr   rY   r   inputs_embeds	use_cacher   r   c              
   K   s   |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|}| ||}| jd | jj D ]}||f||
||||d|}qb| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r   )r0   )r    r@  r   r   r   rY   )r   r   rY   r   rA  r   )last_hidden_stater   )
ValueErrorr
   r    r.  get_seq_lengthrA   rB   rP   r0   ro   r   r,  r2  num_hidden_layersr3  r   )r/   r?  r   rY   r   r@  rA  r   r   past_seen_tokenscausal_maskrv   r   decoder_layerr4   r4   r5   r^     sR   

	
zDbrxModel.forward)NNNNNNN)ra   rb   rc   r   r   r&   r   r-  r9  r=  r   r   r   rA   r   rd   r	   FloatTensorboolr   r   r   r^   rj   r4   r4   r2   r5   r$    sD    	
r$  r;   gate_logitsr   c                    s  | du s	t | tsdS t | tr#| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rStj
| dd}	tj
|dd}
ng|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||fd| }tj|| ddtj|dd }
t|	|
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r4   )rD   )r%  
layer_gatecompute_devicer4   r5   r&  `  r'  z,load_balancing_loss_func.<locals>.<listcomp>rN   rI   )rQ   rh   r0   rA   rU   r   r   r   r   r   r  rE   rP   rO   rx   rD   r   ro   )rK  r   r   r   concatenated_gate_logitsrouting_weightsr
  selected_expertsr   tokens_per_expertrouter_prob_per_expertr   sequence_lengthrE  expert_attention_mask router_per_expert_attention_maskoverall_lossr4   rM  r5   load_balancing_loss_func>  s>   



rX  c                       s0  e Zd ZddiZddiZddgdgfiZdef fdd	Zd
ej	fddZ
dej	fddZd
ejfddZdejfddZdefddZd
efddZee										d)dejdB dejdB dejdB dedB d ejdB d!ejdB d"edB d#edB d$ejdB d%eejB d&ee d
efd'd(Z  ZS )*DbrxForCausalLMzlm_head.weightztransformer.wte.weightlm_headcolwise_gather_outputrv   logitsr    c                    s^   t  | t|| _|j| _tj|j|jdd| _|j	j
| _|j	j| _|j	j| _|   d S r   )r%   r&   r$  r  r*  r   r   r?   rZ  r   moe_loss_weightrouter_aux_loss_coefr   r   r   num_experts_per_tokr5  r   r2   r4   r5   r&     s   



zDbrxForCausalLM.__init__r7   c                 C   s
   | j  S r_   )r  r9  r8  r4   r4   r5   r9    r>  z$DbrxForCausalLM.get_input_embeddingsr   c                 C   s   | j | d S r_   )r  r=  r<  r4   r4   r5   r=    s   z$DbrxForCausalLM.set_input_embeddingsc                 C   r6  r_   rZ  r8  r4   r4   r5   get_output_embeddings  r:  z%DbrxForCausalLM.get_output_embeddingsnew_embeddingsc                 C   r;  r_   r`  )r/   rb  r4   r4   r5   set_output_embeddings  r>  z%DbrxForCausalLM.set_output_embeddingsdecoderc                 C   r;  r_   r  )r/   rd  r4   r4   r5   set_decoder  r>  zDbrxForCausalLM.set_decoderc                 C   r6  r_   re  r8  r4   r4   r5   get_decoder  r:  zDbrxForCausalLM.get_decoderNr   r?  r   rY   r   r@  labelsrA  output_router_logitsr   logits_to_keepr   c                 K   s   |dur|n| j j}| jd||||||||	d|}|j}t|
tr)t|
 dn|
}| |dd|ddf }d}|durK| j||| j	fi |}d}|rht
|j| j| j|}|durh|| j||j 7 }t||||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, DbrxForCausalLM

        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        N)r?  r   rY   r   r@  rA  ri  r   )lossaux_lossr\  r   rv   r  r   r4   )r    ri  r  rB  rQ   rg   slicerZ  loss_functionr*  rX  r   r   r_  r^  rD   r0   r   r   rv   r  )r/   r?  r   rY   r   r@  rh  rA  ri  r   rj  r   outputsrv   slice_indicesr\  rk  rl  r4   r4   r5   r^     sN   (	zDbrxForCausalLM.forward)
NNNNNNNNNr   ) ra   rb   rc   _tied_weights_keys_tp_plan_pp_planr   r&   r   r-  r9  r=  r   ra  rc  r$  rf  rg  r   r   rA   r   rd   r	   rI  rJ  rg   r   r   r   r^   rj   r4   r4   r2   r5   rY    s`    
	
rY  )rY  r$  r  )r   )r}   )Nr;   N)Ccollections.abcr   typingr   r   rA   r    r   r  activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_dbrxr   r#  r   rm   ru   rd   rg   r|   rE   r   r   r   r   r   r   r   r  r  r$  rh   rX  rY  __all__r4   r4   r4   r5   <module>   s   A
X'*$`
Rx