o
    eiX                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ	 ddl
mZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, G dd de&Z-G dd dej.Z/G dd dej.Z0G dd dej.Z1G d d! d!ej.Z2G d"d# d#ej.Z3G d$d% d%ej.Z4G d&d' d'eZ5G d(d) d)eZ6eG d*d+ d+e6Z7G d,d- d-e6eZ8g d.Z9dS )/z"Modular components for DBRX model.    )Callable)AnyN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )LlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward)load_balancing_loss_func   )
DbrxConfigc                   @   s   e Zd ZdS )DbrxRotaryEmbeddingN)__name__
__module____qualname__ r"   r"   c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dbrx/modular_dbrx.pyr   -   s    r   c                       s~   e Zd ZdZ	ddedB f fddZ				ddejdejdB dejdB d	e	dB d
ejdB de
ejejf fddZ  ZS )DbrxAttentionzYModular DBRX attention component that can be reused across different model architectures.N	layer_idxc                    s   t    || _|j| _|j| _| j| j | _|j| _	|| _
|j}|j| _|j| _|j| _| j| j | _| jd | _|j| _d| _tj| j| jd| j | j  dd| _tj| j| jdd| _d S )Ng      Tr   Fbias)super__init__configd_modelhidden_sizen_heads	num_headshead_dimmax_seq_lenmax_position_embeddingsr%   attn_config
attn_pdropattention_dropoutclip_qkv
kv_n_headsnum_key_value_headsnum_key_value_groupsscaling
rope_theta	is_causalr   LinearWqkvout_proj)selfr*   r%   kwargsr2   	__class__r"   r#   r)   4   s&   
zDbrxAttention.__init__hidden_statesattention_maskposition_embeddingspast_key_valuescache_positionreturnc                 K   sb  |j d d }g |d| jR }| |}	| jd ur| j nd }
|	j|
| jd}	|	j| j| j| j | j| j gdd\}}}||	dd}||	dd}||	dd}|\}}t
||||\}}|d ur||||d}|||| j|\}}t| jjt}|| ||||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )	N)minmaxr   dimr   )sincosrG           )dropoutr9   )shaper/   r=   r5   clampsplitr,   r7   view	transposer   updater%   r   get_interfacer*   _attn_implementationr   trainingr4   r9   reshape
contiguousr>   )r?   rC   rD   rE   rF   rG   r@   input_shapehidden_shape
qkv_statesmin_valquery_states
key_statesvalue_statesrO   rN   cache_kwargsattention_interfaceattn_outputattn_weightsr"   r"   r#   forwardP   sL   	


	

zDbrxAttention.forwardNNNNN)r   r    r!   __doc__intr)   torchTensor
LongTensorr   tuplerh   __classcell__r"   r"   rA   r#   r$   1   s.    r$   c                
       sD   e Zd Z fddZdejdejdejdejdejf
dd	Z  ZS )
DbrxExpertGLUc                    s   t    |j| _|j| _|j| _tt| j| j | j| _	tt| j| j | j| _
tt| j| j | j| _|jdd}t| | _d S )Nnamesilu)r(   r)   r,   ffn_hidden_sizemoe_num_expertsr   	Parameterrm   emptyw1v1w2
ffn_act_fngetr   activation_fn)r?   r*   act_fn_namerA   r"   r#   r)      s   
zDbrxExpertGLU.__init__x	expert_w1	expert_v1	expert_w2rH   c           	      C   s8   | |}| |}| |}|| }| | }|S ri   )matmulr~   t)	r?   r   r   r   r   	gate_projup_projintermediate_states	down_projr"   r"   r#   rh      s   


zDbrxExpertGLU.forwardr   r    r!   r)   rm   rn   rh   rq   r"   r"   rA   r#   rr      s    rr   c                       s>   e Zd Z fddZdejdejdejdejfddZ  ZS )	DbrxExpertsc                    s0   t    t|| _|j| _|j| _|j| _d S ri   )r(   r)   rr   mlpr,   ru   rv   num_expertsr?   r*   rA   r"   r#   r)      s
   

zDbrxExperts.__init__rC   top_k_indextop_k_weightsrH   c              	   C   sp  |j d }|d| j}tj||j|jd}t % tjj	j
|| jd}|ddd}t|jddd }W d    n1 sBw   Y  d| j| jf}|D ]]}	|	d }	t  t||	 \}
}W d    n1 snw   Y  | jj||	 }| jj||	 }| jj||	 }| || |||}|d| j|||
d f  }|d|| qP||d| j}|S )	Nr   rI   )dtypedevice)num_classesr   r   )rI   rL   )rR   r[   ru   rm   
zeros_liker   r   no_gradr   
functionalone_hotr   permutegreatersumnonzeror,   wherer   rz   rU   ry   r{   
index_add_)r?   rC   r   r   
batch_sizenext_statesexpert_mask
expert_hitsplit_expert_shape
expert_idxidx	token_idxrz   ry   r{   statesr"   r"   r#   rh      s,   


zDbrxExperts.forwardr   r"   r"   rA   r#   r      s    r   c                       s@   e Zd Z fddZdejdeejejejf fddZ  Z	S )
DbrxRouterc                    s4   t    |j| _|j| _tj| j|jdd| _d S NFr&   )	r(   r)   ru   r,   moe_jitter_epsr   r<   rv   layerr   rA   r"   r#   r)      s   
zDbrxRouter.__init__rC   rH   c                 C   sR   | j r| jd ur|t|d| j d| j 9 }|d|jd }| |}|S )Ng      ?rI   )rZ   r   rm   
empty_likeuniform_rU   rR   r   )r?   rC   router_logitsr"   r"   r#   rh      s   
zDbrxRouter.forward)
r   r    r!   r)   rm   rn   rp   ro   rh   rq   r"   r"   rA   r#   r      s    ,r   c                       sH   e Zd ZdZ fddZdd Zdejdeejejf fdd	Z	  Z
S )
DbrxFFNz0Modular DBRX MLP/FFN component with MoE support.c                    s:   t    t|j| _t|j| _|jj| _|jj| _	d S ri   )
r(   r)   r   
ffn_configrouterr   expertsmoe_normalize_expert_weights	moe_top_ktop_k)r?   r*   r@   rA   r"   r#   r)      s
   

zDbrxFFN.__init__c                 C   sV   t jjj|d|jd}t j|| jdd\}}| jd ur'|t j|| jddd }||fS )Nr   )rM   r   rI   rL   T)prM   keepdim)	rm   r   r   softmaxr   topkr   r   norm)r?   r   router_top_valuerouter_indicesr"   r"   r#   route_tokens_to_experts   s   

zDbrxFFN.route_tokens_to_expertsrC   rH   c                 C   s*   |  |}| |\}}| |||}|S ri   )r   r   r   )r?   rC   r   r   r   outputr"   r"   r#   rh      s   
zDbrxFFN.forward)r   r    r!   rk   r)   r   rm   rn   rp   rh   rq   r"   r"   rA   r#   r      s
    (	r   c                       sz   e Zd ZddededB f fddZ			ddejdejdejdB d	e	dB d
ejdB de
deejejf fddZ  ZS )DbrxNormAttentionNormNr*   r%   c                    sN   t    || _|j| _tj|jdd| _t||d| _	tj|jdd| _
d S )NFr&   r*   r%   )r(   r)   r%   resid_pdropr   	LayerNormr+   norm_1r$   attnnorm_2r?   r*   r%   rA   r"   r#   r)      s   
zDbrxNormAttentionNorm.__init__rC   rE   rD   rF   rG   r@   rH   c           	      K   sr   |}|  ||j}| jd|||||d|\}}tjj|| j| jd}|| }|}| 	||j}||fS N)rC   rD   rE   rF   rG   )r   rZ   r"   )
r   tor   r   r   r   rQ   r   rZ   r   )	r?   rC   rE   rD   rF   rG   r@   residual_states_r"   r"   r#   rh     s    	
	zDbrxNormAttentionNorm.forwardri   )NNN)r   r    r!   r   rl   r)   rm   rn   ro   r   r   rp   rh   rq   r"   r"   rA   r#   r      s(    r   c                       sj   e Zd Zdedef fddZ				ddejdejdB dejdB d	e	dB d
ejdB de
fddZ  ZS )	DbrxBlockr*   r%   c                    s>   t    |j| _|j| _|| _t||d| _t|d| _	d S )Nr   r*   )
r(   r)   r+   r,   r   r%   r   norm_attn_normr   ffnr   rA   r"   r#   r)      s   
zDbrxBlock.__init__NrC   rD   rE   rF   rG   r@   c                 K   sL   | j d|||||d|\}}| |}tjj|| j| jd}|| }|S r   )r   r   r   r   rQ   r   rZ   )r?   rC   rD   rE   rF   rG   r@   resid_statesr"   r"   r#   rh   +  s   	

	zDbrxBlock.forwardrj   )r   r    r!   r   rl   r)   rm   rn   ro   r   r   rh   rq   r"   r"   rA   r#   r     s&    r   c                       sh   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZe dejf fd	d
Z  ZS )DbrxPreTrainedModelr*   transformerTr   rF   F)rC   
attentionsmodulec                    s\   t  | | jj}t|tr,tj|jd|d tj|j	d|d tj|j
d|d d S d S )NrP   )meanstd)r(   _init_weightsr*   initializer_range
isinstancerr   initnormal_ry   rz   r{   )r?   r   r   rA   r"   r#   r   S  s   
z!DbrxPreTrainedModel._init_weights)r   r    r!   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flex_attn_supports_attention_backend_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   r$   _can_record_outputsrm   r   r   Moduler   rq   r"   r"   rA   r#   r   C  s    
 r   c                       s   e Zd ZdZdef fddZdejfddZdejfd	d
Z	e
ee							ddejdB dejdB dejdB dedB dejdB dedB dejdB dee defddZ  ZS )	DbrxModela  Transformer decoder consisting of *config.num_hidden_layers*. Each layer is a [`DbrxBlock`] layer.

    Args:
        config ([`DbrxConfig`]): Model configuration class with all parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    r*   c                    s   t     j| _ j| _ j| _t | _t	 j j
| j| _t fddt jD | _tj j
dd| _d| _|   d S )Nc                    s   g | ]}t  |qS r"   )r   ).0r%   r   r"   r#   
<listcomp>n  s    z&DbrxModel.__init__.<locals>.<listcomp>Fr&   )r(   r)   pad_token_idpadding_idx
vocab_size	emb_pdropr   
rotary_embr   	Embeddingr+   wte
ModuleListrangen_layersblocksr   norm_fgradient_checkpointing	post_initr   rA   r   r#   r)   g  s   
 zDbrxModel.__init__rH   c                 C      | j S ri   r   r?   r"   r"   r#   get_input_embeddingsu     zDbrxModel.get_input_embeddingsvaluec                 C   
   || _ d S ri   r   r?   r   r"   r"   r#   set_input_embeddingsx     
zDbrxModel.set_input_embeddingsN	input_idsrD   position_idsrF   inputs_embeds	use_cacherG   r@   c              
   K   s   |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|}| ||}| jd | jj D ]}||f||
||||d|}qb| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   )r*   r  rD   rG   rF   r  )rE   rD   r  rF   r  rG   )last_hidden_staterF   )
ValueErrorr	   r*   r   get_seq_lengthrm   arangerR   r   	unsqueezer   r   r   num_hidden_layersr   r   )r?   r  rD   r  rF   r  r  rG   r@   past_seen_tokenscausal_maskrC   rE   decoder_layerr"   r"   r#   rh   {  sR   

	
zDbrxModel.forward)NNNNNNN)r   r    r!   rk   r   r)   r   r   r   r   r   r   r   rm   ro   rn   r   FloatTensorboolr   r   r   rh   rq   r"   r"   rA   r#   r   ]  sD    	
r   c                       s0  e Zd ZddiZddiZddgdgfiZdef fdd	Zd
ej	fddZ
dej	fddZd
ejfddZdejfddZdefddZd
efddZee										d)dejdB dejdB dejdB dedB d ejdB d!ejdB d"edB d#edB d$ejdB d%eejB d&ee d
efd'd(Z  ZS )*DbrxForCausalLMzlm_head.weightztransformer.wte.weightlm_headcolwise_gather_outputrC   logitsr*   c                    s^   t  | t|| _|j| _tj|j|jdd| _|j	j
| _|j	j| _|j	j| _|   d S r   )r(   r)   r   r   r   r   r<   r,   r  r   moe_loss_weightrouter_aux_loss_coefrv   r   r   num_experts_per_tokr   r   rA   r"   r#   r)     s   



zDbrxForCausalLM.__init__rH   c                 C   s
   | j  S ri   )r   r   r   r"   r"   r#   r     r  z$DbrxForCausalLM.get_input_embeddingsr   c                 C   s   | j | d S ri   )r   r   r   r"   r"   r#   r     s   z$DbrxForCausalLM.set_input_embeddingsc                 C   r   ri   r  r   r"   r"   r#   get_output_embeddings  r   z%DbrxForCausalLM.get_output_embeddingsnew_embeddingsc                 C   r   ri   r  )r?   r  r"   r"   r#   set_output_embeddings  r  z%DbrxForCausalLM.set_output_embeddingsdecoderc                 C   r   ri   r   )r?   r  r"   r"   r#   set_decoder  r  zDbrxForCausalLM.set_decoderc                 C   r   ri   r  r   r"   r"   r#   get_decoder  r   zDbrxForCausalLM.get_decoderNr   r  rD   r  rF   r  labelsr  output_router_logitsrG   logits_to_keepr@   c                 K   s   |dur|n| j j}| jd||||||||	d|}|j}t|
tr)t|
 dn|
}| |dd|ddf }d}|durK| j||| j	fi |}d}|rht
|j| j| j|}|durh|| j||j 7 }t||||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >> from transformers import AutoTokenizer, DbrxForCausalLM

        >> model = DbrxForCausalLM.from_pretrained("transformers-community/dbrx-instruct")
        >> tokenizer = AutoTokenizer.from_pretrained("transformers-community/dbrx-instruct")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```
        N)r  rD   r  rF   r  r  r!  rG   )lossaux_lossr  rF   rC   r   r   r"   )r*   r!  r   r  r   rl   slicer  loss_functionr   r   r   r   r  r  r   r   r   rF   rC   r   )r?   r  rD   r  rF   r  r   r  r!  rG   r"  r@   outputsrC   slice_indicesr  r#  r$  r"   r"   r#   rh     sN   (	zDbrxForCausalLM.forward)
NNNNNNNNNr   ) r   r    r!   _tied_weights_keys_tp_plan_pp_planr   r)   r   r   r   r   r<   r  r  r   r  r  r   r   rm   ro   rn   r   r  r  rl   r   r   r   rh   rq   r"   r"   rA   r#   r    s`    
	
r  )r  r   r   ):rk   collections.abcr   typingr   rm   r    r   r   activationsr   cache_utilsr   r	   
generationr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   mixtral.modeling_mixtralr   configuration_dbrxr   r   r   r$   rr   r   r   r   r   r   r   r   r  __all__r"   r"   r"   r#   <module>   sB   X'*$^x