o
    ei4                     @   s  d dl Z d dl mZ ddlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* G dd deZ+G dd deZ,G dd deZ-G dd de Z.G dd dej/Z0G dd  d e"Z1G d!d" d"e%Z2eG d#d$ d$e#eZ3eG d%d& d&e'Z4G d'd( d(e&Z5g d)Z6dS )*    N)nn   )initialization)ACT2FN)CacheDynamicCache)create_causal_mask)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstring)can_return_tuplemerge_with_config_defaults)capture_outputs   )GraniteRMSNormGraniteRotaryEmbedding)JetMoeParallelExpertsJetMoeTopKGating)LlamaAttentionLlamaPreTrainedModel)MixtralDecoderLayerMixtralForCausalLMMixtralModelload_balancing_loss_func   )GraniteMoeConfigc                   @      e Zd ZdS )GraniteMoeRMSNormN__name__
__module____qualname__ r%   r%   o/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/granitemoe/modular_granitemoe.pyr    $       r    c                   @   r   )GraniteMoeRotaryEmbeddingNr!   r%   r%   r%   r&   r(   (   r'   r(   c                   @   r   )GraniteMoeParallelExpertsNr!   r%   r%   r%   r&   r)   ,   r'   r)   c                   @   r   )GraniteMoeTopKGatingNr!   r%   r%   r%   r&   r*   0   r'   r*   c                       s.   e Zd ZdZdef fddZdd Z  ZS )GraniteMoeMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    configc                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr   )
input_sizenum_expertstop_k)super__init__hidden_sizer-   intermediate_sizer   
hidden_act
activationr)   num_local_expertsinput_linearoutput_linearr*   num_experts_per_tokrouterselfr,   	__class__r%   r&   r1   =   s   
zGraniteMoeMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}|| }	| |	|}
|
jddd}| |d |d  }
| |
|}||d d d f  }tj|| | j	f|j
|jd}|d||}|||| j	}|S )Nr   )dimr   r   )dtypedevice)sizereshaper:   r7   chunkr5   r8   torchzerosr-   rA   rB   	index_addview)r<   layer_inputbszlengthemb_size_batch_indexbatch_gatesexpert_sizeexpert_inputshidden_stateschunked_hidden_statesexpert_outputsrG   layer_outputr%   r%   r&   forwardL   s   zGraniteMoeMoE.forward)r"   r#   r$   __doc__r   r1   rW   __classcell__r%   r%   r=   r&   r+   4   s    r+   c                       s&   e Zd Zdedef fddZ  ZS )GraniteMoeAttentionr,   	layer_idxc                    s   t  | || |j| _d S N)r0   r1   attention_multiplierscalingr<   r,   r[   r=   r%   r&   r1   `   s   zGraniteMoeAttention.__init__)r"   r#   r$   r   intr1   rY   r%   r%   r=   r&   rZ   _   s    rZ   c                       sv   e Zd Zdedef fddZ				ddejdejdB dedB d	ej	dB d
e
ejejf dB dejfddZ  ZS )GraniteMoeDecoderLayerr,   r[   c                    sd   t  || t||d| _t|| _t|j|jd| _	t|j|jd| _
| `t|| _|j| _d S )N)r,   r[   eps)r0   r1   rZ   	self_attnr+   block_sparse_moer    r2   rms_norm_epsinput_layernormpost_attention_layernormmlpresidual_multiplierr_   r=   r%   r&   r1   f   s   

zGraniteMoeDecoderLayer.__init__NrS   attention_maskpast_key_valuescache_positionposition_embeddingsreturnc           	      K   sf   |}|  |}| jd|||||d|\}}||| j  }|}| |}| |}||| j  }|S )N)rS   rk   rl   rm   rn   r%   )rg   rd   rj   rh   re   )	r<   rS   rk   rl   rm   rn   kwargsresidualrN   r%   r%   r&   rW   p   s"   	



zGraniteMoeDecoderLayer.forward)NNNN)r"   r#   r$   r   r`   r1   rF   Tensorr   
LongTensortuplerW   rY   r%   r%   r=   r&   ra   e   s&    ra   c                   @   sF   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZe dd Zd	S )
GraniteMoePreTrainedModelr,   modelTra   rl   Fc                 C   s4   t | | t|trtj|jd| jjd d S d S )Ng        )meanstd)	r   _init_weights
isinstancer)   initnormal_weightr,   initializer_range)r<   moduler%   r%   r&   ry      s   
z'GraniteMoePreTrainedModel._init_weightsN)r"   r#   r$   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphrF   no_gradry   r%   r%   r%   r&   ru      s   
 ru   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )GraniteMoeModelr,   c                    sJ   t    t fddt jD | _t j j	d| _
 j| _d S )Nc                    s   g | ]}t  |qS r%   )ra   ).0r[   r,   r%   r&   
<listcomp>   s    z,GraniteMoeModel.__init__.<locals>.<listcomp>rb   )r0   r1   r   
ModuleListrangenum_hidden_layerslayersr    r2   rf   normembedding_multiplierr;   r=   r   r&   r1      s   zGraniteMoeModel.__init__N	input_idsrk   position_idsrl   inputs_embeds	use_cacherm   rp   ro   c              
   K   s  |d u |d uA rt d|r|d u rt| jd}|d u r!| |}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| j|||||d}
|| j }|}| ||}| jd | jj D ]}||f||
||||d|}qg| |}t||dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )rB   )r,   r   rk   rm   rl   r   )rn   rk   r   rl   r   rm   )last_hidden_staterl   )
ValueErrorr   r,   embed_tokensget_seq_lengthrF   arangeshaperB   	unsqueezer   r   
rotary_embr   r   r   r
   )r<   r   rk   r   rl   r   r   rm   rp   past_seen_tokenscausal_maskrS   rn   decoder_layerr%   r%   r&   rW      sT   



zGraniteMoeModel.forward)NNNNNNN)r"   r#   r$   r   r1   r   r   r   rF   rs   rr   r   FloatTensorboolr   r   r
   rW   rY   r%   r%   r=   r&   r      s>    	
r   c                       s   e Zd Zdef fddZee									ddejdB dej	dB dejdB d	e
dB d
ejdB dejdB dedB dejdB deej	B deeB fddZ  ZS )GraniteMoeForCausalLMr,   c                    s"   t  | t|| _|j| _d S r\   )r0   r1   r   rv   logits_scalingr;   r=   r%   r&   r1      s   
zGraniteMoeForCausalLM.__init__Nr   r   rk   r   rl   r   labelsoutput_router_logitsrm   logits_to_keepro   c
              	   K   s   |dur|n| j j}| jd||||||d|
}|j}t|	tr't|	 dn|	}| |dd|ddf }|| j j }d}|durQ| j	||fd| j j
i|
}d}|rnt|j| j| j|}|durn|| j||j 7 }t||||j|j|j|jdS )al  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeForCausalLM

        >>> model = GraniteMoeForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r   rk   r   rl   r   rm   
vocab_size)lossaux_losslogitsrl   rS   
attentionsrouter_logitsr%   )r,   r   rv   r   rz   r`   slicelm_headr   loss_functionr   r   r   r.   r9   router_aux_loss_coeftorB   r	   rl   rS   r   )r<   r   rk   r   rl   r   r   r   rm   r   rp   outputsrS   slice_indicesr   r   r   r%   r%   r&   rW      sZ   &zGraniteMoeForCausalLM.forward)	NNNNNNNNr   )r"   r#   r$   r   r1   r   r   rF   rs   rr   r   r   r   r`   rt   r	   rW   rY   r%   r%   r=   r&   r      sD    	
r   )r   r   ru   )7rF   r    r   r{   activationsr   cache_utilsr   r   masking_utilsr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   r   utils.output_capturingr   granite.modeling_graniter   r   jetmoe.modeling_jetmoer   r   llama.modeling_llamar   r   mixtral.modeling_mixtralr   r   r   r   configuration_granitemoer   r    r(   r)   r*   Moduler+   rZ   ra   ru   r   r   __all__r%   r%   r%   r&   <module>   s<   +&J^