o
    ei                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZmZ ddlm	Z
 ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ ddlm Z  e!e"Z#d)ddZ$G dd dej%Z&G dd dej%Z'G dd deZ(eG dd deZ)eG dd de)Z*eddG dd  d e)eZ+ed!dG d"d# d#e)Z,eG d$d% d%e)Z-eG d&d' d'e)Z.g d(Z/dS )*zPyTorch MPT model.    N)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )	MptConfig   c                 C   s   t jd| dt j|dddd|}dtt|  }t jd|d t j|d }|||  }dt 	d| }|d|dd}|| krjt j
|ddddddf |ddddddf gddddd| df }|| }|dS )	a  
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    r   )dtypedevice   g      ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopes r1   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensor*   s   $L
r3   c                       sj   e Zd ZdZddededB f fddZ			ddejdejd	e	dB d
ejdB dejdB f
ddZ
  ZS )MptAttentionzzMulti-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    Nconfig	layer_idxc                    s   t    |j| _|j| _|j| _| j| j | _|jj| _| jd u r.dt	
| j| j  | _|jj| _|jj| _tj| jd| j dd| _tj| j| jdd| _|| _d S )Nr   r   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler"   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projr6   )selfr5   r6   	__class__r1   r2   r:   F   s   





zMptAttention.__init__hidden_statesposition_biaspast_key_valuesattention_maskcache_positionc                 C   s   |j d d \}}| |}| jr|j| j | jd}|jddd\}	}
}|	||| j| jdd}	|
||| j| jdd}
|||| j| jdd}|d urbd|i}|	|
|| j
|\}
}t|	|
dd| j }|d u ru|n||  }|d urt|j dkrtd	t|j  |
j d }td
|d| }td
|d| }|d d |d |d f }|| }|d ur||t|	jj}tjj| dd|j}tjj|| j| jd}t||}|d
ddd  !||d}| "|}||fS )Nr   )minmaxr   r   r   rP   z6Expecting position_bias shape to be 3 dimensions, got r   ptraining)#shaperG   rE   clampchunkreshaper<   r?   	transposeupdater6   r   matmulrA   get_seq_lengthlen
ValueErrorrR   sizemasked_fillfinfor   rQ   r   r   softmaxr&   todropoutrD   rW   permute
contiguousr!   rH   )rI   rL   rM   rN   rO   rP   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statescache_kwargsattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputr1   r1   r2   forwardV   s:   


zMptAttention.forwardN)NNN)__name__
__module____qualname____doc__r   intr:   r   Tensorr	   ry   __classcell__r1   r1   rJ   r2   r4   A   s"    r4   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	MptMLPr5   c                    sX   t    |j}tj|d| dd| _tjdd| _tjd| |dd| _|j	j
| _d S )N   Fr7   none)approximate)r9   r:   r;   r   rF   up_projGELUact	down_projr@   rC   hidden_dropout)rI   r5   r;   rJ   r1   r2   r:      s   
zMptMLP.__init__rL   residualreturnc                 C   s:   |  | |}| |}tj|| j| jd}|| }|S )NrU   )r   r   r   Frg   r   rW   )rI   rL   r   intermediate_outputoutputr1   r1   r2   ry      s
   
zMptMLP.forward)	r{   r|   r}   r   r:   r   r   ry   r   r1   r1   rJ   r2   r      s    $	r   c                       sl   e Zd ZddededB f fddZ				ddejdejd	ejd
edB de	de	dejdB fddZ
  ZS )MptBlockNr5   r6   c                    sz   t    |j}t||jd| _d | j_|j| _t	||| _
t||jd| _d | j_t|| _|jj| _t| j| _d S )Neps)r9   r:   r;   r   layer_norm_epsilonnorm_1r8   r<   r*   r4   attnnorm_2r   ffnr@   rC   dropout_rater   Dropoutresid_attn_dropout)rI   r5   r6   r;   rJ   r1   r2   r:      s   


zMptBlock.__init__FrL   rM   rO   
layer_past	use_cacheoutput_attentionsrP   c                 C   sV   |  |}|}	| j|||||d\}
}| |
|	 }| |}|}	| ||	}||fS )N)rM   rO   rN   rP   )r   r   r   r   r   )rI   rL   rM   rO   r   r   r   rP   layernorm_outputr   attn_outputsrv   r   r1   r1   r2   ry      s   


zMptBlock.forwardrz   )NFFN)r{   r|   r}   r   r   r:   r   r   r	   boolry   r   r1   r1   rJ   r2   r      s*    r   c                   @   s$   e Zd ZU eed< dZdZdgZdS )MptPreTrainedModelr5   transformerTr   N)r{   r|   r}   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr1   r1   r1   r2   r      s
   
 
r   c                       s   e Zd Zdef fddZdd Zddd	Zd
ejfddZ	e
									ddejdB dedB dejdB dejdB dedB dedB dedB dedB dejdB deejdf eB fddZ  ZS )MptModelr5   c                    sz   t     j| _ j| _t j| j| _t	 fddt
 jD | _t| j jd| _d | j_d| _|   d S )Nc                    s   g | ]}t  |d qS ))r6   )r   ).0ir5   r1   r2   
<listcomp>   s    z%MptModel.__init__.<locals>.<listcomp>r   F)r9   r:   r;   r<   r*   r   	Embedding
vocab_sizewte
ModuleListrangen_layersblocksr   r   norm_fr8   gradient_checkpointing	post_initrI   r5   rJ   r   r2   r:      s    zMptModel.__init__c                 C   s   | j S rz   r   )rI   r1   r1   r2   get_input_embeddings   s   zMptModel.get_input_embeddingsr   Nc                 C   s   t ||||S rz   )r3   )rI   r*   r+   r,   r   r1   r1   r2   r3      s   zMptModel.build_mpt_alibi_tensornew_embeddingsc                 C   
   || _ d S rz   r   rI   r   r1   r1   r2   set_input_embeddings      
zMptModel.set_input_embeddings	input_idsrN   rO   inputs_embedsr   r   output_hidden_statesreturn_dictrP   r   .c
              
   K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|dur4|dur4td|dur>|j\}}n|durI|j\}}}ntd| jr\| jr\|r\t	
d d}|du re| |}|rq|du rqt| j d}|}|rwdnd}|r}dnd}| j| j| j j|jd}|dur| nd	}|	du rtj||| |jd}	t| j |||	|d
tj}| jD ]!}|r||f }||||||||	d}|d	 }|r||d f }q| |}|r||f }|stdd ||||fD S t||||dS )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r1   r   r   )r5   r   rO   rP   rN   )r   rO   r   r   rM   rP   r   c                 s   s    | ]	}|d ur|V  qd S rz   r1   )r   vr1   r1   r2   	<genexpr>i  s    z#MptModel.forward.<locals>.<genexpr>)last_hidden_staterN   rL   
attentions)r5   r   r   r   use_return_dictra   rX   r   rW   loggerwarning_oncer   r
   r3   r*   r=   r   r_   r   r   r   rf   r   r   r   tupler   )rI   r   rN   rO   r   r   r   r   r   rP   kwargsrj   rk   _rL   all_self_attentionsall_hidden_statesr-   past_key_values_lengthcausal_maskblockoutputsr1   r1   r2   ry     s   






zMptModel.forwardr   N	NNNNNNNNN)r{   r|   r}   r   r:   r   r3   r   r   r   r   
LongTensorr	   r   r   r   ry   r   r1   r1   rJ   r2   r      sH    
	
r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       s   e Zd ZddiZdef fddZdejfddZe																						
ddej
d	B ded	B dejd	B dejd	B dejd	B ded	B ded	B ded	B ded	B dejd	B deejB deej eB fddZ  ZS )MptForCausalLMzlm_head.weightztransformer.wte.weightr5   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFr7   )
r9   r:   r   r   r   rF   r;   r   lm_headr   r   rJ   r1   r2   r:   ~  s   
zMptForCausalLM.__init__r   c                 C   r   rz   )r   r   r1   r1   r2   set_output_embeddings  r   z$MptForCausalLM.set_output_embeddingsNr   r   rN   rO   r   labelsr   r   r   r   rP   logits_to_keepr   c                 K   s   |	dur|	n| j j}	| j||||||||	|
d	}|d }t|tr't| dn|}| |dd|ddf }d}|durK| jd||| j jd|}|	sa|f|dd  }|dur_|f| S |S t	|||j
|j|jdS )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)rN   rO   r   r   r   r   r   rP   r   )logitsr   r   r   lossr   rN   rL   r   r1   )r5   r   r   
isinstancer   slicer   loss_functionr   r   rN   rL   r   )rI   r   rN   rO   r   r   r   r   r   r   rP   r   r   transformer_outputsrL   slice_indicesr   r   r   r1   r1   r2   ry     s8   !zMptForCausalLM.forward)NNNNNNNNNNr   )r{   r|   r}   _tied_weights_keysr   r:   r   r   r   r   r   r	   r   r   r   r   ry   r   r1   r1   rJ   r2   r   u  sR    	
r   a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                       s   e Zd Zdef fddZdejfddZe									ddej	dB d	e
dB d
ejdB dejdB dejdB dedB dedB dedB dedB deej eB fddZ  ZS )MptForSequenceClassificationr5   c                    s@   t  | |j| _t|| _tj|j|jdd| _| 	  d S r   )
r9   r:   
num_labelsr   r   r   rF   r;   scorer   r   rJ   r1   r2   r:     s
   
z%MptForSequenceClassification.__init__r   c                 C   r   rz   )r   r   r1   r1   r2   r     r   z2MptForSequenceClassification.set_output_embeddingsNr   rN   rO   r   r   r   r   r   r   r   c
              
   K   s$  |	dur|	n| j j}	| j||||||||	d}|d }| |}|dur*|jd }n|jd }| j jdu r=|dkr=td| j jdu rFd}n1|durk|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur| j jdu r| jdkrd
| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jd
krt }| jdkr|| | }n#|||}n| j jdkrt }|||}n| j jdkrt }|||}|	s|f|dd  }|dur|f| S |S t|||j|j|jdS )6  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrN   rO   r   r   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rS   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr   )r5   r   r   r   rX   pad_token_idra   rf   r   r   r    r   argmaxr   r   rK   r{   problem_typer   r   longr   r   r)   r   r   r   rN   rL   r   )rI   r   rN   rO   r   r   r   r   r   r   r   r   rL   r   rj   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctr   r1   r1   r2   ry     st   



"


z$MptForSequenceClassification.forwardr   )r{   r|   r}   r   r:   r   r   r   r   r   r	   r   r   r   ry   r   r1   r1   rJ   r2   r     sD    		
r   c                       s   e Zd Zdef fddZe									ddejdB dedB dej	dB dej	dB d	ej	dB d
e
dB de
dB de
dB de
dB deej	 eB fddZ  ZS )MptForTokenClassificationr5   c                    s   t  | |j| _t|| _t|dr|jd ur|j}nt|dr+|jd ur+|j}nd}t	|| _
t|j|j| _|   d S )Nclassifier_dropoutr   g?)r9   r:   r   r   r   hasattrr   r   r   r   rg   rF   r;   
classifierr   )rI   r5   r   rJ   r1   r2   r:   T  s   
z"MptForTokenClassification.__init__Nr   rN   rO   r   r   r   r   r   r   r   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|durJ||j}|j\}}t }||	|| | j
|	|| }|	s`|f|dd  }|dur^|f| S |S t|||j|jdS )r   Nr   r   r   )r   r   rL   r   )r5   r   r   rg   r   rf   r   rX   r   r!   r   r   rL   r   )rI   r   rN   rO   r   r   r   r   r   r   deprecated_argumentsr   rL   r   r   rj   rk   r   r   r1   r1   r2   ry   e  s>   


z!MptForTokenClassification.forwardr   )r{   r|   r}   r   r:   r   r   r   r	   r   r   r   r   ry   r   r1   r1   rJ   r2   r   R  sB    	
r   c                       s   e Zd Z fddZe								ddejdB dejdB dejdB dejdB dejdB d	edB d
edB dedB de	e
B fddZ  ZS )MptForQuestionAnsweringc                    s2   t  | t|| _t|jd| _|   d S )Nr   )	r9   r:   r   r   r   rF   r;   
qa_outputsr   r   rJ   r1   r2   r:     s   
z MptForQuestionAnswering.__init__Nr   rO   r   start_positionsend_positionsr   r   r   r   c	                 K   sB  |dur|n| j j}| j||||||d}
|
d }| |}|jddd\}}|d }|d }d}|dur|durt| dkrL|d}t| dkrY|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|
dd  }|dur|f| S |S t||||
j|
jd	S )
r   N)rO   r   r   r   r   r   r   rS   r   )ignore_indexr   )r   start_logits
end_logitsrL   r   )r5   r   r   r   splitr)   ri   r`   rb   rY   r   r   rL   r   )rI   r   rO   r   r   r   r   r   r   r   r   sequence_outputr   r  r  
total_lossignored_indexr   
start_lossend_lossr   r1   r1   r2   ry     sJ   	






zMptForQuestionAnswering.forward)NNNNNNNN)r{   r|   r}   r:   r   r   r   FloatTensorr   r   r   ry   r   r1   r1   rJ   r2   r     s<    	r   )r   r   r   r   r   r   r   )0r~   r"   r   r   torch.nnr   r   r   r   r   r   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_mptr   
get_loggerr{   r   r3   Moduler4   r   r   r   r   r   r   r   r   __all__r1   r1   r1   r2   <module>   sJ   

I: SvXS