o
    ib                     @   s  d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% e!&e'Z(d+ddZ)G dd dej*Z+G dd dej*Z,G dd deZ-e G dd deZ.e G dd de.Z/e dd G d!d" d"e.eZ0e d#d G d$d% d%e.Z1e G d&d' d'e.Z2e G d(d) d)e.Z3g d*Z4dS ),zPyTorch MPT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCache)GenerationMixin)!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )	MptConfig   c                 C   s   t jd| dt j|dddd|}dtt|  }t jd|d t j|d }|||  }dt 	d| }|d|dd}|| krjt j
|ddddddf |ddddddf gddddd| df }|| }|dS )	a  
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    r   )dtypedevice         ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopes r5   a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensor-   s   $L
r7   c                       sx   e Zd ZdZddedee f fddZeddd	d
			dde	j
de	j
dee dee	j
 dee	j
 f
ddZ  ZS )MptAttentionzzMulti-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    Nconfig	layer_idxc                    s   t    |j| _|j| _|j| _| j| j | _|jj| _| jd u r.dt	
| j| j  | _|jj| _|jj| _tj| jd| j dd| _tj| j| jdd| _|| _d S )Nr   r
   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler&   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projr:   )selfr9   r:   	__class__r5   r6   r>   I   s   





zMptAttention.__init__past_key_valuepast_key_values4.58new_nameversionhidden_statesposition_biasattention_maskcache_positionc                 C   s   |j d d \}}| |}| jr|j| j | jd}|jddd\}	}
}|	||| j| jdd}	|
||| j| jdd}
|||| j| jdd}|d urbd|i}|	|
|| j
|\}
}t|	|
dd| j }|d u ru|n||  }|d urt|j dkrtd	t|j  |
j d }td
|d| }td
|d| }|d d |d |d f }|| }|d ur||t|	jj}tjj| dd|j}tjj|| j| jd}t||}|d
ddd  !||d}| "|}||fS )Nr   )minmaxr
   r    r   rY   z6Expecting position_bias shape to be 3 dimensions, got r   ptraining)#shaperK   rI   clampchunkreshaper@   rC   	transposeupdater:   r"   matmulrE   get_seq_lengthlen
ValueErrorr[   sizemasked_fillfinfor   rZ   r   r	   softmaxr*   todropoutrH   r`   permute
contiguousr%   rL   )rM   rV   rW   rQ   rX   rY   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statescache_kwargsattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputr5   r5   r6   forwardY   s:   	


zMptAttention.forwardN)NNN)__name__
__module____qualname____doc__r   r   intr>   r   r"   Tensorr   r   __classcell__r5   r5   rN   r6   r8   D   s$    r8   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	MptMLPr9   c                    sX   t    |j}tj|d| dd| _tjdd| _tjd| |dd| _|j	j
| _d S )N   Fr;   none)approximate)r=   r>   r?   r   rJ   up_projGELUact	down_projrD   rG   hidden_dropout)rM   r9   r?   rN   r5   r6   r>      s   
zMptMLP.__init__rV   residualreturnc                 C   s:   |  | |}| |}tj|| j| jd}|| }|S )Nr^   )r   r   r   Frp   r   r`   )rM   rV   r   intermediate_outputoutputr5   r5   r6   r      s
   
zMptMLP.forward)	r   r   r   r   r>   r"   r   r   r   r5   r5   rN   r6   r      s    $	r   c                       sl   e Zd Zddedee f fddZ				ddejdejd	ejd
ee	 de
de
deej fddZ  ZS )MptBlockNr9   r:   c                    sz   t    |j}t||jd| _d | j_|j| _t	||| _
t||jd| _d | j_t|| _|jj| _t| j| _d S )Neps)r=   r>   r?   r   layer_norm_epsilonnorm_1r<   r@   r.   r8   attnnorm_2r   ffnrD   rG   dropout_rater   Dropoutresid_attn_dropout)rM   r9   r:   r?   rN   r5   r6   r>      s   


zMptBlock.__init__FrV   rW   rX   
layer_past	use_cacheoutput_attentionsrY   c                 C   sV   |  |}|}	| j|||||d\}
}| |
|	 }| |}|}	| ||	}||fS )N)rW   rX   rQ   rY   )r   r   r   r   r   )rM   rV   rW   rX   r   r   r   rY   layernorm_outputr   attn_outputsr   r   r5   r5   r6   r      s   


zMptBlock.forwardr   )NFFN)r   r   r   r   r   r   r>   r"   r   r   boolr   r   r5   r5   rN   r6   r      s*    r   c                	       s   e Zd ZU eed< dZdZdgZdgZ fddZ	de
jfd	d
Zeedddddeeejejf  deeejejf  fddZ  ZS )MptPreTrainedModelr9   transformerTr   z
lm_head.*.c                    s   t  j|i | d S r   )r=   r>   )rM   inputskwargsrN   r5   r6   r>      s   zMptPreTrainedModel.__init__modulec                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tr\|jdurS|jj	  |jjd dS dS )zInitialize the weights.g        )meanstdNr   )
isinstancer   rJ   weightdatanormal_r9   initializer_ranger<   zero_	Embeddingpadding_idxr   fill_)rM   r   r5   r5   r6   _init_weights   s   



z MptPreTrainedModel._init_weightsrP   rQ   rR   rS   r   c                    s8   | d d j \}}||  t fdd| D S )zw
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        r   c                 3   s4    | ]}|d    |d   fV  qdS )r   r   N)rd   ).0r   batch_size_times_num_headsrC   rt   r5   r6   	<genexpr>  s    
z;MptPreTrainedModel._convert_to_mpt_cache.<locals>.<genexpr>)ra   tuple)rQ   rs   r.   r5   r   r6   _convert_to_mpt_cache   s
   z(MptPreTrainedModel._convert_to_mpt_cache)r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_keys_to_ignore_on_load_missingr>   r   Moduler   staticmethodr   r   r"   r   r   r   r5   r5   rN   r6   r      s   
 r   c                       s   e Zd Zdef fddZdd Zddd	Zd
ejfddZ	e
									ddeej dee deej deej dee dee dee dee deej deeejdf ef fddZ  ZS )MptModelr9   c                    sz   t     j| _ j| _t j| j| _t	 fddt
 jD | _t| j jd| _d | j_d| _|   d S )Nc                    s   g | ]}t  |d qS ))r:   )r   )r   ir9   r5   r6   
<listcomp>  s    z%MptModel.__init__.<locals>.<listcomp>r   F)r=   r>   r?   r@   r.   r   r   
vocab_sizewte
ModuleListrangen_layersblocksr   r   norm_fr<   gradient_checkpointing	post_initrM   r9   rN   r   r6   r>     s    zMptModel.__init__c                 C   s   | j S r   r   )rM   r5   r5   r6   get_input_embeddings&  s   zMptModel.get_input_embeddingsr   Nc                 C   s   t ||||S r   )r7   )rM   r.   r/   r0   r   r5   r5   r6   r7   )  s   zMptModel.build_mpt_alibi_tensornew_embeddingsc                 C   
   || _ d S r   r   rM   r   r5   r5   r6   set_input_embeddings,     
zMptModel.set_input_embeddings	input_idsrQ   rX   inputs_embedsr   r   output_hidden_statesreturn_dictrY   r   .c
              
   K   s*  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|dur4|dur4td|dur>|j\}}n|durI|j\}}}ntd| jr\| jr\|r\t	
d d}|du re| |}|rq|du rqt| j d}|rt|trt	
d t|}|}|rdnd}|rdnd}|dur| nd	}|| }|du rtj||f|jd
}n||j}| j| j| j j|jd
}t|||f||}| }| jD ]!}|r||f }||||||||	d}|d	 }|r||d f }q| |}|r||f }|stdd ||||fD S t||||dS )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r5   r   r   )r   rX   r   r   rW   rY   r   c                 s   s    | ]	}|d ur|V  qd S r   r5   )r   vr5   r5   r6   r     s    z#MptModel.forward.<locals>.<genexpr>)last_hidden_staterQ   rV   
attentions)r9   r   r   r   use_return_dictrj   ra   r   r`   loggerwarning_oncer   r   r   r   from_legacy_cacherh   r"   onesr   ro   r7   r.   rA   r   r   r   r   r   )rM   r   rQ   rX   r   r   r   r   r   rY   r   rs   rt   _rV   all_self_attentionsall_hidden_statespast_key_values_lengthseq_length_with_pastr1   causal_maskblockoutputsr5   r5   r6   r   /  s   







zMptModel.forwardr   N	NNNNNNNNN)r   r   r   r   r>   r   r7   r"   r   r   r   r   
LongTensorr   r   r   r   r   r   r   r5   r5   rN   r6   r     sH    
	
r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       s   e Zd ZdgZdef fddZdejfddZe											dd	e
ej d
e
e de
ej de
ej de
ej de
e de
e de
e de
e de
ej deeej ef fddZ  ZS )MptForCausalLMzlm_head.weightr9   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFr;   )
r=   r>   r   r   r   rJ   r?   r   lm_headr   r   rN   r5   r6   r>     s   
zMptForCausalLM.__init__r   c                 C   r   r   )r   r   r5   r5   r6   set_output_embeddings  r   z$MptForCausalLM.set_output_embeddingsNr   rQ   rX   r   labelsr   r   r   r   rY   r   c                 K   s   |	dur|	n| j j}	| j||||||||	|
d	}|d }| |}d}|dur;||j}| j||fd| j ji|}|	sQ|f|dd  }|durO|f| S |S t|||j	|j
|jdS )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)rQ   rX   r   r   r   r   r   rY   r   r   r   losslogitsrQ   rV   r   )r9   r   r   r   ro   r   loss_functionr   r   rQ   rV   r   )rM   r   rQ   rX   r   r   r   r   r   r   rY   r   transformer_outputsrV   	lm_logitsr   r   r5   r5   r6   r     sF    
zMptForCausalLM.forward)
NNNNNNNNNN)r   r   r   _tied_weights_keysr   r>   r"   r   r   r   r   r   r   r   r   r   r   r   r   r5   r5   rN   r6   r     sL    	
r   a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                          e Zd Zdef fddZe									ddeej dee	 deej
 deej
 d	eej
 d
ee dee dee dee deeej
 ef fddZ  ZS )MptForSequenceClassificationr9   c                    s@   t  | |j| _t|| _tj|j|jdd| _| 	  d S r   )
r=   r>   
num_labelsr   r   r   rJ   r?   scorer   r   rN   r5   r6   r>     s
   
z%MptForSequenceClassification.__init__Nr   rQ   rX   r   r   r   r   r   r   r   c
              
   C   s$  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}|dur*|jd }n|jd }| j jdu r=|dkr=td| j jdu rFd}n1|durk|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur| j jdu r| jdkrd
| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jd
krt }| jdkr|| | }n#|||}n| j jdkrt }|||}n| j jdkrt }|||}|	s|f|
dd  }|dur|f| S |S t|||
j|
j|
jdS )6  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrQ   rX   r   r   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r\   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr   )r9   r   r   r  ra   pad_token_idrj   ro   r   r"   r$   r#   argmaxr   r   rO   r   problem_typer  r   longr   r   r-   r   r   r   rQ   rV   r   )rM   r   rQ   rX   r   r   r   r   r   r   r   rV   r   rs   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctr   r5   r5   r6   r     st   



"


z$MptForSequenceClassification.forwardr   )r   r   r   r   r>   r   r   r"   r   r   r   r   r   r   r   r   r   r5   r5   rN   r6   r    sB    		
r  c                       r  )MptForTokenClassificationr9   c                    s   t  | |j| _t|| _t|dr|jd ur|j}nt|dr+|jd ur+|j}nd}t	|| _
t|j|j| _|   d S )Nclassifier_dropoutr   g?)r=   r>   r  r   r   hasattrr  r   r   r   rp   rJ   r?   
classifierr   )rM   r9   r  rN   r5   r6   r>     s   
z"MptForTokenClassification.__init__Nr   rQ   rX   r   r   r   r   r   r   r   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|durJ||j}|j\}}t }||	|| | j
|	|| }|	s`|f|dd  }|dur^|f| S |S t|||j|jdS )r  Nr  r   r   )r   r   rV   r   )r9   r   r   rp   r  ro   r   ra   r   r%   r  r   rV   r   )rM   r   rQ   rX   r   r   r   r   r   r   deprecated_argumentsr   rV   r   r   rs   rt   r  r   r5   r5   r6   r     s>   


z!MptForTokenClassification.forwardr   )r   r   r   r   r>   r   r   r"   r   r   r   r   r   r   r   r   r   r5   r5   rN   r6   r    sB    	
r  c                       s   e Zd Z fddZe								ddeej deej deej deej deej d	ee	 d
ee	 dee	 de
eef fddZ  ZS )MptForQuestionAnsweringc                    s2   t  | t|| _t|jd| _|   d S )Nr   )	r=   r>   r   r   r   rJ   r?   
qa_outputsr   r   rN   r5   r6   r>     s   
z MptForQuestionAnswering.__init__Nr   rX   r   start_positionsend_positionsr   r   r   r   c	                 C   sB  |dur|n| j j}| j||||||d}	|	d }
| |
}|jddd\}}|d }|d }d}|dur|durt| dkrL|d}t| dkrY|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|	dd  }|dur|f| S |S t||||	j|	jd	S )
r   N)rX   r   r   r   r   r   r   r\   r    )ignore_indexr   )r   start_logits
end_logitsrV   r   )r9   r   r   r  splitr-   rr   ri   rk   rb   r   r   rV   r   )rM   r   rX   r   r  r  r   r   r   r   sequence_outputr   r  r   
total_lossignored_indexr  
start_lossend_lossr   r5   r5   r6   r     sJ   	






zMptForQuestionAnswering.forward)NNNNNNNN)r   r   r   r>   r   r   r"   r   FloatTensorr   r   r   r   r   r   r5   r5   rN   r6   r    s<    	

r  )r   r   r   r  r  r  r   )5r   r&   typingr   r   r"   r   torch.nnr   r   r   r   r	   r   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_mptr   
get_loggerr   r   r7   r   r8   r   r   r   r   r   r  r  r  __all__r5   r5   r5   r6   <module>   sN   

J:0 XrXR