o
    wiG                     @   s  d Z ddlZddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z! e"e#Z$d)ddZ%G dd dej&Z'G dd dej&Z(G dd deZ)eG dd deZ*eG dd de*Z+eddG dd  d e*eZ,ed!dG d"d# d#e*Z-eG d$d% d%e*Z.eG d&d' d'e*Z/g d(Z0dS )*zPyTorch MPT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )GenerationMixin)!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )	MptConfig   c                 C   s   t jd| dt j|dddd|}dtt|  }t jd|d t j|d }|||  }dt 	d| }|d|dd}|| krjt j
|ddddddf |ddddddf gddddd| df }|| }|dS )	a  
    Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    r   )dtypedevice         ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopes r2   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensor,   s   $L
r4   c                
       sZ   e Zd ZdZdef fddZ		ddejdejdee	ej  d	eej fd
dZ
  ZS )MptAttentionzzMulti-head self attention.
    Using torch or triton attention implementation enables user to also use additive bias.
    configc                    s   t    |j| _|j| _|j| _| j| j | _|jj| _| jd u r.dt	
| j| j  | _|jj| _|jj| _tj| jd| j dd| _tj| j| jdd| _d S )Nr   r
   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler#   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projselfr6   	__class__r2   r3   r:   H   s   




zMptAttention.__init__Nhidden_statesposition_biaspast_key_valueattention_maskc                 C   s4  |j d d \}}| |}| jr|j| j | jd}|jddd\}}	}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
|d urtt	|dkrot
j|d |	gdd}	t
j|d |
gdd}
|	|
f}n|	|
f}t
||	dd| j }|d u r|n||d j d  }|d urt	|j dkrtd	t	|j  |	j d }td|d| }td|d| }|d d |d |d f }|| }|d ur||t
|jj}tjj| dd|
j}tjj|| j| jd
}t
||
}|dddd ||d}|  |}|||fS )Nr   )minmaxr
   r   r   r   z6Expecting position_bias shape to be 3 dimensions, got ptraining)!shaperG   rE   clampchunkreshaper<   r?   	transposelenr   catmatmulrA   
ValueErrorrR   sizemasked_fillfinfor   rQ   r   r	   softmaxr'   todropoutrD   rW   permute
contiguousr"   rH   )rJ   rM   rN   rO   rP   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statesattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputr2   r2   r3   forwardW   s@   




zMptAttention.forward)NN)__name__
__module____qualname____doc__r   r:   r   Tensorr   tuplerw   __classcell__r2   r2   rK   r3   r5   C   s    r5   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	MptMLPr6   c                    sX   t    |j}tj|d| dd| _tjdd| _tjd| |dd| _|j	j
| _d S )N   Fr7   none)approximate)r9   r:   r;   r   rF   up_projGELUact	down_projr@   rC   hidden_dropoutrJ   r6   r;   rK   r2   r3   r:      s   
zMptMLP.__init__rM   residualreturnc                 C   s:   |  | |}| |}tj|| j| jd}|| }|S )NrU   )r   r   r   Frf   r   rW   )rJ   rM   r   intermediate_outputoutputr2   r2   r3   rw      s
   
zMptMLP.forward)	rx   ry   rz   r   r:   r   r|   rw   r~   r2   r2   rK   r3   r      s    $	r   c                       sb   e Zd Zdef fddZ			ddejdejdejd	eeejejf  d
e	de	fddZ
  ZS )MptBlockr6   c                    sx   t    |j}t||jd| _d | j_|j| _t	|| _
t||jd| _d | j_t|| _|jj| _t| j| _d S )Neps)r9   r:   r;   r   layer_norm_epsilonnorm_1r8   r<   r+   r5   attnnorm_2r   ffnr@   rC   dropout_rater   Dropoutresid_attn_dropoutr   rK   r2   r3   r:      s   



zMptBlock.__init__NFrM   rN   rP   
layer_past	use_cacheoutput_attentionsc                 C   st   |  |}|}| j||||d\}	}
}| |	| }| |}|}| ||}|f}|r1||f7 }|r8||
f7 }|S )N)rN   rP   rO   )r   r   r   r   r   )rJ   rM   rN   rP   r   r   r   layernorm_outputr   attn_outputsrt   rO   r   outputsr2   r2   r3   rw      s$   



zMptBlock.forward)NFF)rx   ry   rz   r   r:   r   r|   r   r}   boolrw   r~   r2   r2   rK   r3   r      s$    r   c                       sz   e Zd ZeZdZdZdgZdgZ fddZ	de
jfdd	Zed
eeejejf  deeejejf  fddZ  ZS )MptPreTrainedModeltransformerTr   z
lm_head.*.c                    s   t  j|i | d S N)r9   r:   )rJ   inputskwargsrK   r2   r3   r:      s   zMptPreTrainedModel.__init__modulec                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tr\|jdurS|jj	  |jjd dS dS )zInitialize the weights.g        )meanstdNr   )
isinstancer   rF   weightdatanormal_r6   initializer_ranger8   zero_	Embeddingpadding_idxr   fill_)rJ   r   r2   r2   r3   _init_weights   s   



z MptPreTrainedModel._init_weightsrO   r   c                    s8   | d d j \}}||  t fdd| D S )zw
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        r   c                 3   s4    | ]}|d    |d   fV  qdS r   r   N)r[   .0r   batch_size_times_num_headsr?   rj   r2   r3   	<genexpr>  s    
z;MptPreTrainedModel._convert_to_mpt_cache.<locals>.<genexpr>)rX   r}   )rO   ri   r+   r2   r   r3   _convert_to_mpt_cache   s
   z(MptPreTrainedModel._convert_to_mpt_cache)rx   ry   rz   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_keys_to_ignore_on_load_missingr:   r   Moduler   staticmethodr}   r   r|   r   r~   r2   r2   rK   r3   r      s    r   c                       s   e Zd Zdef fddZdd Zddd	Zd
ejfddZ	e
								ddeej deeeejejf df  deej deej dee dee dee dee deeejdf ef fddZ  ZS )MptModelr6   c                    sz   t     j| _ j| _t j| j| _t	 fddt
 jD | _t| j jd| _d | j_d| _|   d S )Nc                    s   g | ]}t  qS r2   )r   )r   _r6   r2   r3   
<listcomp>   s    z%MptModel.__init__.<locals>.<listcomp>r   F)r9   r:   r;   r<   r+   r   r   
vocab_sizewte
ModuleListrangen_layersblocksr   r   norm_fr8   gradient_checkpointing	post_initrI   rK   r   r3   r:     s    zMptModel.__init__c                 C      | j S r   r   rJ   r2   r2   r3   get_input_embeddings,     zMptModel.get_input_embeddingsr   Nc                 C   s   t ||||S r   )r4   )rJ   r+   r,   r-   r   r2   r2   r3   r4   /  s   zMptModel.build_mpt_alibi_tensornew_embeddingsc                 C   
   || _ d S r   r   rJ   r   r2   r2   r3   set_input_embeddings2     
zMptModel.set_input_embeddings	input_idspast_key_values.rP   inputs_embedsr   r   output_hidden_statesreturn_dictr   c	              	   K   sT  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|dur4|dur4td|dur>|j\}
}n|durI|j\}
}}ntd|du r[tdgt| j	 }|du rd| 
|}|}|rjdnd}|rpdnd}|rvdnd}| jr| jr|rtd d}|}d}|d dur|d d jd }|| }|du rtj|
|f|jd	}n||j}| j| j| j j|jd	}t||
|f||}| }t| j	|D ]2\}}|r||f }|||||||d
}|d }|du r||d f }|r|||rdnd f }q| |}|r||f }|s"tdd ||||fD S t||||dS )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr2   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   )r   rP   r   r   rN   Tr   c                 s   s    | ]	}|d ur|V  qd S r   r2   )r   vr2   r2   r3   r     s    z#MptModel.forward.<locals>.<genexpr>)last_hidden_stater   rM   
attentions)r6   r   r   r   use_return_dictr`   rX   r}   r]   r   r   r   rW   loggerwarning_oncer   onesr   re   r4   r+   r=   r   r   zipr   r   )rJ   r   r   rP   r   r   r   r   r   r   ri   rj   r   rM   presentsall_self_attentionsall_hidden_statesseq_length_with_pastpast_key_values_lengthr.   causal_maskblockr   r   r2   r2   r3   rw   5  s   

	

zMptModel.forwardr   NNNNNNNNN)rx   ry   rz   r   r:   r   r4   r   r|   r   r   r   
LongTensorr}   r   r   r   rw   r~   r2   r2   rK   r3   r     sB    
	r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       s  e Zd ZdgZdef fddZdd Zdejfdd	Z	e
	
	
	
	
	
	
	
	
	
ddeej deeeejejf df  deej deej deej dee dee dee dee deeej ef fddZdeeejejf df dejdeeejejf df fddZ  ZS )MptForCausalLMzlm_head.weightr6   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFr7   )
r9   r:   r   r   r   rF   r;   r   lm_headr   rI   rK   r2   r3   r:     s   
zMptForCausalLM.__init__c                 C   r   r   r   r   r2   r2   r3   get_output_embeddings  r   z$MptForCausalLM.get_output_embeddingsr   c                 C   r   r   r   r   r2   r2   r3   set_output_embeddings  r   z$MptForCausalLM.set_output_embeddingsNr   r   .rP   r   labelsr   r   r   r   r   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}d}|dur:||j}| j||fd| j ji|
}|	sP|f|dd  }|durN|f| S |S t|||j	|j
|jdS )aZ  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        Nr   rP   r   r   r   r   r   r   r   r   losslogitsr   rM   r   )r6   r   r   r   re   r   loss_functionr   r   r   rM   r   )rJ   r   r   rP   r   r   r   r   r   r   r   transformer_outputsrM   	lm_logitsr   r   r2   r2   r3   rw     sD   

zMptForCausalLM.forwardpastbeam_idxc                    s,    fdd|D t fdd|D }|S )aL  
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        c                    s&   i | ]}|D ]
}|j  |j qqS r2   )r   re   )r   r   
past_state)r   r2   r3   
<dictcomp>  s
    z1MptForCausalLM._reorder_cache.<locals>.<dictcomp>c                 3   sD    | ]}|d   d  |d  j |d  d  |d  j fV  qdS r   )index_selectr   r   )device_to_beam_idxr2   r3   r     s    
z0MptForCausalLM._reorder_cache.<locals>.<genexpr>)r}   )rJ   r   r   reordered_pastr2   )r   r  r3   _reorder_cache  s   
zMptForCausalLM._reorder_cache	NNNNNNNNN)rx   ry   rz   _tied_weights_keysr   r:   r   r   r|   r   r   r   r   r}   r   r   r   rw   r  r~   r2   r2   rK   r3   r     sV    	
Fr   a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                          e Zd Zdef fddZe									ddeej dee	e	ej
ej
f df  deej
 d	eej
 d
eej
 dee dee dee dee dee	ej
 ef fddZ  ZS )MptForSequenceClassificationr6   c                    s@   t  | |j| _t|| _tj|j|jdd| _| 	  d S r   )
r9   r:   
num_labelsr   r   r   rF   r;   scorer   rI   rK   r2   r3   r:   .  s
   
z%MptForSequenceClassification.__init__Nr   r   .rP   r   r   r   r   r   r   r   c
              
   C   s$  |	dur|	n| j j}	| j||||||||	d}
|
d }| |}|dur*|jd }n|jd }| j jdu r=|dkr=td| j jdu rFd}n1|durk|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur| j jdu r| jdkrd
| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jd
krt }| jdkr|| | }n#|||}n| j jdkrt }|||}n| j jdkrt }|||}|	s|f|
dd  }|dur|f| S |S t|||
j|
j|
jdS )4  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rS   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr   )r6   r   r   r  rX   pad_token_idr`   re   r   r   r!   r    argmaxr   r   rL   rx   problem_typer  r   longintr   r*   r   r   r   r   rM   r   )rJ   r   r   rP   r   r   r   r   r   r   r   rM   r   ri   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctr   r2   r2   r3   rw   7  st   



"


z$MptForSequenceClassification.forwardr  )rx   ry   rz   r   r:   r   r   r   r   r}   r|   r   r   r   rw   r~   r2   r2   rK   r3   r
    sB    		
r
  c                       r	  )MptForTokenClassificationr6   c                    s   t  | |j| _t|| _t|dr|jd ur|j}nt|dr+|jd ur+|j}nd}t	|| _
t|j|j| _|   d S )Nclassifier_dropoutr   g?)r9   r:   r  r   r   hasattrr  r   r   r   rf   rF   r;   
classifierr   )rJ   r6   r  rK   r2   r3   r:     s   
z"MptForTokenClassification.__init__Nr   r   .rP   r   r   r   r   r   r   r   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|durJ||j}|j\}}t }||	|| | j
|	|| }|	s`|f|dd  }|dur^|f| S |S t|||j|jdS )r  Nr   r   r   )r   r   rM   r   )r6   r   r   rf   r  re   r   rX   r   r"   r  r   rM   r   )rJ   r   r   rP   r   r   r   r   r   r   deprecated_argumentsr   rM   r   r   ri   rj   r  r   r2   r2   r3   rw     s>   


z!MptForTokenClassification.forwardr  )rx   ry   rz   r   r:   r   r   r   r   r}   r|   r   r   r   rw   r~   r2   r2   rK   r3   r    sB    	
r  c                       s   e Zd Z fddZe								ddeej deej deej deej deej d	ee	 d
ee	 dee	 de
eef fddZ  ZS )MptForQuestionAnsweringc                    s2   t  | t|| _t|jd| _|   d S )Nr   )	r9   r:   r   r   r   rF   r;   
qa_outputsr   rI   rK   r2   r3   r:     s   
z MptForQuestionAnswering.__init__Nr   rP   r   start_positionsend_positionsr   r   r   r   c	                 C   sB  |dur|n| j j}| j||||||d}	|	d }
| |
}|jddd\}}|d }|d }d}|dur|durt| dkrL|d}t| dkrY|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|	dd  }|dur|f| S |S t||||	j|	jd	S )
r   N)rP   r   r   r   r   r   r   rS   r   )ignore_indexr   )r   start_logits
end_logitsrM   r   )r6   r   r   r!  splitr*   rh   r]   ra   rY   r   r   rM   r   )rJ   r   rP   r   r"  r#  r   r   r   r   sequence_outputr   r%  r&  
total_lossignored_indexr  
start_lossend_lossr   r2   r2   r3   rw     sJ   	






zMptForQuestionAnswering.forwardr   )rx   ry   rz   r:   r   r   r   r   FloatTensorr   r   r}   r   rw   r~   r2   r2   rK   r3   r     s<    	

r   )r   r   r   r
  r  r   r   )1r{   r#   typingr   r   r   torch.utils.checkpointr   torch.nnr   r   r   r   r	   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_mptr   
get_loggerrx   r   r4   r   r5   r   r   r   r   r   r
  r  r   __all__r2   r2   r2   r3   <module>   sL   

L@/ prXR