o
    iǼ                     @   s  d Z ddlmZmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ e& rddl,m-Z- ddl.m/Z/ e'0e1Z2G dd dej3Z4	d5dej5dej6dej6dej6deej6 de7de7fd d!Z8G d"d# d#ej5Z9G d$d% d%eZ:e$G d&d' d'eZ;G d(d) d)e;Z<e$G d*d+ d+e;Z=G d,d- d-e;eZ>e$d.d/G d0d1 d1e;Z?e$G d2d3 d3e;Z@g d4ZAdS )6zPyTorch OPT model.    )CallableOptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)deprecate_kwarg   )	OPTConfig)	BlockMask)make_flex_block_causal_maskc                       sR   e Zd ZdZdedef fddZ		ddejd	ed
eej f fddZ	  Z
S )OPTLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                    s   d| _ t || j  | d S N   )offsetsuper__init__)selfr#   r$   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/opt/modeling_opt.pyr)   8   s   z&OPTLearnedPositionalEmbedding.__init__r   Nattention_maskpast_key_values_lengthposition_idsc                    sL   |du rt j|dd}|| d  }|dd|df }t || j S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr(   forwardr'   )r*   r/   r0   r1   r+   r-   r.   r7   >   s
   z%OPTLearnedPositionalEmbedding.forward)r   N)__name__
__module____qualname____doc__intr)   r4   
LongTensorr   r7   __classcell__r-   r-   r+   r.   r"   3   s    	r"           modulequerykeyvaluer/   scalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )N)r3   dtypeptrainingr   r&   )r4   matmul	transposer   
functionalsoftmaxfloat32torH   rE   rK   
contiguous)
r@   rA   rB   rC   r/   rD   rE   kwargsattn_weightsattn_outputr-   r-   r.   eager_attention_forwardP   s   
rV   c                       s   e Zd ZdZ	ddedee f fddZeddd	d
					dde	j
dee dee	j
 dee	j
 dedee	j
 dee	j
ee	j
 ee f fddZ  ZS )OPTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    s  t    || _|j| _|j| _|j| _|j	| _	|| _
|d u r*td| jj d | j| j | _d| _| j| j | jkrJtd| j d| j d| jd | _tj| j| j| j	d| _tj| j| j| j	d| _tj| j| j| j	d| _tj| j| j| j	d| _d S )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r(   r)   rX   hidden_size	embed_dimnum_attention_heads	num_headsattention_dropoutrE   enable_biasrY   loggerwarning_oncer,   r8   head_dim	is_causal
ValueErrorrD   r   Lineark_projv_projq_projout_proj)r*   rX   rY   rS   r+   r-   r.   r)   j   s0   

zOPTAttention.__init__past_key_valuepast_key_values4.58new_nameversionFhidden_statesr/   layer_head_maskoutput_attentionscache_positionreturnc                 K   s  |  \}}	}
| || j }||d| j| jdd}| |}| |}||d| j| jdd}||d| j| jdd}|durT|	||| j
d|i\}}t}| jjdkrbt| jj }|| ||||f| jsndn| jdd	|\}}|||	d }| |}|sd}||fS )
z#Input shape: Batch x Time x ChannelrF   r   r&   Nru   eagerr?         ?)rE   rD   )sizerj   rD   viewr_   rd   rM   rh   ri   updaterY   rV   rX   _attn_implementationr   rK   rE   reshaperR   rk   )r*   rr   rm   r/   rs   rt   ru   rS   bsztgt_len_query_states
key_statesvalue_statesattention_interfacerU   rT   r-   r-   r.   r7      s>   



zOPTAttention.forwardN)NNNFN)r8   r9   r:   r;   r   r   r<   r)   r   r4   Tensorr   booltupler7   r>   r-   r-   r+   r.   rW   g   s:    #	rW   c                       s   e Zd Zddedee f fddZedddd					
	
		ddej	deej	 deej	 dee
 dee dee deej deej	 dee deejeeejejf  f fddZ  ZS )OPTDecoderLayerNrX   rY   c                    s   t    |j| _t||d| _|j| _|j| _t|j	 | _
tj| j|jd| _tj| j|j|jd| _tj|j| j|jd| _tj| j|jd| _d S )N)rX   rY   elementwise_affinerZ   )r(   r)   r\   r]   rW   	self_attndo_layer_norm_beforerE   r
   activation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normrg   ffn_dimra   fc1fc2final_layer_norm)r*   rX   rY   r+   r-   r.   r)      s   
zOPTDecoderLayer.__init__rl   rm   rn   ro   Frr   r/   rs   rt   	use_cacher1   ru   rS   rv   c	              
   K   s   |}
| j r
| |}| jd|||||||d|	\}}tjj|| j| jd}|
| }| j s3| |}|j}|d|	d}|}
| j rI| 
|}| |}| |}| |}tjj|| j| jd}|
| |}| j sr| 
|}|f}|r|||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence..
        )rr   rm   r1   r/   rs   rt   ru   rI   rF   Nr-   )r   r   r   r   rN   rE   rK   shaper}   ry   r   r   r   r   rz   )r*   rr   r/   rs   rm   rt   r   r1   ru   rS   residualself_attn_weightshidden_states_shapeoutputsr-   r-   r.   r7      sD   









zOPTDecoderLayer.forwardr   )NNNFFNN)r8   r9   r:   r   r   r<   r)   r   r4   r   r   r   r=   r   r   r   FloatTensorr7   r>   r-   r-   r+   r.   r      s>    	
r   c                   @   s@   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdZdd ZdS )OPTPreTrainedModelrX   modelTr   c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tjrX|jjd |jj	  d S d S )Nr?   )meanstdrx   )rX   init_std
isinstancer   rg   weightdatanormal_r[   zero_	Embeddingpadding_idxr   fill_)r*   r@   r   r-   r-   r.   _init_weights:  s   

z OPTPreTrainedModel._init_weightsN)r8   r9   r:   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   r-   r-   r-   r.   r   -  s   
 r   c                       s   e Zd ZdZdef fddZ	d deejdf dejd	ejd
e	de
f
ddZedejdededejd	ejdefddZe											d!deej deej deej d
ee	 deej dee
 dee
 dee
 dee
 deej d	eej dee deeef fddZ  ZS )"
OPTDecoderz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    rX   c                    s  t     j| _ j| _ j| _ j| _ j| _t	
 j j| j| _t j j| _ j jkr?t	j j jdd| _nd | _ j jkrTt	j j jdd| _nd | _ jrh jsht	j j jd| _nd | _t	 fddt jD | _d| _|   d S )NFrZ   r   c                    s   g | ]}t  |d qS ))rY   )r   ).0irX   r-   r.   
<listcomp>p  s    z'OPTDecoder.__init__.<locals>.<listcomp>)r(   r)   rE   	layerdroppad_token_idr   max_position_embeddingsmax_target_positions
vocab_sizer   r   word_embed_proj_dimembed_tokensr"   r\   embed_positionsrg   project_out
project_inr   _remove_final_layer_normr   r   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointing	post_initr*   rX   r+   r   r.   r)   Q  s,   
 zOPTDecoder.__init__Fr/   r    input_tensorru   rm   rt   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r?   flex_attentionr   Fsdpa)inputs_embedsr0   is_trainingr   rF   )sequence_lengthtarget_lengthrH   ru   
batch_size)cudaxpunpu)rX   r|   anyr   r4   r   r!   get_seq_lengthis_compileabler   _ignore_causal_mask_sdparK   rH   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiondevicetypefinfomin_unmask_unattended)r*   r/   r   ru   rm   rt   past_seen_tokensusing_compilable_cacherH   r   r   causal_mask	min_dtyper-   r-   r.   _update_causal_maskw  sT   




zOPTDecoder._update_causal_maskr   r   rH   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerH   r   r   )diagonalr   rF   r   )r3   r4   r   r   fullr   triuaranger}   expandcloner   rQ   masked_fill)r/   r   r   rH   ru   r   rS   r   r   mask_lengthpadding_maskr-   r-   r.   r     s,    $
6  z@OPTDecoder._prepare_4d_causal_attention_mask_with_cache_positionN	input_ids	head_maskr   r   output_hidden_statesreturn_dictr1   rS   rv   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|	dur$|	n| j j}	|du |duA r4td| jrC| jrC|rCt	d d}|durP|
d|jd }|du rY| |}|re|du ret| j d}|durm| nd}|du rtj|||jd  |jd	}|du r||jd  }tj|jd ||jd	}| |||||}|
du rtj|dd
}
|
| d  }
|
dd|df }
| j|||
d}| jdur| |}|||j }|rdnd}|rdnd}t|gdgD ]*\}}|dur| d t| jkrtd| dt| j d| d  dqt| jD ]E\}}|r'||f7 }| jr8tg }|| jk r8q||f||
|durF|| nd||||d|}|d }|r_||d f7 }q| j durl|  |}| j!durw| !|}|r||f7 }t"||||dS )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`. for padding use -1.

                [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
                the complete sequence length.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrF   r   r   r   r   r2   )r1   r-   r   zThe `z` should be specified for z layers, but it is for .)r/   r1   rs   rm   rt   r   ru   last_hidden_staterm   rr   
attentions)#rX   rt   r   r   use_return_dictrf   r   rK   rb   rc   rz   r   r   r   r   r4   r   r   onesr   r5   r6   r   r   rQ   zipry   lenr   	enumeraterandr   r   r   r   )r*   r   r/   r   rm   r   r   rt   r   r   r1   ru   rS   r   
seq_lengthr   
pos_embedsrr   all_hidden_statesall_self_attns	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputsr-   r-   r.   r7     s   G







	


zOPTDecoder.forward)FNNNNNNNNNNN)r8   r9   r:   r;   r   r)   r   r4   r   r   r   r   staticmethodr<   rH   r   r   r   r=   r   r   r   r   r   r7   r>   r-   r-   r+   r.   r   I  s    ,
D6	

r   c                       s   e Zd Zdef fddZdd Zdd Zee											dd	e	e
j d
e	e
j de	e
j de	e de	e
j de	e de	e de	e de	e de	e
j de	e
j dee deeef fddZ  ZS )OPTModelrX   c                    s"   t  | t|| _|   d S r   )r(   r)   r   decoderr   r   r+   r-   r.   r)     s   
zOPTModel.__init__c                 C      | j jS r   r  r   r*   r-   r-   r.   get_input_embeddings     zOPTModel.get_input_embeddingsc                 C      || j _d S r   r  r*   rC   r-   r-   r.   set_input_embeddings     zOPTModel.set_input_embeddingsNr   r/   r   rm   r   r   rt   r   r   r1   ru   rS   rv   c                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	| jd|||
||||||d|d|}t|j|j|j	|j
dS )NTr   r/   r1   r   rm   r   r   rt   r   r   ru   r   r-   )rX   rt   r   r   r   r  r   r   rm   rr   r   )r*   r   r/   r   rm   r   r   rt   r   r   r1   ru   rS   decoder_outputsr-   r-   r.   r7     s4   zOPTModel.forwardr  )r8   r9   r:   r   r)   r  r  r   r   r   r4   r=   r   r   r   r   r   r   r   r   r   r7   r>   r-   r-   r+   r.   r    sX    	

r  c                !       s   e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Ze	e
												ddeej deej deej dee deej deej dee dee dee dee deej deej dee deeef fddZ  ZS )OPTForCausalLMzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFrZ   )
r(   r)   r  r   r   rg   r   r   lm_headr   r   r+   r-   r.   r)     s   
zOPTForCausalLM.__init__c                 C   
   | j jjS r   r   r  r   r  r-   r-   r.   r       
z#OPTForCausalLM.get_input_embeddingsc                 C      || j j_d S r   r  r  r-   r-   r.   r       z#OPTForCausalLM.set_input_embeddingsc                 C   r  r   r   r  )r*   r  r-   r-   r.   set_decoder  r  zOPTForCausalLM.set_decoderc                 C   r  r   r#  r  r-   r-   r.   get_decoder  r  zOPTForCausalLM.get_decoderNr   r/   r   rm   r   labelsr   rt   r   r   r1   ru   rS   rv   c                 K   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
| jjd|||||||||	d|d|}| |d  }d}|durV||j	}| j
||fd| j ji|}t|||j|j|jdS )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```NTr  r   r   losslogitsrm   rr   r   r-   )rX   rt   r   r   r   r  r  rR   rQ   r   loss_functionr   r   rm   rr   r   )r*   r   r/   r   rm   r   r&  r   rt   r   r   r1   ru   rS   r   r)  r(  r-   r-   r.   r7     sL   )zOPTForCausalLM.forwardNNNNNNNNNNNN)r8   r9   r:   _tied_weights_keysr)   r  r  r$  r%  r   r   r   r4   r=   r   r   r   r   r   r   r   r   r   r7   r>   r-   r-   r+   r.   r    sd    
	

r  a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       s   e Zd Zdef fddZe											ddeej deej	 deej	 dee
 d	eej	 d
eej dee dee dee dee deej deeef fddZdd Zdd Z  ZS )OPTForSequenceClassificationrX   c                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r  )
r(   r)   
num_labelsr  r   r   rg   r   scorer   r   r+   r-   r.   r)   f  s
   
z%OPTForSequenceClassification.__init__Nr   r/   r   rm   r   r&  r   rt   r   r   r1   rv   c                 C   sL  |
dur|
n| j j}
| j|||||||||	|
d
}|d }| |}|dur0|jdd \}}n	|jdd \}}| j jdu rG|dkrGtd| j jdu rPd}n1|duru|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d	 |t	j||jd
|f }d}|dur| j jdu r| jdkrd| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jdkrt }| jdkr|| | }n,|||}n&| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	rm   r/   r1   r   r   r   rt   r   r   r   r&   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rF   )r   rH   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr'  )rX   r   r   r0  r   r   rf   rQ   r   r4   int32r   argmaxrb   rc   r,   r8   problem_typer/  rH   r6   r<   r   squeezer   rz   r   r   rm   rr   r   )r*   r   r/   r   rm   r   r&  r   rt   r   r   r1   transformer_outputsrr   r)  r   r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr(  loss_fctoutputr-   r-   r.   r7   o  sx   



"


z$OPTForSequenceClassification.forwardc                 C   r  r   r  r  r-   r-   r.   r    r   z1OPTForSequenceClassification.get_input_embeddingsc                 C   r!  r   r  r  r-   r-   r.   r    r"  z1OPTForSequenceClassification.set_input_embeddingsr  )r8   r9   r:   r   r)   r   r   r4   r=   r   r   r   r   r   r   r7   r  r  r>   r-   r-   r+   r.   r.  W  sR    		

^r.  c                       s   e Zd Zdef fddZe												ddeej deej	 deej	 dee
 d	eej	 d
eej deej dee dee dee dee deej deeef fddZdd Zdd Z  ZS )OPTForQuestionAnsweringrX   c                    s2   t  | t|| _t|jd| _|   d S r%   )	r(   r)   r  r   r   rg   r   
qa_outputsr   r   r+   r-   r.   r)     s   
z OPTForQuestionAnswering.__init__Nr   r/   r   rm   r   start_positionsend_positionsr   rt   r   r   r1   rv   c                 C   sZ  |dur|n| j j}| j||||||||	|
|d
}|d }| |}|jddd\}}|d }|d }d}|dur|durt| dkrP|d}t| dkr]|d}|d}|	d|
|j}|	d|
|j}t|d}|||}|||}|| d }|s||f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```Nr1  r   r   rF   r2   )ignore_indexr&   )r(  start_logits
end_logitsrr   r   )rX   r   r   rA  splitr8  rR   r   ry   clamprQ   r   r   r   rr   r   )r*   r   r/   r   rm   r   rB  rC  r   rt   r   r   r1   r9  rr   r)  rE  rF  
total_lossignored_indexr>  
start_lossend_lossr?  r-   r-   r.   r7     sR   0






zOPTForQuestionAnswering.forwardc                 C   r  r   r  r  r-   r-   r.   r  A  r   z,OPTForQuestionAnswering.get_input_embeddingsc                 C   r!  r   r  r  r-   r-   r.   r  D  r"  z,OPTForQuestionAnswering.set_input_embeddingsr+  )r8   r9   r:   r   r)   r   r   r4   r=   r   r   r   r   r   r   r7   r  r  r>   r-   r-   r+   r.   r@    sX    	

ar@  )r  r  r   r.  r@  )r?   )Br;   typingr   r   r   r4   r   torch.nnr   r   r   activationsr
   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   configuration_optr   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   
get_loggerr8   rb   r   r"   Moduler   floatrV   rW   r   r   r   r  r  r.  r@  __all__r-   r-   r-   r.   <module>   sn   
$
`f  d=npr