o
    wi                     @   s  d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* e' rddl+m,Z, ddl-m.Z. e(/e0Z1G dd dej2Z3	d6dej4dej5dej5dej5deej5 de6de6fdd Z7G d!d" d"ej4Z8G d#d$ d$eZ9e%G d%d& d&e Z:G d'd( d(e:Z;e%G d)d* d*e:Z<G d+d, d,ee$Z=G d-d. d.e:eZ>e%d/d0G d1d2 d2e:Z?e%G d3d4 d4e:Z@g d5ZAdS )7zPyTorch OPT model.    )CallableOptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPastQuestionAnsweringModelOutput SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )	OPTConfig)	BlockMask)make_flex_block_causal_maskc                       sR   e Zd ZdZdedef fddZ		ddejd	ed
eej f fddZ	  Z
S )OPTLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                    s   d| _ t || j  | d S N   )offsetsuper__init__)selfr"   r#   	__class__ a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/opt/modeling_opt.pyr(   8   s   z&OPTLearnedPositionalEmbedding.__init__r   Nattention_maskpast_key_values_lengthposition_idsc                    sL   |du rt j|dd}|| d  }|dd|df }t || j S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr'   forwardr&   )r)   r.   r/   r0   r*   r,   r-   r6   >   s
   z%OPTLearnedPositionalEmbedding.forwardr   N)__name__
__module____qualname____doc__intr(   r3   
LongTensorr   r6   __classcell__r,   r,   r*   r-   r!   3   s    	r!           modulequerykeyvaluer.   scalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )N)r2   dtypeptrainingr   r%   )r3   matmul	transposer   
functionalsoftmaxfloat32torH   rE   rK   
contiguous)
r@   rA   rB   rC   r.   rD   rE   kwargsattn_weightsattn_outputr,   r,   r-   eager_attention_forwardP   s   
rV   c                       s   e Zd ZdZ	ddedee f fddZ					ddej	d	ee
ej	  d
eej	 deej	 dedeej	 de
ej	eej	 ee f fddZ  ZS )OPTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNconfig	layer_idxc                    s  t    || _|j| _|j| _|j| _|j	| _	|| _
|d u r*td| jj d | j| j | _d| _| j| j | jkrJtd| j d| j d| jd | _tj| j| j| j	d| _tj| j| j| j	d| _tj| j| j| j	d| _tj| j| j| j	d| _d S )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩bias)r'   r(   rX   hidden_size	embed_dimnum_attention_heads	num_headsattention_dropoutrE   enable_biasrY   loggerwarning_oncer+   r8   head_dim	is_causal
ValueErrorrD   r   Lineark_projv_projq_projout_proj)r)   rX   rY   rS   r*   r,   r-   r(   j   s0   

zOPTAttention.__init__Fhidden_statespast_key_valuer.   layer_head_maskoutput_attentionscache_positionreturnc                 K   s<  |  \}}	}
| || j }||d| j| jdd}| |}| |}||d| j| jdd}||d| j| jdd}|durT|	||| j
d|i\}}t}| jjdkrp| jjdkrj|rjtd nt| jj }|| ||||f| js|d	n| jd
d|\}}|||	d }| |}|sd}|||fS )z#Input shape: Batch x Time x ChannelrF   r   r%   Nrp   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r?         ?)rE   rD   )sizerj   rD   viewr_   rd   rM   rh   ri   updaterY   rV   rX   _attn_implementationrb   rc   r   rK   rE   reshaperR   rk   )r)   rl   rm   r.   rn   ro   rp   rS   bsztgt_len_query_states
key_statesvalue_statesattention_interfacerU   rT   r,   r,   r-   r6      sF   




zOPTAttention.forwardN)NNNFN)r8   r9   r:   r;   r   r   r<   r(   r3   Tensortupleboolr   r6   r>   r,   r,   r*   r-   rW   g   s8    &	rW   c                       s   e Zd Zddedee f fddZ							ddejdeej d	eej d
ee	ej  dee
 dee
 deej deej dee de	ejee	ejejf  f fddZ  ZS )OPTDecoderLayerNrX   rY   c                    s   t    |j| _t||d| _|j| _|j| _t|j	 | _
tj| j|jd| _tj| j|j|jd| _tj|j| j|jd| _tj| j|jd| _d S )N)rX   rY   elementwise_affinerZ   )r'   r(   r\   r]   rW   	self_attndo_layer_norm_beforerE   r
   activation_functionactivation_fnr   	LayerNormlayer_norm_elementwise_affineself_attn_layer_normrg   ffn_dimra   fc1fc2final_layer_norm)r)   rX   rY   r*   r,   r-   r(      s   
zOPTDecoderLayer.__init__Frl   r.   rn   rm   ro   	use_cacher0   rp   rS   rq   c	              
   K   s  |}
| j r
| |}| jd|||||||d|	\}}}tjj|| j| jd}|
| }| j s4| |}|j}|d|	d}|}
| j rJ| 
|}| |}| |}| |}tjj|| j| jd}|
| |}| j ss| 
|}|f}|r}||f7 }|r||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence..
        )rl   rm   r0   r.   rn   ro   rp   rI   rF   Nr,   )r   r   r   r   rN   rE   rK   shapery   ru   r   r   r   r   rv   )r)   rl   r.   rn   rm   ro   r   r0   rp   rS   residualself_attn_weightspresent_key_valuehidden_states_shapeoutputsr,   r,   r-   r6      sH   









zOPTDecoderLayer.forwardr   )NNNFFNN)r8   r9   r:   r   r   r<   r(   r3   r   r   r   r=   r   r   FloatTensorr6   r>   r,   r,   r*   r-   r      s<    	
r   c                   @   sB   e Zd ZeZdZdZdgZdZdZ	dZ
dZdZdZdZdd ZdS )OPTPreTrainedModelmodelTr   c                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|tjrX|jjd |jj	  d S d S )Nr?   )meanstdrt   )rX   init_std
isinstancer   rg   weightdatanormal_r[   zero_	Embeddingpadding_idxr   fill_)r)   r@   r   r,   r,   r-   _init_weightsB  s   

z OPTPreTrainedModel._init_weightsN)r8   r9   r:   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cacher   r,   r,   r,   r-   r   4  s    r   c                       s6  e Zd ZdZdef fddZdd Zdd Z		d$d
ee	j
df de	j
de	j
dedef
ddZed
e	j
dedede	jde	j
defddZe											d%dee	j d
ee	j
 dee	j
 deee	j  dee	j dee dee dee dee dee	j dee	j
 d ee d!eeef fd"d#Z  ZS )&
OPTDecoderz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OPTDecoderLayer`]

    Args:
        config: OPTConfig
    rX   c                    s  t     j| _ j| _ j| _ j| _ j| _t	
 j j| j| _t j j| _ j jkr?t	j j jdd| _nd | _ j jkrTt	j j jdd| _nd | _ jrh jsht	j j jd| _nd | _t	 fddt jD | _d| _|   d S )NFrZ   r   c                    s   g | ]}t  |d qS ))rY   )r   ).0irX   r,   r-   
<listcomp>x  s    z'OPTDecoder.__init__.<locals>.<listcomp>)r'   r(   rE   	layerdroppad_token_idr   max_position_embeddingsmax_target_positions
vocab_sizer   r   word_embed_proj_dimembed_tokensr!   r\   embed_positionsrg   project_out
project_inr   _remove_final_layer_normr   r   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointing	post_initr)   rX   r*   r   r-   r(   Y  s,   
 zOPTDecoder.__init__c                 C      | j S r   r   r)   r,   r,   r-   get_input_embeddings~     zOPTDecoder.get_input_embeddingsc                 C   
   || _ d S r   r   r)   rC   r,   r,   r-   set_input_embeddings     
zOPTDecoder.set_input_embeddingsFr.   r   input_tensorrp   past_key_valuesro   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r?   flex_attentionr   Frs   )inputs_embedsr/   is_trainingr   rF   )sequence_lengthtarget_lengthrH   rp   
batch_size)cudaxpunpu)rX   rx   anyr   r3   r   r    get_seq_lengthis_compileabler   _ignore_causal_mask_sdparK   rH   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiondevicetypefinfomin_unmask_unattended)r)   r.   r   rp   r   ro   past_seen_tokensusing_compilable_cacherH   r   r   causal_mask	min_dtyper,   r,   r-   _update_causal_mask  sT   




zOPTDecoder._update_causal_maskr   r   rH   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerH   r   r   )diagonalr   rF   r   )r2   r3   r   r   fullr   triuarangery   expandcloner   rQ   masked_fill)r.   r   r   rH   rp   r   rS   r   r   mask_lengthpadding_maskr,   r,   r-   r     s,    $
6  z@OPTDecoder._prepare_4d_causal_attention_mask_with_cache_positionN	input_ids	head_maskr   r   output_hidden_statesreturn_dictr0   rS   rq   c                 K   s`  |dur|n| j j}|dur|n| j j}|dur|n| j j}|	dur$|	n| j j}	|du |duA r4td| jrC| jrC|rCt	d d}|durP|
d|jd }|du rY| |}d}|rrt|tsrd}t|}|du rrt	d |durz| nd}|du rtj|||jd	  |jd
}|du r||jd	  }tj|jd ||jd
}| |||||}|
du rtj|d	d}
|
| d	  }
|
dd|df }
| j|||
d}| jdur| |}|||j }|rdnd}|rdnd}d}t|gdgD ]*\}}|dur$| d t| jkr$td| dt| j d| d  dqt | jD ]Q\}}|r6||f7 }| jrGt!g }|| j"k rGq*||f||
|durU|| nd||||d|}|d }|rp||rmdnd	 }|rz||d	 f7 }q*| j#dur| #|}| j$dur| $|}|r||f7 }|r|nd}|r|% }t&||||dS )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
                config.n_positions - 1]`. for padding use -1.

                [What are position IDs?](../glossary#position-ids)
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
                this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
                the complete sequence length.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrF   TzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r   r   r   r1   )r0   r,   r   zThe `z` should be specified for z layers, but it is for .)r.   r0   rn   rm   ro   r   rp   r%   last_hidden_stater   rl   
attentions)'rX   ro   r   r   use_return_dictrf   r   rK   rb   rc   rv   r   r   r   r   r   from_legacy_cacher   r3   r   r   onesr   r4   r5   r   r   rQ   zipru   lenr   	enumeraterandr   r   r   to_legacy_cacher   )r)   r   r.   r   r   r   r   ro   r   r   r0   rp   rS   return_legacy_cacher   
seq_lengthr   
pos_embedsrl   all_hidden_statesall_self_attnsnext_decoder_cache	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputs
next_cacher,   r,   r-   r6     s   H








	


zOPTDecoder.forward)FNNNNNNNNNNN)r8   r9   r:   r;   r   r(   r   r   r   r3   r   r   r   r   staticmethodr<   rH   r   r   r   r=   listr   r   r   r   r   r6   r>   r,   r,   r*   r-   r   Q  s    %

D6	

r   c                       s   e Zd Zdef fddZdd Zdd Zdd	 Zee		
	
	
	
	
	
	
	
	
	
	
dde
ej de
ej de
ej de
eeej ef  de
ej de
e de
e de
e de
e de
ej de
ej dee deeef fddZ  ZS )OPTModelrX   c                    s"   t  | t|| _|   d S r   )r'   r(   r   decoderr   r   r*   r,   r-   r(     s   
zOPTModel.__init__c                 C      | j jS r   r  r   r   r,   r,   r-   r        zOPTModel.get_input_embeddingsc                 C      || j _d S r   r  r   r,   r,   r-   r        zOPTModel.set_input_embeddingsc                 C   r   r   )r  r   r,   r,   r-   get_decoder  r   zOPTModel.get_decoderNr   r.   r   r   r   r   ro   r   r   r0   rp   rS   rq   c                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	| jd|||
||||||d|d|}t|j|j|j	|j
dS )NTr   r.   r0   r   r   r   r   ro   r   r   rp   r  r,   )rX   ro   r   r   r  r  r   r  r   rl   r  )r)   r   r.   r   r   r   r   ro   r   r   r0   rp   rS   decoder_outputsr,   r,   r-   r6     s4   zOPTModel.forwardr  )r8   r9   r:   r   r(   r   r   r#  r   r   r   r3   r=   r   r   r  r   r   r   r   r   r   r   r6   r>   r,   r,   r*   r-   r    sZ    	

r  c                   @   s   e Zd ZdS )KwargsForCausalLMN)r8   r9   r:   r,   r,   r,   r-   r&    s    r&  c                !       s  e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
ee												d#deej deej deej deeeej ef  deej deej dee dee dee dee deej deej dee deeef fdd Zed!d" Z  ZS )$OPTForCausalLMzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFrZ   )
r'   r(   r  r   r   rg   r   r   lm_headr   r   r*   r,   r-   r(     s   
zOPTForCausalLM.__init__c                 C   
   | j jjS r   r   r  r   r   r,   r,   r-   r     r   z#OPTForCausalLM.get_input_embeddingsc                 C      || j j_d S r   r+  r   r,   r,   r-   r        z#OPTForCausalLM.set_input_embeddingsc                 C   r   r   r)  r   r,   r,   r-   get_output_embeddings!  r   z$OPTForCausalLM.get_output_embeddingsc                 C   r   r   r.  )r)   new_embeddingsr,   r,   r-   set_output_embeddings$  r   z$OPTForCausalLM.set_output_embeddingsc                 C   r!  r   r   r  )r)   r  r,   r,   r-   set_decoder'  r"  zOPTForCausalLM.set_decoderc                 C   r  r   r2  r   r,   r,   r-   r#  *  r   zOPTForCausalLM.get_decoderNr   r.   r   r   r   labelsr   ro   r   r   r0   rp   rS   rq   c                 K   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
| jjd|||||||||	d|d|}| |d  }d}|durV||j	}| j
||fd| j ji|}t|||j|j|jdS )an  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious. I'm just a little bit of a weirdo."
        ```NTr$  r   r   losslogitsr   rl   r  r,   )rX   ro   r   r  r   r  r)  rR   rQ   r   loss_functionr   r   r   rl   r  )r)   r   r.   r   r   r   r4  r   ro   r   r   r0   rp   rS   r   r7  r6  r,   r,   r-   r6   -  sL   )zOPTForCausalLM.forwardc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr,   c                 3   s$    | ]}| d  |jV  qdS r7   )index_selectrQ   r   )r   
past_statebeam_idxr,   r-   	<genexpr>  s   " z0OPTForCausalLM._reorder_cache.<locals>.<genexpr>)r   )r   r<  reordered_past
layer_pastr,   r;  r-   _reorder_cache  s   zOPTForCausalLM._reorder_cacheNNNNNNNNNNNN)r8   r9   r:   _tied_weights_keysr(   r   r   r/  r1  r3  r#  r   r   r   r3   r=   r   r   r  r   r   r   r   r&  r   r   r6   r  r@  r>   r,   r,   r*   r-   r'    sl    
	

Rr'  a  
    The OPT Model transformer with a sequence classification head on top (linear layer).

    [`OPTForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       s   e Zd Zdef fddZe											ddeej deej	 deej	 dee
eej	 ef  d	eej	 d
eej dee dee dee dee deej de
eef fddZdd Zdd Z  ZS )OPTForSequenceClassificationrX   c                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r(  )
r'   r(   
num_labelsr  r   r   rg   r   scorer   r   r*   r,   r-   r(     s
   
z%OPTForSequenceClassification.__init__Nr   r.   r   r   r   r4  r   ro   r   r   r0   rq   c                 C   sL  |
dur|
n| j j}
| j|||||||||	|
d
}|d }| |}|dur0|jdd \}}n	|jdd \}}| j jdu rG|dkrGtd| j jdu rPd}n1|duru|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d	 |t	j||jd
|f }d}|dur| j jdu r| jdkrd| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jdkrt }| jdkr|| | }n,|||}n&| j jdkrt }||d| j|d}n| j jdkrt }|||}|
s|f|dd  }|dur|f| S |S t|||j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N	r   r.   r0   r   r   r   ro   r   r   r   r%   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rF   )r   rH   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr5  )rX   r  r   rF  r   r   rf   rQ   r   r3   int32r   argmaxrb   rc   r+   r8   problem_typerE  rH   r5   r<   r   squeezer   rv   r   r   r   rl   r  )r)   r   r.   r   r   r   r4  r   ro   r   r   r0   transformer_outputsrl   r7  r   r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr6  loss_fctoutputr,   r,   r-   r6     sx   



"


z$OPTForSequenceClassification.forwardc                 C   r*  r   r+  r   r,   r,   r-   r     r   z1OPTForSequenceClassification.get_input_embeddingsc                 C   r,  r   r+  r   r,   r,   r-   r     r-  z1OPTForSequenceClassification.set_input_embeddingsr  )r8   r9   r:   r   r(   r   r   r3   r=   r   r   r  r   r   r   r   r6   r   r   r>   r,   r,   r*   r-   rD    sR    		

^rD  c                       s   e Zd Zdef fddZe												ddeej deej	 deej	 dee
eej	 ef  d	eej	 d
eej deej dee dee dee dee deej de
eef fddZdd Zdd Z  ZS )OPTForQuestionAnsweringrX   c                    s2   t  | t|| _t|jd| _|   d S r$   )	r'   r(   r  r   r   rg   r   
qa_outputsr   r   r*   r,   r-   r(     s   
z OPTForQuestionAnswering.__init__Nr   r.   r   r   r   start_positionsend_positionsr   ro   r   r   r0   rq   c                 C   sZ  |dur|n| j j}| j||||||||	|
|d
}|d }| |}|jddd\}}|d }|d }d}|dur|durt| dkrP|d}t| dkr]|d}|d}|	d|
|j}|	d|
|j}t|d}|||}|||}|| d }|s||f|dd  }|dur|f| S |S t||||j|jd	S )
a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OPTForQuestionAnswering
        >>> import torch

        >>> torch.manual_seed(4)  # doctest: +IGNORE_RESULT
        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

        >>> # note: we are loading a OPTForQuestionAnswering from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> model = OPTForQuestionAnswering.from_pretrained("facebook/opt-350m")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

        >>> inputs = tokenizer(question, text, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> answer_offset = len(tokenizer(question)[0])

        >>> predict_answer_tokens = inputs.input_ids[
        ...     0, answer_offset + answer_start_index : answer_offset + answer_end_index + 1
        ... ]
        >>> predicted = tokenizer.decode(predict_answer_tokens)
        >>> predicted
        ' a nice puppet'
        ```NrG  r   r   rF   r1   )ignore_indexr%   )r6  start_logits
end_logitsrl   r  )rX   r  r   rW  splitrN  rR   r  ru   clamprQ   r   r   r   rl   r  )r)   r   r.   r   r   r   rX  rY  r   ro   r   r   r0   rO  rl   r7  r[  r\  
total_lossignored_indexrT  
start_lossend_lossrU  r,   r,   r-   r6     sR   0






zOPTForQuestionAnswering.forwardc                 C   r*  r   r+  r   r,   r,   r-   r   u  r   z,OPTForQuestionAnswering.get_input_embeddingsc                 C   r,  r   r+  r   r,   r,   r-   r   x  r-  z,OPTForQuestionAnswering.set_input_embeddingsrA  )r8   r9   r:   r   r(   r   r   r3   r=   r   r   r  r   r   r   r   r6   r   r   r>   r,   r,   r*   r-   rV  	  sX    	

arV  )r'  r  r   rD  rV  )r?   )Br;   typingr   r   r   r3   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   configuration_optr   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr    
get_loggerr8   rb   r   r!   Moduler   floatrV   rW   r   r   r   r  r&  r'  rD  rV  __all__r,   r,   r,   r-   <module>   sp   
$
eh  {@}pr