o
    i                     @   s  d Z ddlZddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ e  rddl,m-Z-m.Z. G dd de)Z/G dd de'Z0G dd de%Z1G dd de&Z2eG dd deZ3eG dd  d e3Z4ed!d"G d#d$ d$e3eZ5eG d%d& d&e3Z6ed'd"G d(d) d)e3Z7g d*Z8dS )+zPyTorch BioGPT model.    N)OptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_flex_attn_availablelogger)deprecate_kwarg   )BartAttentionBartDecoderLayerBartScaledWordEmbedding)OPTLearnedPositionalEmbedding   )BioGptConfig)	BlockMaskmake_flex_block_causal_maskc                       s8   e Zd Z		ddejdedeej f fddZ  ZS )	 BioGptLearnedPositionalEmbeddingr   Nattention_maskpast_key_values_lengthposition_idsc                    s   t  ||| dS )z3`input_ids_shape` is expected to be [bsz x seqlen].N)superforward)selfr"   r#   r$   	__class__ f/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/biogpt/modular_biogpt.pyr&   ;   s   z(BioGptLearnedPositionalEmbedding.forward)r   N)	__name__
__module____qualname__torch
LongTensorintr   r&   __classcell__r*   r*   r(   r+   r!   :   s    r!   c                   @      e Zd ZdS )BioGptScaledWordEmbeddingNr,   r-   r.   r*   r*   r*   r+   r4   E       r4   c                   @   r3   )BioGptAttentionNr5   r*   r*   r*   r+   r7   I   r6   r7   c                       s   e Zd Zddedee f fddZedddd					
			ddej	deej	 deej	 dee
 dee dee deej deej	 dee deejeeejejf  f fddZ  ZS )BioGptDecoderLayerNconfig	layer_idxc              	      sv   t  | |j| _t| j|j|jdd||d| _|j| _	t
|j | _t| j|j| _t|j| j| _| `| `d S )NT)	embed_dim	num_headsdropout
is_decoder	is_causalr9   r:   )r%   __init__hidden_sizer;   r7   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probr=   r   
hidden_actactivation_fnnnLinearintermediate_sizefc1fc2encoder_attnencoder_attn_layer_norm)r'   r9   r:   r(   r*   r+   r@   N   s"   	zBioGptDecoderLayer.__init__past_key_valuepast_key_valuesz4.58)new_nameversionFThidden_statesr"   layer_head_maskoutput_attentions	use_cacher$   cache_positionkwargsreturnc	              
   K   s   |}
|  |}| jd|||||||d|	\}}tjj|| j| jd}|
| }|}
| |}| |}| |}tjj|| j	| jd}| 
|}tjj|| j| jd}|
| }|f}|rb||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        )rS   rP   r"   rT   rU   r$   rW   ptrainingNr*   )self_attn_layer_normrD   rH   
functionalr=   r\   final_layer_normrK   rG   activation_dropoutrL   )r'   rS   r"   rT   rP   rU   rV   r$   rW   rX   residualself_attn_weightsoutputsr*   r*   r+   r&   d   s6   







zBioGptDecoderLayer.forwardN)NNNFTNN)r,   r-   r.   r   r   r1   r@   r   r/   Tensorr	   boolr0   r   r   tupleFloatTensorr&   r2   r*   r*   r(   r+   r8   M   s>    	
r8   c                   @   s   e Zd ZU eed< dZdZdZdZdZ	dZ
deeejdf  dejdejdefd	d
ZedejdededejdejdefddZdS )BioGptPreTrainedModelr9   biogptTr"   r   input_tensorrW   rP   c                 C   sb  | j jdkr*t|tjrt|}|S |d u r(ttj|jd |jd f|jd}|S | j jdkr>|d ur<|dk	 r<|S d S |d urF|
 nd}|d urO|jnd}| j jdkre|setj|||| jd	red S |j}|jd }|rt| }	nt|tjr|jd
 n|| d }	| j|||	|||jd d}
| j jdkr|d ur|jjdv rt|j}t|
|}
|
S )Nflex_attentionr   r   )sizedeviceflash_attention_2g        Fsdpa)inputs_embedsr#   is_training)sequence_lengthtarget_lengthdtyperW   
batch_size)cudaxpunpu)r9   _attn_implementation
isinstancer/   re   r    onesshapern   anyget_seq_lengthis_compileabler   _ignore_causal_mask_sdpar\   rv   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiontypefinfomin_unmask_unattended)r'   r"   rk   rW   rP   past_seen_tokensusing_compilable_cacherv   rt   ru   causal_mask	min_dtyper*   r*   r+   _update_causal_mask   s`   





z)BioGptPreTrainedModel._update_causal_maskrt   ru   rv   rw   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerv   rn   r   )diagonalrn   rs   r   )dimr/   r   r   fullrn   triuarangereshapeexpandcloner~   tomasked_fill)r"   rt   ru   rv   rW   rw   rX   r   r   mask_lengthpadding_maskr*   r*   r+   r      s,    $
6  zKBioGptPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_positionN)r,   r-   r.   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   r   r/   re   r	   r   staticmethodr1   rv   r   r*   r*   r*   r+   ri      s>   
 
Lri   c                       s   e Zd Zdef fddZe											ddeej deej	 deej	 deej	 d	ee
 d
ee deej dee dee dee deej dee deeef fddZ  ZS )BioGptModelr9   c                    s   t     | _ j| _ j| _ j| _ j| _	 j
r"t jnd}t j| j| j	|d| _t j| j| _t fddt jD | _t| j| _d| _|   d S )Ng      ?)embed_scalec                    s   g | ]}t  |d qS ))r:   )r8   ).0ir9   r*   r+   
<listcomp>H  s    z(BioGptModel.__init__.<locals>.<listcomp>F)r%   r@   r9   	layerdroprE   r=   rA   r;   pad_token_idpadding_idxscale_embeddingmathsqrtr4   
vocab_sizeembed_tokensr!   max_position_embeddingsembed_positionsrH   
ModuleListrangenum_hidden_layerslayers	LayerNorm
layer_normgradient_checkpointing	post_init)r'   r9   r   r(   r   r+   r@   :  s    zBioGptModel.__init__N	input_idsr"   	head_maskrq   rP   rV   r$   rU   output_hidden_statesreturn_dictrW   rX   rY   c                 K   s:  |d ur|n| j j}|	d ur|	n| j j}	|d ur|n| j j}|
d ur$|
n| j j}
|d u |d uA r4td|d urF|}|j}|d|d }n|d ur^| d d }|d d d d df }ntd|d u rk| 	|}| j
rz| jrz|rztd d}|r|d u rt| j d}|rt|trtd t|}| d d \}}|d ur| nd}|d u rtj||| |jd	}|d u r|| }tj|||jd	}|}| ||||}|d u rtj|d
d}|| d
  }|d d |d f }| j|||d}|| }tjj|| j| jd}| j
r| jr|rtd d}|	r dnd }|r'dnd }d }t| jD ]E\}}|	r<||f7 }| jrMt g }|| j!k rMq0||f||d urZ|| nd |||||d|}|d }|rt||d
 f7 }q0|	r~||f7 }| "|}|
stdd |||||fD S t#|||||dS )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timers   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz[`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...Fr   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r   r   r   )r   )r$   rZ   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r*   )r"   rT   rP   rU   rV   r$   rW   c                 s   s    | ]	}|d ur|V  qd S rd   r*   )r   vr*   r*   r+   	<genexpr>  s    z&BioGptModel.forward.<locals>.<genexpr>)last_hidden_staterP   rS   
attentionscross_attentions)$r9   rU   r   rV   use_return_dict
ValueErrorr~   viewrm   r   r   r\   r   warning_oncer
   r|   rg   from_legacy_cacher   r/   r   rn   r}   r   cumsumlongr   rH   r^   r=   	enumerater   randr   r   r   )r'   r   r"   r   rq   rP   rV   r$   rU   r   r   rW   rX   inputinput_shaperw   
seq_lengthr#   mask_seq_lengthself_attn_cacher   	positionsrS   all_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerdropout_probabilitylayer_outputsr*   r*   r+   r&   O  s   



	

zBioGptModel.forward)NNNNNNNNNNN)r,   r-   r.   r   r@   r   r   r/   r0   rh   r	   rf   re   r   r   r   rg   r   r&   r2   r*   r*   r(   r+   r   8  sR    	

r   zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                        s   e Zd ZdgZ fddZdd Zdd Ze												dd	ee	j
 d
ee	j dee	j dee	j dee dee	j
 dee dee	j
 dee dee dee dee	j dee deeef fddZ  ZS )BioGptForCausalLMzoutput_projection.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S NF)bias)
r%   r@   r   rj   rH   rI   rA   r   output_projectionr   r'   r9   r(   r*   r+   r@     s   
zBioGptForCausalLM.__init__c                 C   s   | j S rd   r   r'   r*   r*   r+   get_output_embeddings  s   z'BioGptForCausalLM.get_output_embeddingsc                 C   s
   || _ d S rd   r   )r'   new_embeddingsr*   r*   r+   set_output_embeddings  s   
z'BioGptForCausalLM.set_output_embeddingsNr   r"   r   rq   rP   labelsrV   r$   rU   r   r   rW   rX   rY   c                 K   s   |dur|n| j j}| j|f|||||||	|
||d
|}|d }| |}d}|dur;| j||fd| j ji|}|sQ|f|dd  }|durO|f| S |S t|||j|j|j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r"   r   rq   rP   rV   r$   rU   r   r   rW   r   r   r   )losslogitsrP   rS   r   r   )r9   r   rj   r   loss_functionr   r   rP   rS   r   r   )r'   r   r"   r   rq   rP   r   rV   r$   rU   r   r   rW   rX   rc   sequence_outputprediction_scoreslm_lossoutputr*   r*   r+   r&     sP   
zBioGptForCausalLM.forward)NNNNNNNNNNNN)r,   r-   r.   _tied_weights_keysr@   r   r   r   r   r/   r0   rh   r	   rf   re   r   r   r   rg   r   r&   r2   r*   r*   r(   r+   r     s^    		

r   c                        s   e Zd Z fddZe													ddeej deej deej deej dee	 d	eej d
eej dee
 deej dee
 dee
 dee
 deej deeef fddZ  ZS )BioGptForTokenClassificationc                    sj   t  | |j| _t|| _t|dr|jd ur|j}n|j}t	|| _
t|j|j| _|   d S )Nclassifier_dropout)r%   r@   
num_labelsr   rj   hasattrr   rE   rH   Dropoutr=   rI   rA   
classifierr   )r'   r9   r   r(   r*   r+   r@   >  s   
z%BioGptForTokenClassification.__init__Nr   token_type_idsr"   r   rP   rq   r   rV   r$   rU   r   r   rW   rY   c                 C   s  |dur|n| j j}| j|||||||	|
|||d}|d }| |}| |}d}|durgt }|durZ|ddk}|d| j}t	||dt
|j|}|||}n||d| j|d}|s}|f|dd  }|dur{|f| S |S t|||j|jdS )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N
rP   r"   r   rq   rV   r$   rU   r   r   rW   r   rs   r   r   )r   r   rS   r   )r9   r   rj   r=   r   r   r   r   r/   wheretensorignore_indextype_asr   rS   r   )r'   r   r   r"   r   rP   rq   r   rV   r$   rU   r   r   rW   transformer_outputsrS   r   r   loss_fctactive_lossactive_logitsactive_labelsr   r*   r*   r+   r&   L  sJ   

z$BioGptForTokenClassification.forward)NNNNNNNNNNNNN)r,   r-   r.   r@   r   r   r/   r0   rh   r	   rf   re   r   rg   r   r&   r2   r*   r*   r(   r+   r   <  sZ    	

r   a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                        s   e Zd Zdef fddZe													ddeej deej	 deej	 d	ee
 d
eej	 deej dee deej dee dee dee deej deeejf deeef fddZdd Zdd Z  ZS )BioGptForSequenceClassificationr9   c                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r   )
r%   r@   r   r   rj   rH   rI   rA   scorer   r   r(   r*   r+   r@     s
   
z(BioGptForSequenceClassification.__init__Nr   r   r"   r   rP   rq   r   rV   r$   rU   r   r   rW   logits_to_keeprY   c                 C   s<  |dur|n| j j}| j||||||||	|
||d}|d }t|tr)t| dn|}| |dd|ddf }|durG|jdd \}}n	|jdd \}}| j jdu rYd}n"|durot	
|| j jdd |j}nd}t| jj d |t	j||jd|f }d}|dur| j jdu r| jdkrd	| j _n| jdkr|jt	jks|jt	jkrd
| j _nd| j _| j jd	krt }| jdkr|| | }n+|||}n%| j jd
krt }||d| j|d}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|jdS )r   Nr   r   r   rs   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r   r   rP   rS   r   ) r9   r   rj   r|   r1   slicer  r~   r   r/   nesumr   rn   r   r   r)   r,   r   problem_typer   rv   r   r   squeezer   r   r   r   rP   rS   r   )r'   r   r"   r   rP   rq   r   rV   r$   rU   r   r   rW   r  r   rS   slice_indicesr   rw   rt   pooled_logitsr   r   r   r*   r*   r+   r&     st   $

"


z'BioGptForSequenceClassification.forwardc                 C   s   | j jS rd   rj   r   r   r*   r*   r+   get_input_embeddings  s   z4BioGptForSequenceClassification.get_input_embeddingsc                 C   s   || j _d S rd   r  )r'   valuer*   r*   r+   set_input_embeddings  s   z4BioGptForSequenceClassification.set_input_embeddings)NNNNNNNNNNNNr   )r,   r-   r.   r   r@   r   r   r/   r0   rh   r	   rf   re   r   r1   rg   r   r&   r  r  r2   r*   r*   r(   r+   r    s^    		

^r  )r   r   r  r   ri   )9__doc__r   typingr   r   r/   torch.nnrH   r   r   r   activationsr   cache_utilsr	   r
   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.deprecationr   bart.modeling_bartr   r   r   opt.modeling_optr   configuration_biogptr   integrations.flex_attentionr   r    r!   r4   r7   r8   ri   r   r   r   r  __all__r*   r*   r*   r+   <module>   sR   Z  +TTp