import math
from collections.abc import Callable

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging
from .configuration_biogpt import BioGptConfig


logger = logging.get_logger(__name__)


class BioGptLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # BioGPT offsets the embedding ids by 2 and enlarges the table accordingly,
        # so that position 0 maps to row `offset` of the learned table.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: torch.LongTensor | None = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        if position_ids is None:
            # create position ids on the fly from the attention mask
            position_ids = torch.cumsum(attention_mask, dim=1)
            position_ids = (position_ids * attention_mask - 1).long()
            # cut positions if `past_key_values_length` is > 0
            position_ids = position_ids[:, past_key_values_length:]

        return super().forward(position_ids + self.offset)

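# Worked example (editorial note, not part of the original module): for an
# attention mask [[0, 1, 1, 1]] (one pad token on the left), the forward above
# computes cumsum -> [[0, 1, 2, 3]], then `* mask - 1` -> [[-1, 0, 1, 2]], and the
# learned table is finally indexed at `position_ids + offset` = [[1, 2, 3, 4]],
# so the first rows of the enlarged table are reserved by the offset of 2.
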
class BioGptScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale

def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float | None = None,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights

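# The helper above is the reference (eager) implementation of scaled dot-product
# attention: softmax(Q @ K^T * scaling + mask) @ V, with dropout applied to the
# attention weights. Minimal shape sketch (illustrative only; the tensors below
# are assumptions for the example, not part of the module):
#
#   q = k = v = torch.randn(1, 4, 7, 8)  # (batch, num_heads, seq_len, head_dim)
#   out, weights = eager_attention_forward(nn.Identity(), q, k, v, attention_mask=None)
#   # out: (1, 7, 4, 8) after the final transpose, weights: (1, 4, 7, 7)
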
class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: BioGptConfig | None = None,
        layer_idx: int | None = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        if layer_idx is None and is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        output_attentions: bool = False,
        cache_position: torch.Tensor | None = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        # determine input shapes
        bsz, tgt_len = hidden_states.shape[:-1]
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len

        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
        kv_input_shape = (bsz, src_len, -1, self.head_dim)

        # get query proj
        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)

        is_updated = False
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value states from the cache
                    curr_past_key_values = past_key_values.cross_attention_cache
                else:
                    curr_past_key_values = past_key_values.self_attention_cache
            else:
                curr_past_key_values = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse the key/value states already stored for this layer
            key_states = curr_past_key_values.layers[self.layer_idx].keys
            value_states = curr_past_key_values.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
            value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value states to the cache for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_values.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # mark the cross-attention cache of this layer as filled so it can be re-used
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights

class BioGptDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: BioGptConfig, layer_idx: int | None = None):
        super().__init__()
        self.embed_dim = config.hidden_size

        self.self_attn = BioGptAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
            is_causal=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)

        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool | None = False,
        use_cache: bool | None = True,
        position_ids: torch.LongTensor | None = None,
        cache_position: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        """
        residual = hidden_states

        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            position_ids=position_ids,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
    config: BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True


@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList(
            [BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        batch_size, seq_length = inputs_embeds.size()[:2]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = (
            past_key_values.self_attention_cache
            if isinstance(past_key_values, EncoderDecoderCache)
            else past_key_values
        )
        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=self_attn_cache,
        )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # LayerDrop (https://huggingface.co/papers/1909.11556): randomly skip layers during training
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                position_ids=position_ids,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )

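# Usage sketch for the bare model (illustrative only; the checkpoint name is the
# public BioGPT checkpoint and is an assumption of this example, not something
# this file defines):
#
#   from transformers import AutoTokenizer, BioGptModel
#   tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
#   model = BioGptModel.from_pretrained("microsoft/biogpt")
#   inputs = tokenizer("Bicalutamide is an antiandrogen.", return_tensors="pt")
#   last_hidden_state = model(**inputs).last_hidden_state  # (batch, seq_len, hidden_size)
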
@auto_docstring(
    custom_intro="""
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    """
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"output_projection.weight": "biogpt.embed_tokens.weight"}

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.Tensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | CausalLMOutputWithCrossAttentions:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]

        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.output_projection(hidden_states[:, slice_indices, :])

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

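# Usage sketch for the causal LM head (illustrative; the prompt and checkpoint
# below are assumptions for demonstration):
#
#   from transformers import AutoTokenizer, BioGptForCausalLM
#   tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
#   model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
#   inputs = tokenizer("COVID-19 is", return_tensors="pt")
#   generated = model.generate(**inputs, max_new_tokens=20)
#   print(tokenizer.decode(generated[0], skip_special_tokens=True))
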
@auto_docstring
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.Tensor | None = None,
        **kwargs,
    ) -> tuple | TokenClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep the active parts of the loss (positions covered by the attention mask)
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

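# Usage sketch for token classification (illustrative; the base checkpoint and
# num_labels value are assumptions, and the classification head is randomly
# initialized until fine-tuned):
#
#   from transformers import AutoTokenizer, BioGptForTokenClassification
#   tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
#   model = BioGptForTokenClassification.from_pretrained("microsoft/biogpt", num_labels=5)
#   inputs = tokenizer("Aspirin inhibits platelet aggregation.", return_tensors="pt")
#   logits = model(**inputs).logits  # (batch, seq_len, num_labels), one score set per token
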
@auto_docstring(
    custom_intro="""
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.Tensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs,
    ) -> tuple | SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.score(hidden_states[:, slice_indices, :])

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not a pad token
            last_non_pad_token = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            last_non_pad_token = last_non_pad_token.to(logits.device)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value


__all__ = [
    "BioGptForCausalLM",
    "BioGptForTokenClassification",
    "BioGptForSequenceClassification",
    "BioGptModel",
    "BioGptPreTrainedModel",
]