"""PyTorch BioGPT model."""

import math

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logger
from ..bart.modeling_bart import BartAttention, BartDecoderLayer, BartScaledWordEmbedding
from ..opt.modeling_opt import OPTLearnedPositionalEmbedding
from .configuration_biogpt import BioGptConfig


class BioGptLearnedPositionalEmbedding(OPTLearnedPositionalEmbedding):
    def forward(
        self,
        attention_mask: torch.LongTensor,
        past_key_values_length: int = 0,
        position_ids: torch.LongTensor | None = None,
    ):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        return super().forward(attention_mask, past_key_values_length, position_ids)
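
# NOTE: as in OPT, `BioGptLearnedPositionalEmbedding` derives positions from the
# attention mask (padding tokens do not advance the position index) and keeps a
# fixed offset in the embedding table for the padding entries; see
# `OPTLearnedPositionalEmbedding` for the details.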


class BioGptScaledWordEmbedding(BartScaledWordEmbedding):
    pass


class BioGptAttention(BartAttention):
    pass


class BioGptDecoderLayer(BartDecoderLayer):
    def __init__(self, config: BioGptConfig, layer_idx: int | None = None):
        super().__init__(config)
        self.embed_dim = config.hidden_size

        self.self_attn = BioGptAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
            is_causal=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)

        # BioGPT is decoder-only: drop the cross-attention blocks inherited from Bart.
        del self.encoder_attn
        del self.encoder_attn_layer_norm

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        output_attentions: bool | None = False,
        use_cache: bool | None = True,
        position_ids: torch.LongTensor | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
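
        Returns:
            A tuple whose first element is `hidden_states` of shape `(batch, seq_len, embed_dim)`;
            when `output_attentions=True` a second element carries the self-attention weights.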
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self attention (reuses cached key/value states when `past_key_values` is passed)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            position_ids=position_ids,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class BioGptPreTrainedModel(PreTrainedModel):
    config: BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True


@auto_docstring
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.shape
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        batch_size, seq_length = input_shape[:2]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = past_key_values

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=self_attn_cache,
        )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        positions = self.embed_positions(attention_mask, past_key_values_length, position_ids=position_ids)

        hidden_states = inputs_embeds + positions
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            # LayerDrop: randomly skip whole layers during training.
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                position_ids=position_ids,
                cache_position=cache_position,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
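
    The language-modeling head (`output_projection`) shares its weights with the input
    token embeddings, as declared in `_tied_weights_keys` below.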
    """
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"output_projection.weight": "biogpt.embed_tokens.weight"}

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | CausalLMOutputWithCrossAttentions:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
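
        Example (a minimal usage sketch; assumes the public `microsoft/biogpt` checkpoint):

        ```python
        >>> from transformers import AutoTokenizer, BioGptForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        >>> model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

        >>> inputs = tokenizer("COVID-19 is", return_tensors="pt")
        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> loss, logits = outputs.loss, outputs.logits
        ```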
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        prediction_scores = self.output_projection(hidden_states[:, slice_indices, :])

        lm_loss = None
        if labels is not None:
            lm_loss = self.loss_function(
                logits=prediction_scores,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )


@auto_docstring
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | TokenClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
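
        Example (an illustrative sketch; loading the base `microsoft/biogpt` checkpoint
        here initializes the classification head with random weights):

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, BioGptForTokenClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
        >>> model = BioGptForTokenClassification.from_pretrained("microsoft/biogpt", num_labels=2)

        >>> inputs = tokenizer("Aspirin inhibits platelet aggregation.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits
        >>> predicted_token_classes = logits.argmax(-1)
        ```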
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            if attention_mask is not None:
                # Only keep the active (non-padded) positions in the loss.
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        position_ids: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.score(hidden_states[:, slice_indices, :])

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        # Pool the logits of the last non-padding token in each row.
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        elif input_ids is not None:
            sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
        else:
            sequence_lengths = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value


__all__ = [
    "BioGptForCausalLM",
    "BioGptForTokenClassification",
    "BioGptForSequenceClassification",
    "BioGptModel",
    "BioGptPreTrainedModel",
]