o
    wi                     @   s(  d dl Z d dlmZmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( e% rddl)m*Z*m+Z+ e&,e-Z.G dd dej/Z0G dd dej/Z1			d5dej2dej3dej3dej3deej3 dee4 de4deej3 fdd Z5G d!d" d"ej2Z6G d#d$ d$eZ7e$G d%d& d&eZ8e$G d'd( d(e8Z9G d)d* d*ee#Z:e$d+d,G d-d. d.e8eZ;e$G d/d0 d0e8Z<e$d1d,G d2d3 d3e8Z=g d4Z>dS )6    N)CallableOptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringis_torch_flex_attn_availablelogging   )BioGptConfig)	BlockMaskmake_flex_block_causal_maskc                       sR   e Zd ZdZdedef fddZ		ddejd	ed
eej f fddZ	  Z
S ) BioGptLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                    s   d| _ t || j  | d S )N   )offsetsuper__init__)selfr    r!   	__class__ g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/biogpt/modeling_biogpt.pyr%   ;   s   z)BioGptLearnedPositionalEmbedding.__init__r   Nattention_maskpast_key_values_lengthposition_idsc                    sL   |du rt j|dd}|| d  }|dd|df }t || j S )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr$   forwardr#   )r&   r+   r,   r-   r'   r)   r*   r3   A   s
   z(BioGptLearnedPositionalEmbedding.forwardr   N)__name__
__module____qualname____doc__intr%   r0   
LongTensorr   r3   __classcell__r)   r)   r'   r*   r   6   s    	r   c                
       sL   e Zd ZdZddedededee f fddZd	ej	f fd
dZ
  ZS )BioGptScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
          ?r    r!   padding_idxembed_scalec                    s   t  ||| || _d S N)r$   r%   r?   )r&   r    r!   r>   r?   r'   r)   r*   r%   W   s   
z"BioGptScaledWordEmbedding.__init__	input_idsc                    s   t  || j S r@   )r$   r3   r?   )r&   rA   r'   r)   r*   r3   [   s   z!BioGptScaledWordEmbedding.forward)r=   )r5   r6   r7   r8   r9   r   floatr%   r0   Tensorr3   r;   r)   r)   r'   r*   r<   R   s    $r<           modulequerykeyvaluer+   scalingdropout	head_maskc                 K   s   |d u r| dd }t||dd| }	|d ur|	| }	tjj|	dd}	|d ur5|	|dddd }	tjj|	|| j	d}	t|	|}
|
dd
 }
|
|	fS )N      r"   r   r.   r   ptraining)sizer0   matmul	transposenn
functionalsoftmaxviewrJ   rP   
contiguous)rE   rF   rG   rH   r+   rI   rJ   rK   kwargsattn_weightsattn_outputr)   r)   r*   eager_attention_forward_   s   r\   c                       s   e Zd ZdZ						ddededed	ed
ededee dee f fddZ							dde
jdee
j dee dee
j dee
j dedee
j dee dee
jee
j eee
j  f fddZ  ZS )BioGptAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrD   FTN	embed_dim	num_headsrJ   
is_decoderbias	is_causalconfig	layer_idxc	           	         s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
|| _|d u rK| j	rKtd| jj d tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rM   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.ra   )r$   r%   r^   r_   rJ   head_dimrc   
ValueErrorrI   r`   rb   rd   loggerwarning_oncer(   r5   rT   Lineark_projv_projq_projout_proj)	r&   r^   r_   rJ   r`   ra   rb   rc   rd   r'   r)   r*   r%      s0   


zBioGptAttention.__init__hidden_stateskey_value_statespast_key_valuer+   layer_head_maskoutput_attentionscache_positionrY   returnc                 K   s  |du}	|j dd \}
}|	r|j d n|}|
|d| jf}|
|d| jf}| |j| dd}|durLt|trJ|j| j	}|	rF|j
}n|j}n|}|	rP|n|}|	rg|durg|rg|j| j	 }|j| j	 }n;| |}| |}|j| dd}|j| dd}|dur|	s|nd}|||| j	d|i\}}|	rd|j| j	< t}| jjdkrt| jj }|| ||||f| jsdn| j| j||d	|\}}||
|d }| |}|||fS )
z#Input shape: Batch x Time x ChannelNrL   r   r"   rt   TeagerrD   )rJ   rI   rs   rK   )shaperf   rm   rW   rS   
isinstancer   
is_updatedgetrd   cross_attention_cacheself_attention_cache	key_cachevalue_cacherk   rl   updater\   rc   _attn_implementationr   rP   rJ   rI   reshaperX   rn   )r&   ro   rp   rq   r+   rr   rs   rt   rY   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statesry   curr_past_key_valuecurrent_states
key_statesvalue_statesattention_interfacer[   rZ   r)   r)   r*   r3      s`   






zBioGptAttention.forward)rD   FTFNN)NNNNFN)r5   r6   r7   r8   r9   rB   boolr   r   r%   r0   rC   r
   r   r   tupler3   r;   r)   r)   r'   r*   r]   }   sd    	*r]   c                       s   e Zd Zddedee f fddZ							ddejd	eej d
eej dee	 dee
 dee
 deej deej dee deejeeejejf  f fddZ  ZS )BioGptDecoderLayerNrc   rd   c              	      s   t    |j| _t| j|j|jdd||d| _|j| _	t
|j | _|j| _t| j| _t| j|j| _t|j| j| _t| j| _d S )NT)r^   r_   rJ   r`   rb   rc   rd   )r$   r%   hidden_sizer^   r]   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrJ   r	   
hidden_actactivation_fnactivation_dropoutrT   	LayerNormself_attn_layer_normrj   intermediate_sizefc1fc2final_layer_norm)r&   rc   rd   r'   r)   r*   r%      s$   
	zBioGptDecoderLayer.__init__FTro   r+   rr   rq   rs   	use_cacher-   rt   flash_attn_kwargsru   c	              
   K   s   |}
|  |}| jd|||||||d|	\}}}tjj|| j| jd}|
| }|}
| |}| |}| |}tjj|| j	| jd}| 
|}tjj|| j| jd}|
| }|f}|rc||f7 }|rj||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
                cache in the correct position and to infer the complete sequence length.
        )ro   rq   r+   rr   rs   r-   rt   rN   Nr)   )r   r   rT   rU   rJ   rP   r   r   r   r   r   )r&   ro   r+   rr   rq   rs   r   r-   rt   r   residualself_attn_weightsoutputsr)   r)   r*   r3     s:   







zBioGptDecoderLayer.forwardr@   )NNNFTNN)r5   r6   r7   r   r   r9   r%   r0   rC   r
   r   r:   r   r   r   FloatTensorr3   r;   r)   r)   r'   r*   r      s<    	
r   c                   @   s   e Zd ZeZdZdZdZdZdZ	dZ
dZdd Zdeeejdf  dejdejd	efd
dZedejdededejdejdefddZdS )BioGptPreTrainedModelbiogptTc                 C   s   t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
rC|jjjd| jjd |jdurA|jj|j 	  dS dS t |tjrX|jj	  |jjd dS dS )zInitialize the weightsrD   )meanstdNr=   )rx   rT   rj   weightdatanormal_rc   initializer_rangera   zero_	Embeddingr>   r   fill_)r&   rE   r)   r)   r*   _init_weightsc  s   

z#BioGptPreTrainedModel._init_weightsr+   r   input_tensorrt   past_key_valuesc                 C   sb  | j jdkr*t|tjrt|}|S |d u r(ttj|jd |jd f|jd}|S | j jdkr>|d ur<|dk	 r<|S d S |d urF|
 nd}|d urO|jnd}| j jdkre|setj|||| jd	red S |j}|jd }|rt| }	nt|tjr|jd
 n|| d }	| j|||	|||jd d}
| j jdkr|d ur|jjdv rt|j}t|
|}
|
S )Nflex_attentionr   r   )rQ   deviceflash_attention_2rD   Fsdpa)inputs_embedsr,   is_trainingrL   )sequence_lengthtarget_lengthdtypert   
batch_size)cudaxpunpu)rc   r   rx   r0   rC   r   onesrw   r   anyget_seq_lengthis_compileabler   _ignore_causal_mask_sdparP   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiontypefinfomin_unmask_unattended)r&   r+   r   rt   r   past_seen_tokensusing_compilable_cacher   r   r   causal_mask	min_dtyper)   r)   r*   _update_causal_maskt  s`   





z)BioGptPreTrainedModel._update_causal_maskr   r   r   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer   r   r   )diagonalr   rL   r   )r/   r0   r   r   fullr   triuaranger   expandclonerw   tomasked_fill)r+   r   r   r   rt   r   rY   r   r   mask_lengthpadding_maskr)   r)   r*   r     s,    $
6  zKBioGptPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_positionN)r5   r6   r7   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_static_cacher   r   r   r0   rC   r
   r   staticmethodr9   r   r   r)   r)   r)   r*   r   X  sB    
Lr   c                       s   e Zd Zdef fddZdd Zdd Ze											dd	ee	j
 d
ee	j dee	j dee	j deeee	j   dee dee	j
 dee dee dee dee	j dee deeef fddZ  ZS )BioGptModelrc   c                    s   t     | _ j| _ j| _ j| _ j| _	 j
r"t jnd}t j| j| j	|d| _t j| j| _t fddt jD | _t| j| _d| _|   d S )Nr=   )r?   c                    s   g | ]}t  |d qS ))rd   )r   ).0irc   r)   r*   
<listcomp>	  s    z(BioGptModel.__init__.<locals>.<listcomp>F)r$   r%   rc   	layerdropr   rJ   r   r^   pad_token_idr>   scale_embeddingmathsqrtr<   
vocab_sizeembed_tokensr   max_position_embeddingsembed_positionsrT   
ModuleListrangenum_hidden_layerslayersr   
layer_normgradient_checkpointing	post_init)r&   rc   r?   r'   r   r*   r%     s    zBioGptModel.__init__c                 C      | j S r@   r   r&   r)   r)   r*   get_input_embeddings     z BioGptModel.get_input_embeddingsc                 C   
   || _ d S r@   r   r&   rH   r)   r)   r*   set_input_embeddings     
z BioGptModel.set_input_embeddingsNrA   r+   rK   r   r   r   r-   rs   output_hidden_statesreturn_dictrt   r   ru   c           !      K   s|  |d ur|n| j j}|	d ur|	n| j j}	|d ur|n| j j}|
d ur$|
n| j j}
|d u |d uA r4td|d urF|}|j}|d|d }n|d ur^| d d }|d d d d df }ntd|d u rk| 	|}| j
rz| jrz|rztd d}d}|rt|tsd}td t|}| d d \}}|d ur| nd}|d u rtj||| |jd	}|d u r|| }tj|||jd	}t|tr|jn|}| ||||}|d u rtj|d
d}|| d
  }|d d |d f }| j|||d}|| }tjj|| j| jd}| j
r| jr|rtd d}|	r dnd }|r'dnd }d }|r0dnd }t| j D ]Q\}}|	rC||f7 }| jrTt!g }|| j"k rTq7||f||d ura|| nd |||||d|}|d }|r}||rzdnd
 }|r||d
 f7 }q7|	r||f7 }| #|}|r|nd } |r|$ } |
st%dd || |||fD S t&|| |||dS )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timerL   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz[`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...FTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   r   r.   )r-   rN   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r)   )r+   rr   rq   rs   r   r-   rt   r"   c                 s   s    | ]	}|d ur|V  qd S r@   r)   )r   vr)   r)   r*   	<genexpr>  s    z&BioGptModel.forward.<locals>.<genexpr>)last_hidden_stater   ro   
attentionscross_attentions)'rc   rs   r  r   use_return_dictrg   rw   rW   rQ   r   r   rP   rh   ri   rx   r
   r   from_legacy_cacher   r0   r   r   r   r|   r   r1   r2   r   rT   rU   rJ   	enumerater   randr   r   to_legacy_cacher   r   )!r&   rA   r+   rK   r   r   r   r-   rs   r  r  rt   r   inputinput_shapereturn_legacy_cacher   
seq_lengthr,   mask_seq_lengthself_attn_cacher   	positionsro   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cacheidxdecoder_layerdropout_probabilitylayer_outputs
next_cacher)   r)   r*   r3     s   



	

zBioGptModel.forward)NNNNNNNNNNN)r5   r6   r7   r   r%   r   r  r   r   r0   r:   r   r   rC   r   r   r   r   r   r3   r;   r)   r)   r'   r*   r     sV    	

r   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r5   r6   r7   r)   r)   r)   r*   r     s    r   zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                        s   e Zd ZdgZ fddZdd Zdd Ze												dd	ee	j
 d
ee	j dee	j dee	j deeee	j   dee	j
 dee dee	j
 dee dee dee dee	j dee deeef fddZedd Z  ZS )BioGptForCausalLMzoutput_projection.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFre   )
r$   r%   r   r   rT   rj   r   r   output_projectionr   r&   rc   r'   r)   r*   r%     s   
zBioGptForCausalLM.__init__c                 C   r   r@   r$  r   r)   r)   r*   get_output_embeddings  r   z'BioGptForCausalLM.get_output_embeddingsc                 C   r   r@   r&  )r&   new_embeddingsr)   r)   r*   set_output_embeddings  r  z'BioGptForCausalLM.set_output_embeddingsNrA   r+   rK   r   r   labelsr   r-   rs   r  r  rt   rY   ru   c                 K   s   |dur|n| j j}| j|f|||||||	|
||d
|}|d }| |}d}|dur;| j||fd| j ji|}|sQ|f|dd  }|durO|f| S |S t|||j|j|j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
r+   rK   r   r   r   r-   rs   r  r  rt   r   r   r   )losslogitsr   ro   r	  r
  )rc   r  r   r$  loss_functionr   r   r   ro   r	  r
  )r&   rA   r+   rK   r   r   r*  r   r-   rs   r  r  rt   rY   r   sequence_outputprediction_scoreslm_lossoutputr)   r)   r*   r3     sP   
zBioGptForCausalLM.forwardc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr)   c                 3   s$    | ]}| d  |jV  qdS r4   )index_selectr   r   )r   
past_statebeam_idxr)   r*   r    s   " z3BioGptForCausalLM._reorder_cache.<locals>.<genexpr>)r   )r   r5  reordered_past
layer_pastr)   r4  r*   _reorder_cache  s   z BioGptForCausalLM._reorder_cacheNNNNNNNNNNNN)r5   r6   r7   _tied_weights_keysr%   r'  r)  r   r   r0   r:   r   r   rC   r   r   r   r   r   r3   r   r8  r;   r)   r)   r'   r*   r"    sb    		

@r"  c                        s   e Zd Z fddZe													ddeej deej deej deej dee	e	ej
   d	eej d
eej dee deej dee dee dee deej
 dee	ef fddZ  ZS )BioGptForTokenClassificationc                    sj   t  | |j| _t|| _t|dr|jd ur|j}n|j}t	|| _
t|j|j| _|   d S )Nclassifier_dropout)r$   r%   
num_labelsr   r   hasattrr<  r   rT   DropoutrJ   rj   r   
classifierr   )r&   rc   r<  r'   r)   r*   r%     s   
z%BioGptForTokenClassification.__init__NrA   token_type_idsr+   rK   r   r   r*  r   r-   rs   r  r  rt   ru   c                 C   s  |dur|n| j j}| j|||||||	|
|||d}|d }| |}| |}d}|durgt }|durZ|ddk}|d| j}t	||dt
|j|}|||}n||d| j|d}|s}|f|dd  }|dur{|f| S |S t|||j|jdS )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N
r   r+   rK   r   r   r-   rs   r  r  rt   r   rL   r   r"   )r+  r,  ro   r	  )rc   r  r   rJ   r@  r   rW   r=  r0   wheretensorignore_indextype_asr   ro   r	  )r&   rA   rA  r+   rK   r   r   r*  r   r-   rs   r  r  rt   transformer_outputsro   r,  r+  loss_fctactive_lossactive_logitsactive_labelsr1  r)   r)   r*   r3   +  sJ   

z$BioGptForTokenClassification.forward)NNNNNNNNNNNNN)r5   r6   r7   r%   r   r   r0   r:   r   r   rC   r   r   r   r3   r;   r)   r)   r'   r*   r;    sZ    	

r;  a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                       s   e Zd Zdef fddZe												ddeej deej	 deej	 dee
e
ej   d	eej	 d
eej dee deej dee dee dee deej dee
ef fddZdd Zdd Z  ZS )BioGptForSequenceClassificationrc   c                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S r#  )
r$   r%   r=  r   r   rT   rj   r   scorer   r%  r'   r)   r*   r%     s
   
z(BioGptForSequenceClassification.__init__NrA   r+   rK   r   r   r*  r   r-   rs   r  r  rt   ru   c                 C   s  |dur|n| j j}| j||||||||	|
||d}|d }| |}|dur1|jdd \}}n	|jdd \}}| j jdu rCd}n"|durYt|| j jdd 	|j
}nd}t| jj d |tj||j
d|f }d}|dur| j jdu r| jdkrd	| j _n| jdkr|jtjks|jtjkrd
| j _nd| j _| j jd	krt }| jdkr|| | }n+|||}n%| j jd
krt }||d| j|d}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|jdS )rB  NrC  r   r"   rL   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)r+  r,  r   ro   r	  )rc   r  r   rN  rw   r   r0   nesumr   r   rh   ri   r(   r5   r   problem_typer=  r   r2   r9   r   squeezer   rW   r   r   r   ro   r	  )r&   rA   r+   rK   r   r   r*  r   r-   rs   r  r  rt   rH  ro   r,  r   r   pooled_logitsr+  rI  r1  r)   r)   r*   r3     sr   
$

"


z'BioGptForSequenceClassification.forwardc                 C   s   | j jS r@   r   r   r   r)   r)   r*   r     s   z4BioGptForSequenceClassification.get_input_embeddingsc                 C   s   || j _d S r@   rW  r  r)   r)   r*   r    s   z4BioGptForSequenceClassification.set_input_embeddingsr9  )r5   r6   r7   r   r%   r   r   r0   r:   r   r   rC   r   r   r   r3   r   r  r;   r)   r)   r'   r*   rM  p  sX    		

\rM  )r"  r;  rM  r   r   )NrD   N)?r   typingr   r   r   r0   torch.nnrT   r   r   r   activationsr	   cache_utilsr
   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   configuration_biogptr   integrations.flex_attentionr   r   
get_loggerr5   rh   r   r   r<   ModulerC   rB   r\   r]   r   r   r   r   r"  r;  rM  __all__r)   r)   r)   r*   <module>   sz   

~] ! =]Tn