o
    	۷i                    @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dlm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 dd Z7G dd de
j8Z9dd Z:dd Z;G dd de
j8Z<	 	did!e
j8d"ej	d#ej	d$ej	d%eej	 d&e=d'e=d(eej	 d)e'e, fd*d+Z>G d,d- d-e
j8Z?G d.d/ d/e
j8Z@G d0d1 d1e
j8ZAd2d3 ZBG d4d5 d5e
j8ZCG d6d7 d7e
j8ZDG d8d9 d9eZEG d:d; d;e
j8ZFG d<d= d=e
j8ZGe-G d>d? d?e$ZHG d@dA dAeHZIG dBdC dCe
j8ZJG dDdE dEe
j8ZKG dFdG dGe
j8ZLee-G dHdI dIeZMG dJdK dKe
j8ZNG dLdM dMe
j8ZOedNG dOdP dPe
j8ZPG dQdR dRe
j8ZQG dSdT dTe
j8ZRdUdV ZSdjdWdXZTdYej	dZeUd[ej	fd\d]ZVG d^d_ d_e
j8ZWG d`da daeZXe-G dbdc dce$ZYG ddde deeYZZG dfdg dgeYeZ[g dhZ\dS )k    N)	dataclass)CallableOptionalUnion)Tensornn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsBaseModelOutputWithPast,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSModuleUtilsMixinPreTrainedModelget_parameter_dtype)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)OutputRecordercheck_model_inputs   )EvollaConfigSaProtConfigc                 C   s2   |  | }tj|dd|| }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r$   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxmaskincremental_indices r3   `/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/evolla/modeling_evolla.py"create_position_ids_from_input_ids5   s   r5   c                       s:   e Zd ZdZ fddZ				d	ddZdd Z  ZS )
EvollaSaProtEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    s   t    tj|j|j|jd| _|jrtj	|j|j
d| _nd | _t|j| _t|dd| _| jdt|jddd |j| _| jdkrTtj|j|j| jd| _|j| _|j| _d | _d S )	N)r0   epsposition_embedding_typeabsoluteposition_ids)r$   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr9   register_bufferr+   arangemax_position_embeddingsexpandr0   position_embeddingstoken_dropoutmask_token_idr;   selfconfig	__class__r3   r4   r@   J   s$   


zEvollaSaProtEmbeddings.__init__Nc           
      C   s  |d u r|d urt || j}n| |}|d u r| |}|}| jrc|d urc||| jkdd}d}|d ur=|dn|j	d }|| jkd
 | }|d|  d| d d d d f  |j}| jdkrq| |}	||	 }| jd ur{| |}|d ur||d |j}|S )Nr<           gQ?r$   r:   )r5   r0   &create_position_ids_from_inputs_embedsrE   rS   masked_fillrT   	unsqueezesumshapefloattodtyper9   rR   rI   )
rV   r/   attention_maskr;   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedrR   r3   r3   r4   forwardc   s.   

	"



zEvollaSaProtEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr<   r$   rb   devicer   )sizer+   rO   r0   r.   rk   r]   rQ   )rV   rd   input_shapesequence_lengthr;   r3   r3   r4   r[      s   	z=EvollaSaProtEmbeddings.create_position_ids_from_inputs_embedsNNNN)__name__
__module____qualname____doc__r@   ri   r[   __classcell__r3   r3   rX   r4   r6   E   s    
1r6   c                 C   s&   | j ddd\}}tj| |fddS )N   r<   r'   )chunkr+   catxx1x2r3   r3   r4   rotate_half_esm   s   r|   c                 C   s`   |d d d d d | j d d d f }|d d d d d | j d d d f }| | t| |  S )N)r_   r|   )ry   cossinr3   r3   r4   apply_rotary_pos_emb_esm   s   &&r   c                       sb   e Zd ZU dZejed< def fddZdddZ	d	ejd
ejde
ejejf fddZ  ZS )EvollaSaProtRotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    inv_freqr(   c                    sP   t    ddtjd|dtjd |   }| d| d | _d | _d | _	d S )N      ?i'  r   ru   rb   r   )
r?   r@   r+   rO   int64r`   rN   _seq_len_cached_cos_cached_sin_cached)rV   r(   r   rX   r3   r4   r@      s   
$
z$EvollaSaProtRotaryEmbedding.__init__ru   c                 C   s   |j | }|| jks| jj|jkrU|| _tj|j | |jd| j}t|| j}tj	||fdd
|j}| d d d d d d f | _| d d d d d d f | _| j| jfS )Nrk   r<   r'   )r_   r   r   rk   r+   rO   r-   r   outerrw   ra   r~   r   r   )rV   ry   seq_dimensionseq_lentfreqsembr3   r3   r4   _update_cos_sin_tables   s   
z2EvollaSaProtRotaryEmbedding._update_cos_sin_tablesqkreturnc                 C   sJ   | j |dd\| _| _t|| j| jj|jdt|| j| jj|jdfS )Nr}   )r   r   )r   r   r   r   ra   rb   )rV   r   r   r3   r3   r4   ri      s   z#EvollaSaProtRotaryEmbedding.forward)ru   )rp   rq   rr   rs   r+   r   __annotations__r*   r@   r   tupleri   rt   r3   r3   rX   r4   r      s   
 


.r   rZ   modulequerykeyvaluerc   scalingrL   	head_maskkwargsc                 K   s  t ||dd| }	t| drt| jdv rt|jd }
t j|
t j|	jd	dd}t j|
t j|	jd	dd}|| }| 
|| j d }|j|jd}| jd	krYt d
||}n| jdkrpt d
||}t d||}|| }|	| }	|d ur|d d d d d d d |jd f }|	| }	tjj|	dt jd|j}	tjj|	|| jd}	|d ur|	| }	t |	|}|dd }||	fS )Nru   r   r9   relative_keyrelative_key_queryrj   r<   r$   r   r   zbhld,lrd->bhlrr   zbhrd,lrd->bhlrr}   )r(   rb   )ptraining)r+   matmul	transposehasattrr9   r_   rO   r.   rk   viewdistance_embeddingrP   ra   rb   einsumr   
functionalsoftmaxfloat32rL   r   
contiguous)r   r   r   r   rc   r   rL   r   r   attn_weights
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keycausal_maskattn_outputr3   r3   r4   eager_attention_forward   s2   


&r   c                       sr   e Zd Zd fdd	Z				ddejdeej deej deej d	eej d
ee	 de
ej fddZ  ZS )EvollaSaProtSelfAttentionNFc                    s8  t    || _|j|j dkr"t|ds"td|j d|j d|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _|j| _|p\t|dd| _d | _| jdksk| jd	kr}|j| _t
d
|j d | j| _n| jdkrt| jd| _|j| _|| _d| _| jo| | _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r9   r:   r   r   ru   r$   rotaryr'   r   )r?   r@   rW   rC   num_attention_headsr   
ValueErrorr*   attention_head_sizeall_head_sizer   Linearr   r   r   attention_probs_dropout_probrL   rM   r9   rotary_embeddingsrP   rA   r   r   
is_decoder	layer_idxr   	is_causal)rV   rW   r9   r   is_cross_attentionrX   r3   r4   r@     s8   


z"EvollaSaProtSelfAttention.__init__hidden_statesrc   r   encoder_hidden_statesencoder_attention_maskr   r   c                 K   s>  |j d d \}}||d| jf}	| ||	dd}
|d u}|r$|n|}|r*|n|}| ||	dd}| ||	dd}|
| jd  }
| jdkrX| |
|\}
}t	}| j
jdkry| jdv rstd| j
j d	| j d
t| j
j }|| |
|||f| jsdn| j| j|d|\}}|||d }||fS )Nr<   r$   ru         r   eagerr   zESM z attention does not support z^ embeddings. Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`rZ   )rL   r   r   )r_   r   r   r   r   r   r   r9   r   r   rW   _attn_implementationr   r   r   rL   r   reshaper   )rV   r   rc   r   r   r   r   
batch_sizer   hidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   r3   r3   r4   ri   3  sB   	

	
z!EvollaSaProtSelfAttention.forward)NNFro   )rp   rq   rr   r@   r+   r   r   FloatTensorr   r   r   ri   rt   r3   r3   rX   r4   r     s*    %r   c                       $   e Zd Z fddZdd Z  ZS )EvollaSaProtSelfOutputc                    s.   t    t|j|j| _t|j| _d S N)	r?   r@   r   r   rC   denserJ   rK   rL   rU   rX   r3   r4   r@   j     
zEvollaSaProtSelfOutput.__init__c                 C       |  |}| |}|| }|S r   r   rL   rV   r   input_tensorr3   r3   r4   ri   o     

zEvollaSaProtSelfOutput.forwardrp   rq   rr   r@   ri   rt   r3   r3   rX   r4   r   i      r   c                       sB   e Zd Zd
 fdd	Zdd Z				ddee fdd	Z  ZS )EvollaSaProtAttentionNFc                    sD   t    t|||d| _t|| _t | _tj	|j
|jd| _	d S )N)r   r   r7   )r?   r@   r   rV   r   outputsetpruned_headsr   rG   rC   rH   )rV   rW   r   r   rX   r3   r4   r@   w  s
   

zEvollaSaProtAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r$   r'   )lenr   rV   r   r   r   r   r   r   r   r   r   r   union)rV   headsindexr3   r3   r4   prune_heads~  s   z!EvollaSaProtAttention.prune_headsr   c           
      K   s:   |  |}| j|f||||d|\}}	| ||}|S )Nrc   r   r   r   )rG   rV   r   )
rV   r   rc   r   r   r   r   hidden_states_lnr   _r3   r3   r4   ri     s   
	
zEvollaSaProtAttention.forward)NFro   )	rp   rq   rr   r@   r   r   r   ri   rt   r3   r3   rX   r4   r   v  s    r   c                 C   s    | d dt | td   S )zz
    This is the gelu implementation from the original EVOLLA_SA_PROT repo. Using F.gelu yields subtly wrong results.
    g      ?r   g       @)r+   erfmathsqrt)ry   r3   r3   r4   gelu  s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )EvollaSaProtIntermediatec                    s    t    t|j|j| _d S r   )r?   r@   r   r   rC   intermediate_sizer   rU   rX   r3   r4   r@     s   
z!EvollaSaProtIntermediate.__init__r   r   c                 C   s   |  |}t|}|S r   )r   r   )rV   r   r3   r3   r4   ri     s   
z EvollaSaProtIntermediate.forwardrp   rq   rr   r@   r+   r   ri   rt   r3   r3   rX   r4   r     s    r   c                       r   )EvollaSaProtOutputc                    s.   t    t|j|j| _t|j| _	d S r   )
r?   r@   r   r   r   rC   r   rJ   rK   rL   rU   rX   r3   r4   r@     r   zEvollaSaProtOutput.__init__c                 C   r   r   r   r   r3   r3   r4   ri     r   zEvollaSaProtOutput.forwardr   r3   r3   rX   r4   r     r   r   c                       s@   e Zd Z fddZ				d	dee fddZdd Z  ZS )
EvollaSaProtLayerc                    s   t    |j| _d| _t|| _|j| _|j| _| jr-| js&t|  dt|dd| _	t
|| _t|| _tj|j|jd| _d S )Nr$   z> should be used as a decoder model if cross attention is addedT)r   r7   )r?   r@   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionRuntimeErrorcrossattentionr   intermediater   r   r   rG   rC   rH   rU   rX   r3   r4   r@     s   



zEvollaSaProtLayer.__init__Nr   c           	      K   sj   | j |f||d|}| jr.|d ur.t| ds td|  d| j|f||||d|}| |}|S )N)rc   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r   r   r   AttributeErrorr   feed_forward_chunk)	rV   r   rc   r   r   r   r   attention_outputlayer_outputr3   r3   r4   ri     s2   	


	zEvollaSaProtLayer.forwardc                 C   s$   |  |}| |}| ||}|S r   )rG   r   r   )rV   r  attention_output_lnintermediate_outputr  r3   r3   r4   r     s   

z$EvollaSaProtLayer.feed_forward_chunkro   )	rp   rq   rr   r@   r   r   ri   r   rt   r3   r3   rX   r4   r     s    
#r   c                       s<   e Zd Z fddZe				ddee fddZ  ZS )EvollaSaProtEncoderc                    sN   t     | _t fddt jD | _tj j	 j
d| _d| _d S )Nc                    s   g | ]}t  qS r3   )r   .0r   rW   r3   r4   
<listcomp>  s    z0EvollaSaProtEncoder.__init__.<locals>.<listcomp>r7   F)r?   r@   rW   r   
ModuleListrangenum_hidden_layerslayerrG   rC   rH   emb_layer_norm_aftergradient_checkpointingrU   rX   r  r4   r@      s
   
 
zEvollaSaProtEncoder.__init__Nr   c           
      K   s\   t | jD ]\}}|d ur|| nd }	||f||	||d|}q| jr)| |}t|dS )Nr   )last_hidden_state)	enumerater  r  r   )
rV   r   rc   r   r   r   r   ilayer_modulelayer_head_maskr3   r3   r4   ri     s   
	

zEvollaSaProtEncoder.forwardro   )	rp   rq   rr   r@   r    r   r   ri   rt   r3   r3   rX   r4   r    s    r  c                       r   )EvollaSaProtPoolerc                    s*   t    t|j|j| _t | _d S r   )r?   r@   r   r   rC   r   Tanh
activationrU   rX   r3   r4   r@   #  s   
zEvollaSaProtPooler.__init__r   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )rV   r   first_token_tensorpooled_outputr3   r3   r4   ri   (  s   

zEvollaSaProtPooler.forwardr   r3   r3   rX   r4   r  "  s    r  c                   @   sT   e Zd ZU eed< dgZdZdZdZe	e
edddge
edddgdZd	d
 ZdS )EvollaSaProtPreTrainedModelrW   r   Tr$   r   )r   
layer_namer   )r   
attentionscross_attentionsc                 C   s   | j j}t|tjr"|jjjd|d |jdur |jj	  dS dS t|tj
rC|jjjd|d |jdurA|jj|j 	  dS dS t|tjrX|jj	  |jjd dS dS )zInitialize the weightsrZ   meanstdNr   )rW   initializer_range
isinstancer   r   weightdatanormal_biaszero_rA   r0   rG   fill_rV   r   r   r3   r3   r4   _init_weightsA  s   

z)EvollaSaProtPreTrainedModel._init_weightsN)rp   rq   rr   r&   r   _no_split_modules_supports_flash_attn_supports_sdpa_supports_attention_backendr   r"   r   _can_record_outputsr*  r3   r3   r3   r4   r  1  s   
 r  c                       s   e Zd Zdef fddZdd Zdd Zdd	 Ze 	
dde	e
j de	e
j deee
j ef fddZ	
	
ddedee de	e
j de	e
j def
ddZ  ZS )EvollaSaProtProteinEncoderrW   c                    s$   t  | t|| _t|| _d S r   )r?   r@   r6   re   r  encoderrU   rX   r3   r4   r@   R  s   
z#EvollaSaProtProteinEncoder.__init__c                 C   s   | j jS r   re   rE   rV   r3   r3   r4   get_input_embeddingsW  s   z/EvollaSaProtProteinEncoder.get_input_embeddingsc                 C   s   || j _d S r   r2  rV   r   r3   r3   r4   set_input_embeddingsZ     z/EvollaSaProtProteinEncoder.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr1  r  r   r   )rV   heads_to_pruner  r   r3   r3   r4   _prune_heads]  s   z'EvollaSaProtProteinEncoder._prune_headsNr/   rc   r   c                 C   sv   |  }|\}}|j}|d u rtj||f|d}| j||d}| ||}| j||d}	|	d }
t|
|	j|	j	|	j
dS )Nr   r/   rc   )rc   r   )r  r   r  r  )rl   rk   r+   onesre   get_extended_attention_maskr1  r   r   r  r  )rV   r/   rc   rm   r   r   rk   rd   extended_attention_maskencoder_outputssequence_outputr3   r3   r4   ri   e  s   z"EvollaSaProtProteinEncoder.forwardrm   rk   rb   c                 C   s   |du rt | }| dkr| jjs|durtdt | dkr1|dddddddf }n+| dkrP| jjrCt|||}n|ddddddf }nt	d| d|j
 d|j|d}d	| t|j }|S )
a  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nru   zNThe `device` argument is deprecated and will be removed in v5 of Transformers.r   z!Wrong shape for input_ids (shape z) or attention_mask (shape r   r   r   )r   r(   rW   r   warningswarnFutureWarningr   *create_extended_attention_mask_for_decoderr   r_   ra   r+   finfomin)rV   rc   rm   rk   rb   r>  r3   r3   r4   r=  ~  s*   	z6EvollaSaProtProteinEncoder.get_extended_attention_maskr   NN)rp   rq   rr   r&   r@   r4  r6  r:  r#   r   r+   r   r   r   r   ri   r*   rk   rb   r=  rt   r3   r3   rX   r4   r0  Q  s6    r0  c                       s&   e Zd Zd fdd	Zdd Z  ZS )!EvollaSequenceCompressorAttention@      c                    sx   t    |d | _|| _|| }t|| _t|| _tj||dd| _	tj||d dd| _
tj||dd| _d S )Nr   Fr&  ru   )r?   r@   scaler   r   rG   
norm_medianorm_latentsr   to_qto_kvto_out)rV   r(   dim_headr   	inner_dimrX   r3   r4   r@     s   

z*EvollaSequenceCompressorAttention.__init__c                 C   s  |  |}| |}| j}| |}tj||fdd}| |jddd\}}||	d|	d|d
dddd}||	d|	d|d
dddd}||	d|	d|d
dddd}|| j }t||dd}	|	|	jddd	  }	|	j\}
}}}t|||j}|d
d
d
d
d
d
f }|d
d
d
d
d
d
f }|| }|	d|  d}	|	jdd}t||}|
dddd}||	d|	dd}| |S )z
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latent (torch.Tensor): latent features
                shape (b, n2, D);  n2: num of latent tokens
        r}   r'   ru   r<   r   r$   r   Tr(   keepdimNg     )rM  rN  r   rO  r+   rw   rP  rv   r   rl   permuterL  r   r   amaxdetachr_   r<  ra   rk   r\   boolr   r   rQ  )rV   ry   latentsr1   hr   kv_inputr   vsimbsnhskdokdr<  mask_expones_expattnoutr3   r3   r4   ri     s2   




(((

z)EvollaSequenceCompressorAttention.forward)rI  rJ  r   r3   r3   rX   r4   rH    s    rH  c                       s&   e Zd Zd fdd	Zdd Z  ZS )EvollaFeedForward   c                    sT   t    t|| }t|| _tj||dd| _t | _	tj||dd| _
d S NFrK  )r?   r@   r*   r   rG   normr   fc1GELUr  fc2)rV   r(   multrS  rX   r3   r4   r@     s   

zEvollaFeedForward.__init__c              	   C   s   |  | | | |S r   )rm  r  rk  rj  )rV   ry   r3   r3   r4   ri     s   zEvollaFeedForward.forward)rh  r   r3   r3   rX   r4   rg    s    	rg  c                       s*   e Zd Zdef fddZdd Z  ZS )!EvollaSequenceCompressorResamplerrW   c              
      s   t    |jj}|j| _tjt	| j|dd| _
tg | _t|jD ]}| jtt||j|jdt||jdg q%t|j| _t||j| _d S )NT)requires_grad)r(   rR  r   )r(   rn  )r?   r@   protein_encoder_configrC   resampler_num_latentsnum_latentsr   	Parameterr+   randnrZ  r
  layersr  resampler_depthappendrH  resampler_dim_headresampler_headsrg  resampler_ff_multrG   rj  r   protein_projector)rV   rW   protein_repr_dimr   rX   r3   r4   r@      s"   

z*EvollaSequenceCompressorResampler.__init__c                 C   s   |j d }|j \}}t|| j|j}tj||fdd}t|| jj}| jd  |ddd }||j	}| j
D ]\}	}
|	|||| }|
|| }q=| |}| |S )Nr   r$   r'   r<   )r_   r+   r<  rs  ra   rk   rw   rZ  r   rb   rv  r|  rj  )rV   embedsr1   br_  r   latent_maskr<  rZ  re  fftransformed_featurer3   r3   r4   ri     s   



z)EvollaSequenceCompressorResampler.forward)rp   rq   rr   r%   r@   ri   rt   r3   r3   rX   r4   ro    s    ro  c                   @   sf   e Zd ZU dZeej ed< dZeej ed< dZ	ee
ejdf  ed< dZee
ejdf  ed< dS )EvollaProteinEncoderModelOutputNsequence_compressor_outputr  .r   r  )rp   rq   rr   r  r   r+   r   r   r  r   r   r  r3   r3   r3   r4   r  )  s
   
 r  c                       s<   e Zd Zdef fddZedejdejfddZ	  Z
S )EvollaProteinEncoderrW   c                    s(   t    t|jd| _t|d| _d S )Nr  )r?   r@   r0  rq  modelro  sequence_compressor_resamplerrU   rX   r3   r4   r@   3  s   
zEvollaProteinEncoder.__init__r/   rc   c                 K   s.   | j ||d}|j}| ||}t||jdS )Nr;  )r  r  )r  r  r  r  )rV   r/   rc   r   protein_outputprotein_embedssequence_reprr3   r3   r4   ri   8  s   zEvollaProteinEncoder.forward)rp   rq   rr   r%   r@   r    r+   
LongTensorr   ri   rt   r3   r3   rX   r4   r  2  s     r  c                       sl   e Zd Z			ddee dee dee f fddZdd Zed	d
dd							dddZ  Z	S )#EvollaSequenceAlignerCrossAttentionNprotein_encoder_dimstructure_encoder_dimmsa_encoder_dimc                    st  t    |j| _|j| _| jd | _t| j| j | _| j| j | _|j}|j	}|j
}t| j| j| _|d urJt|| j| _t|| j| _nd | _d | _|d uret|| j| _t|| j| _nd | _d | _|d urt|| j| _t|| j| _nd | _d | _t| j| _t|| _tj| j| j|d| _t| j|| _ttdg| _ttdg| _d S )Nr   rK  rZ   ) r?   r@   rC   r   rL  r*   r   r   $aligner_attention_probs_dropout_probaligner_enable_biasaligner_ffn_multr   r   r   key_proteinvalue_proteinkey_structurevalue_structurekey_msa	value_msaEvollaRMSNormattention_normrJ   rL   out_projrg  r  rt  r+   tensorgate_attentiongate_ffw)rV   rW   r  r  r  r   enable_biasffn_multrX   r3   r4   r@   E  s>   
z,EvollaSequenceAlignerCrossAttention.__init__c	                 C   s  |||g}	dd |	D }	|	st dtj|	dd}	| |}
| |
}
| jdur=| jdur=||}| |}| |}nd}d}| jdur[| j	dur[||}| |}| 	|}nd}d}| j
dury| jdury||}| 
|}| |}nd}d}|||g}dd |D }tj|dd}|||g}dd |D }tj|dd}|
 dd	 | j| jf }|
j| d
ddd}
| dd	 | j| jf }|j| d
ddd}| dd	 | j| jf }|j| d
ddd}|
| j }
|du rt|d
|d|j}|ddddddf |	ddddddf  }t|
|d	d}||jd	dd  }|d|  t|jj}tjd	d|}t||}|d
ddd }| dd | j f }|j| }| !|}|S )z
        query_states: text
        key_value_states: protein
        query_states: [bs, query_seq_len, dim]
        key_value_states: [bs, kv_seq_len, dim]
        query_attn_mask: [bs, query_seq_len]
        kv_attn_mask: [bs, kv_seq_len]
        c                 S      g | ]}|d ur|qS r   r3   r  r3   r3   r4   r	        zGEvollaSequenceAlignerCrossAttention.cross_attention.<locals>.<listcomp>z=At least one modality should be provided for cross attention.r$   r'   Nc                 S   r  r   r3   r  r3   r3   r4   r	    r  c                 S   r  r   r3   r  r3   r3   r4   r	    r  r<   r   ru   r   r}   TrT  )"r   r+   rw   r  r   r  r  ra   r  r  r  r  rl   r   r   r   rV  rL  r<  rk   r   r   rW  rX  r\   rY  rE  rb   rF  r   Softmaxr   r   r  )rV   query_statesprotein_key_value_statesstructure_key_value_statesmsa_key_value_statesquery_attn_maskprotein_kv_attn_maskstructure_kv_attn_maskmsa_kv_attn_maskkv_attn_maskr   key_layer_proteinvalue_layer_proteinkey_layer_structurevalue_layer_structurekey_layer_msavalue_layer_msar   r   new_query_layer_shapenew_key_layer_shapenew_value_layer_shaperc   r   attention_scoresattention_probscontext_layernew_context_layer_shaper3   r3   r4   cross_attentionx  s|   












 0

z3EvollaSequenceAlignerCrossAttention.cross_attentionpast_key_valuepast_key_values4.58new_nameversionc              
   C   s  |d ur&|j \}}}|d u r%t|||	j|	j||fdj |j}nd }|d urN|j \}}}|d u rMt|||	j|
j||fdj |j}nd }|d urv|j \}}}|d u rut|||	j|j||fdj |j}nd }|}|d ur| s|d ur| s|d ur| r|}| j||||||||d}t	| j
| }|| }|}| |t	| j }|| }|S )N)rl   )r  r  r  r  r  r  r  r  )r_   r+   r<  ra   rk   rQ   Tanyr  tanhr  r  r  )rV   r  protein_kv_statesstructure_kv_statesmsa_kv_statesr  r  r  r  protein_batch_maskstructure_batch_maskmsa_batch_maskr  r_  protein_kv_seq_lenr(   structure_kv_seq_lenmsa_kv_seq_lenr   residualr3   r3   r4   ri     sf   z+EvollaSequenceAlignerCrossAttention.forward)NNNNNNNNNN)
rp   rq   rr   r   r*   r@   r  r!   ri   rt   r3   r3   rX   r4   r  D  s*    3pr  RMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	r  ư>c                    s&   t    tt|| _|| _dS )z<
        EvollaRMSNorm is equivalent to T5LayerNorm
        N)r?   r@   r   rt  r+   r<  r#  variance_epsilon)rV   rC   r8   rX   r3   r4   r@   5  s   

zEvollaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nru   r<   T)rU  )	rb   ra   r+   r   powr  rsqrtr  r#  )rV   r   input_dtypevariancer3   r3   r4   ri   =  s
   zEvollaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r#  r_   r  r3  r3   r3   r4   
extra_reprD  s   zEvollaRMSNorm.extra_repr)r  )rp   rq   rr   r@   ri   r  rt   r3   r3   rX   r4   r  3  s    r  c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	EvollaRotaryEmbeddingr   NrW   c                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   Fr=   )r?   r@   r   r"  r  dictgetr  rP   max_seq_len_cachedoriginal_max_seq_lenrW   r   rope_init_fnattention_scalingrN   r   original_inv_freq)rV   rW   rk   r   rX   r3   r4   r@   K  s   
zEvollaRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r<   r$   mpscpuF)device_typeenabledru   r'   r   )r   r`   rQ   r_   ra   rk   r"  r  strr+   autocastr   rw   r~   r  r   rb   )
rV   ry   r;   inv_freq_expandedposition_ids_expandedr  r   r   r~   r   r3   r3   r4   ri   \  s   0&zEvollaRotaryEmbedding.forwardr   )rp   rq   rr   r+   r   r   r%   r@   no_gradr   ri   rt   r3   r3   rX   r4   r  H  s   
 
r  c                       r   )	EvollaMLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )NrK  )r?   r@   rW   rC   r   r   r   mlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnrU   rX   r3   r4   r@   m  s   
zEvollaMLP.__init__c                 C   s$   |  | | || | }|S r   )r  r  r  r  )rV   ry   r  r3   r3   r4   ri   w  s    zEvollaMLP.forwardr   r3   r3   rX   r4   r  l  s    
r  c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr<   ru   r'   )r_   r+   rw   rx   r3   r3   r4   rotate_half|  s   r  c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r]   r  )r   r   r~   r   r;   unsqueeze_dimq_embedk_embedr3   r3   r4   apply_rotary_pos_emb  s
   

r  r   n_repr   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r$   N)r_   rQ   r   )r   r  batchnum_key_value_headsslenhead_dimr3   r3   r4   	repeat_kv  s
   0r  c                       s   e Zd ZdZdedef fddZedddd		
	
ddej	de
ej	ej	f deej	 dee deej dee de
ej	ej	f fddZ  ZS )EvollaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrW   r   c                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr  r   TrK  )r?   r@   rW   r   rM   rC   r   r  r  num_key_value_groupsr   attention_dropoutr   r   r   attention_biasq_projk_projv_projo_projrV   rW   r   rX   r3   r4   r@     s(   
zEvollaAttention.__init__r  r  r  r  Nr   rR   rc   cache_positionr   r   c                 K   s$  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| jd|\}}|jg |dR   }| |}||fS )Nr<   r$   ru   )r   r~   r  r   rZ   )rL   r   )r_   r  r  r   r   r  r  r  updater   r   rW   r   r   r   r   r   r   r   r  )rV   r   rR   rc   r  r  r   rm   r   r  
key_statesvalue_statesr~   r   cache_kwargsr   r   r   r3   r3   r4   ri     s8   


zEvollaAttention.forwardrG  )rp   rq   rr   rs   r%   r*   r@   r!   r+   r   r   r   r
   r  r   r   ri   rt   r3   r3   rX   r4   r    s*    r  c                        s   e Zd Zdedef fddZedddd							
																ddejde	ejejf de
ej de
ej de
e de
e de
ej de
ej de
ej de
ej de
ej de
ej de
ej de
ej dejfddZ  ZS )EvollaDecoderLayerrW   r   c                    s   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
|d t|j|j d dkr@t||jd| _d S d S )NrW   r   r7   r$   r   )r  )r?   r@   rC   r  	self_attnr  mlpr  rms_norm_epsinput_layernormpost_attention_layernormmaxr  aligner_num_add_layersr  adapterr  rX   r3   r4   r@     s   

zEvollaDecoderLayer.__init__r  r  r  r  NFr   rR   rc   r;   	use_cacher  r  r  r  r  r  r  r  r   c              
   K   s   |}|  |}| jd|||||||d|\}}|| }|}| |}| |}|| }t| dr?| j|||	|
||||d}|S )N)r   rc   r;   r  r  r  rR   r  )r  r  r  r  r  r  r  r  r3   )r  r  r  r  r   r  )rV   r   rR   rc   r;   r  r  r  r  r  r  r  r  r  r  r   r  r   r3   r3   r4   ri     s<   





zEvollaDecoderLayer.forward)NNNFNNNNNNNN)rp   rq   rr   r%   r*   r@   r!   r+   r   r   r   r  r
   rY  ri   rt   r3   r3   rX   r4   r    s\    	
r  c                       sZ   e Zd ZU eed< dZdZg dZdgZdZ	dZ
dZdZdZeedZ fdd	Z  ZS )
EvollaPreTrainedModelrW   r  T)r  ro  r  r  F)r   r  c                    sj   | j j}t | t|tr#|j  |j  |j	j
jd d S t|tr3|jjjd|d d S d S )Nr   rZ   r  )rW   r!  r?   r*  r"  r  r  r'  r  r  r#  r$  r(  ro  rZ  r%  r)  rX   r3   r4   r*  P  s   



z#EvollaPreTrainedModel._init_weights)rp   rq   rr   r%   r   base_model_prefixsupports_gradient_checkpointingr+  _skip_keys_device_placementr,  r-  _supports_flex_attn_can_compile_fullgraphr.  r  r  r/  r*  rt   r3   r3   rX   r4   r  :  s   
 r  c                !       s   e Zd Zdef fddZdd Zdd Zee 													dd	e	e
j d
e	e
j de	e
j de	e de	e
j de	e de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j de	e
j deeef fddZ  ZS )EvollaModelrW   c                    s   t     j| _ j| _t| j j| j| _t	 d| _
t fddt jD | _t j jd| _t d| _t dd| _|   d S )Nr  c                    s   g | ]}t  |d qS )r  )r  )r  r   r  r3   r4   r	  c  s    z(EvollaModel.__init__.<locals>.<listcomp>r7   r  F)r?   r@   rD   r0   rB   r   rA   rC   embed_tokensr  protein_encoderr
  r  r  rv  r  r  rj  r  
rotary_embrM   r  	post_initrU   rX   r  r4   r@   \  s   

zEvollaModel.__init__c                 C   s   | j S r   r  r3  r3   r3   r4   r4  q  s   z EvollaModel.get_input_embeddingsc                 C   s
   || _ d S r   r"  r5  r3   r3   r4   r6  t     
z EvollaModel.set_input_embeddingsNr/   rc   r;   r  rd   r  r  protein_input_idsprotein_attention_maskstructure_feats	msa_featsr  r  r   c                 K   sJ  |du |duA rt d|du r| |}|r!|du r!t| jd}|du r=|dur-| nd}tj|||jd  |jd}|du rF|	d}d}d}|durj|	durj| j
||	d}|j}tjdg|jd  |jd}t| j||||d	}|}| ||}| jD ]}||f||||||||
|||||d
|}q| |}t||d}|S )a;  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
        structure_feats (torch.FloatTensor):
            The input IDs for purely structure-based features. Should be of shape `(batch_size, structure_seq_length, structure_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        msa_feats (torch.FloatTensor):
            The input IDs for purely MSA-based features. Should be of shape `(batch_size, msa_seq_length, msa_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        structure_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely structure-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `structure_feats`. Dummpy input for now.
        msa_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely MSA-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `msa_feats`. Dummpy input for now.
        Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r$   r   r;  T)rW   input_embedsrc   r  r  )rc   r;   r  r  r  rR   r  r  r  r  r  r  r  )r  r  )r   r  r   rW   get_seq_lengthr+   rO   r_   rk   r]   r  r  r  r   r   rv  rj  r   )rV   r/   rc   r;   r  rd   r  r  r$  r%  r&  r'  r  r  r   past_seen_tokensprotein_featsr  protein_outputsr   r   rR   decoder_layerr   r3   r3   r4   ri   w  sr   !



zEvollaModel.forward)NNNNNNNNNNNNN)rp   rq   rr   r%   r@   r4  r6  r   r#   r   r+   r  r   r
   r   rY  r   r   r   ri   rt   r3   r3   rX   r4   r  [  s`    	

r  c                       s   e Zd Z fddZdd Zdd Zee							ddee	j
 d	ee	j d
ee	j dee	j
 dee	j
 dee	j dee fddZ  ZS )EvollaForProteinText2Textc                    s@   t  | t|| _|j| _tj|j| jdd| _| 	  d S ri  )
r?   r@   r  r  rB   r   r   rC   lm_headr!  rU   rX   r3   r4   r@     s
   
z"EvollaForProteinText2Text.__init__c                 C   s
   | j  S r   )r  r4  r3  r3   r3   r4   r4    r#  z.EvollaForProteinText2Text.get_input_embeddingsc                 C   s   | j |S r   )r  r6  r5  r3   r3   r4   r6    r7  z.EvollaForProteinText2Text.set_input_embeddingsNr/   rc   rd   labelsr$  r%  r  c              	   K   sr   | j d||||||d|}	|	d }
| |
}d}|dur+| jd||| jd|}t|||	j|	j|	jd}|S )a,  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.

        Example:

        ```python
        >>> from transformers import EvollaProcessor, EvollaForProteinText2Text
        >>> model = EvollaForProteinText2Text.from_pretrained("westlake/Evolla-10B-hf")
        >>> processor = EvollaProcessor.from_pretrained("westlake/Evolla-10B-hf")

        >>> protein_information = {
            "aa_seq": "your amino acid sequence",
            "foldseek": "your foldseek sequence",
        }
        >>> question = "What is the function of this protein?"
        >>> message = [
            {"role": "system", "content": "You are an AI expert that can answer any questions about protein."},
            {"role": "user", "content": question},
        ]

        >>> inputs = processor(proteins=[protein_information], messages_list=[message], return_tensors="pt", padding="longest")
        >>> outputs = model.generate(**inputs)

        >>> print(processor.batch_decode(outputs, skip_special_tokens=True))
        ```)r/   rc   rd   r$  r%  r  r   N)logitsr0  rB   )lossr1  r  r   r  r3   )r  r/  loss_functionrB   r   r  r   r  )rV   r/   rc   rd   r0  r$  r%  r  r   outputsr   r1  r2  
lm_outputsr3   r3   r4   ri     s.   *	
z!EvollaForProteinText2Text.forwardr  )rp   rq   rr   r@   r4  r6  r    r   r   r+   r  r   r   rY  ri   rt   r3   r3   rX   r4   r.    s8    r.  )r.  r  r  )rZ   N)Nr$   )]r   rA  dataclassesr   typingr   r   r   r+   r   r   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   r   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r    utils.deprecationr!   utils.genericr"   r#   configuration_evollar%   r&   r5   Moduler6   r|   r   r   r`   r   r   r   r   r   r   r   r   r  r  r  r0  rH  rg  ro  r  r  r  r  r  r  r  r  r*   r  r  r  r  r  r.  __all__r3   r3   r3   r4   <module>   s   a3	
2Y0:#f:* p$
GI  S