o
    ei3                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlmZ ddl	m
Z ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 dd Z7G dd dej8Z9dd Z:dd Z;G dd  d ej8Z<		!djd"ej8d#ej=d$ej=d%ej=d&ej=dB d'e>dB d(e>d)e)e+ fd*d+Z?G d,d- d-ej8Z@G d.d/ d/ej8ZAG d0d1 d1ej8ZBd2d3 ZCG d4d5 d5ej8ZDG d6d7 d7ej8ZEG d8d9 d9eZFG d:d; d;ej8ZGG d<d= d=ej8ZHe,G d>d? d?e'ZIG d@dA dAeIZJG dBdC dCej8ZKG dDdE dEej8ZLG dFdG dGej8ZMee,G dHdI dIe!ZNG dJdK dKej8ZOG dLdM dMej8ZPedNG dOdP dPej8ZQG dQdR dRej8ZRG dSdT dTej8ZSdUdV ZTedWdkdXdYZUdZej=d[eVd\ej=fd]d^ZWeeUG d_d` d`ej8ZXG dadb dbeZYe,G dcdd dde'ZZG dedf dfeZZ[G dgdh dheZeZ\g diZ]dS )l    N)Callable)	dataclass)Optional)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsBaseModelOutputWithPast,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )EvollaConfigSaProtConfigc                 C   s2   |  | }tj|dd|| }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r#   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxmaskincremental_indices r2   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/evolla/modeling_evolla.py"create_position_ids_from_input_ids4   s   r4   c                       s:   e Zd ZdZ fddZ				d	ddZdd Z  ZS )
EvollaSaProtEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    s   t    tj|j|j|jd| _|jrtj	|j|j
d| _nd | _t|j| _t|dd| _| jdt|jddd |j| _| jdkrTtj|j|j| jd| _|j| _|j| _d | _d S )	N)r/   epsposition_embedding_typeabsoluteposition_ids)r#   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr8   register_bufferr*   arangemax_position_embeddingsexpandr/   position_embeddingstoken_dropoutmask_token_idr:   selfconfig	__class__r2   r3   r?   I   s$   


zEvollaSaProtEmbeddings.__init__Nc           
      C   s  |d u r|d urt || j}n| |}|d u r| |}|}| jrc|d urc||| jkdd}d}|d ur=|dn|j	d }|| jkd
 | }|d|  d| d d d d f  |j}| jdkrq| |}	||	 }| jd ur{| |}|d ur||d |j}|S )Nr;           gQ?r#   r9   )r4   r/   &create_position_ids_from_inputs_embedsrD   rR   masked_fillrS   	unsqueezesumshapefloattodtyper8   rQ   rH   )
rU   r.   attention_maskr:   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedrQ   r2   r2   r3   forwardb   s.   

	"



zEvollaSaProtEmbeddings.forwardc                 C   sN   |  dd }|d }tj| jd || j d tj|jd}|d|S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr;   r#   )ra   devicer   )sizer*   rN   r/   r-   ri   r\   rP   )rU   rc   input_shapesequence_lengthr:   r2   r2   r3   rZ      s   	z=EvollaSaProtEmbeddings.create_position_ids_from_inputs_embedsNNNN)__name__
__module____qualname____doc__r?   rh   rZ   __classcell__r2   r2   rW   r3   r5   D   s    
1r5   c                 C   s&   | j ddd\}}tj| |fddS )N   r;   r&   )chunkr*   catxx1x2r2   r2   r3   rotate_half_esm   s   rz   c                 C   s`   |d d d d d | j d d d f }|d d d d d | j d d d f }| | t| |  S )N)r^   rz   )rw   cossinr2   r2   r3   apply_rotary_pos_emb_esm   s   &&r~   c                       sb   e Zd ZU dZejed< def fddZdddZ	d	ejd
ejde
ejejf fddZ  ZS )EvollaSaProtRotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    inv_freqr'   c                    sV   t    || _ddtjd|dtjd |   }| d| d | _d | _	d | _
d S )N      ?'  r   rs   ra   r   )r>   r?   r'   r*   rN   int64r_   rM   _seq_len_cached_cos_cached_sin_cached)rU   r'   r   rW   r2   r3   r?      s   
$
z$EvollaSaProtRotaryEmbedding.__init__rs   c                 C   s   |j | }|| jks| jj|jkrU|| _tj|j | |jd| j}t|| j}tj	||fdd
|j}| d d d d d d f | _| d d d d d d f | _| j| jfS )Nri   r;   r&   )r^   r   r   ri   r*   rN   r,   r   outerru   r`   r|   r}   r   )rU   rw   seq_dimensionseq_lentfreqsembr2   r2   r3   _update_cos_sin_tables   s   
z2EvollaSaProtRotaryEmbedding._update_cos_sin_tablesqkreturnc                 C   sJ   | j |dd\| _| _t|| j| jj|jdt|| j| jj|jdfS )Nr{   )r   r   )r   r   r   r~   r`   ra   )rU   r   r   r2   r2   r3   rh      s   z#EvollaSaProtRotaryEmbedding.forward)rs   )rn   ro   rp   rq   r*   Tensor__annotations__r)   r?   r   tuplerh   rr   r2   r2   rW   r3   r      s   
 

.r   rY   modulequerykeyvaluerb   scalingrK   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )Nr;         rs   r   r&   )ptrainingr#   )
rj   r*   matmul	transposer   
functionalsoftmaxrK   r   
contiguous)
r   r   r   r   rb   r   rK   r   attn_weightsattn_outputr2   r2   r3   eager_attention_forward   s   
r   c                       sf   e Zd Zd fdd	Z			ddejdejdB dejdB dejdB d	ee d
e	ej fddZ
  ZS )EvollaSaProtSelfAttentionNFc                    s   t    || _|j|j dkr"t|ds"td|j d|j d|j| _t|j|j | _| j| j | _	t
|j| j	| _t
|j| j	| _t
|j| j	| _|j| _d | _|p_t|dd| _| jdkrmt| jd	| _|j| _|| _d
| _| jo|| | _d S )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r8   r9   rotaryr&   r   )r>   r?   rV   rB   num_attention_headshasattr
ValueErrorr)   attention_head_sizeall_head_sizer   Linearr   r   r   attention_probs_dropout_probrK   rotary_embeddingsrL   r8   r   
is_decoder	layer_idxr   	is_causal)rU   rV   r8   r   is_cross_attentionrW   r2   r3   r?      s2   


z"EvollaSaProtSelfAttention.__init__hidden_statesrb   encoder_hidden_statesencoder_attention_maskr   r   c                 K   s
  |j d d \}}||d| jf}| ||dd}	|d u}
|
r$|n|}|
r*|n|}| ||dd}| ||dd}|	| jd  }	| jdkrX| |	|\}	}t	
| jjt}|| |	|||f| jsldn| j| jd|\}}|||d }||fS )Nr;   r#   rs   r   r   rY   rK   r   )r^   r   r   viewr   r   r   r8   r   r   get_interfacerV   _attn_implementationr   r   rK   r   reshaper   )rU   r   rb   r   r   r   
batch_size
seq_lengthhidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   r2   r2   r3   rh     s8   

z!EvollaSaProtSelfAttention.forward)NNFNNN)rn   ro   rp   r?   r*   r   FloatTensorr   r   r   rh   rr   r2   r2   rW   r3   r      s$    #r   c                       $   e Zd Z fddZdd Z  ZS )EvollaSaProtSelfOutputc                    s.   t    t|j|j| _t|j| _d S N)	r>   r?   r   r   rB   denserI   rJ   rK   rT   rW   r2   r3   r?   K     
zEvollaSaProtSelfOutput.__init__c                 C       |  |}| |}|| }|S r   r   rK   rU   r   input_tensorr2   r2   r3   rh   P     

zEvollaSaProtSelfOutput.forwardrn   ro   rp   r?   rh   rr   r2   r2   rW   r3   r   J      r   c                       s8   e Zd Zd fdd	Z			d	dee fddZ  ZS )
EvollaSaProtAttentionNFc                    s<   t    t|||d| _t|| _tj|j|j	d| _d S )N)r   r   r6   )
r>   r?   r   rU   r   outputr   rF   rB   rG   )rU   rV   r   r   rW   r2   r3   r?   X  s   

zEvollaSaProtAttention.__init__r   c           	      K   s8   |  |}| j|f|||d|\}}| ||}|S )Nrb   r   r   )rF   rU   r   )	rU   r   rb   r   r   r   hidden_states_lnr   _r2   r2   r3   rh   _  s   

zEvollaSaProtAttention.forward)NFr   )rn   ro   rp   r?   r   r   rh   rr   r2   r2   rW   r3   r   W  s    
r   c                 C   s    | d dt | td   S )zz
    This is the gelu implementation from the original EVOLLA_SA_PROT repo. Using F.gelu yields subtly wrong results.
    g      ?r   g       @)r*   erfmathsqrt)rw   r2   r2   r3   gelus  s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )EvollaSaProtIntermediatec                    s    t    t|j|j| _d S r   )r>   r?   r   r   rB   intermediate_sizer   rT   rW   r2   r3   r?   {  s   
z!EvollaSaProtIntermediate.__init__r   r   c                 C   s   |  |}t|}|S r   )r   r   )rU   r   r2   r2   r3   rh     s   
z EvollaSaProtIntermediate.forwardrn   ro   rp   r?   r*   r   rh   rr   r2   r2   rW   r3   r   z  s    r   c                       r   )EvollaSaProtOutputc                    s.   t    t|j|j| _t|j| _	d S r   )
r>   r?   r   r   r   rB   r   rI   rJ   rK   rT   rW   r2   r3   r?     r   zEvollaSaProtOutput.__init__c                 C   r   r   r   r   r2   r2   r3   rh     r   zEvollaSaProtOutput.forwardr   r2   r2   rW   r3   r     r   r   c                       s>   e Zd Z fddZ			d	dee fddZdd Z  ZS )
EvollaSaProtLayerc                    s   t    |j| _d| _t|| _|j| _|j| _| jr-| js&t|  dt|dd| _	t
|| _t|| _tj|j|jd| _d S )Nr#   z> should be used as a decoder model if cross attention is addedT)r   r6   )r>   r?   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionRuntimeErrorcrossattentionr   intermediater   r   r   rF   rB   rG   rT   rW   r2   r3   r?     s   



zEvollaSaProtLayer.__init__Nr   c                 K   sf   | j |fd|i|}| jr,|d ur,t| dstd|  d| j|f|||d|}| |}|S )Nrb   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r   r   r   AttributeErrorr   feed_forward_chunk)rU   r   rb   r   r   r   attention_outputlayer_outputr2   r2   r3   rh     s.   


zEvollaSaProtLayer.forwardc                 C   s$   |  |}| |}| ||}|S r   )rF   r   r   )rU   r   attention_output_lnintermediate_outputr   r2   r2   r3   r     s   

z$EvollaSaProtLayer.feed_forward_chunkr   )	rn   ro   rp   r?   r   r   rh   r   rr   r2   r2   rW   r3   r     s    
 r   c                       s:   e Zd Z fddZe			ddee fddZ  ZS )EvollaSaProtEncoderc                    sN   t     | _t fddt jD | _tj j	 j
d| _d| _d S )Nc                    s   g | ]}t  qS r2   )r   .0r   rV   r2   r3   
<listcomp>  s    z0EvollaSaProtEncoder.__init__.<locals>.<listcomp>r6   F)r>   r?   rV   r   
ModuleListrangenum_hidden_layerslayerrF   rB   rG   emb_layer_norm_aftergradient_checkpointingrT   rW   r   r3   r?     s
   
 
zEvollaSaProtEncoder.__init__Nr   c                 K   sF   t | jD ]\}}||f|||d|}q| jr| |}t|dS )Nr   )last_hidden_state)	enumerater   r   r   )rU   r   rb   r   r   r   ilayer_moduler2   r2   r3   rh     s   	

zEvollaSaProtEncoder.forwardr   )	rn   ro   rp   r?   r   r   r   rh   rr   r2   r2   rW   r3   r     s    r   c                       r   )EvollaSaProtPoolerc                    s*   t    t|j|j| _t | _d S r   )r>   r?   r   r   rB   r   Tanh
activationrT   rW   r2   r3   r?     s   
zEvollaSaProtPooler.__init__r   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )rU   r   first_token_tensorpooled_outputr2   r2   r3   rh     s   

zEvollaSaProtPooler.forwardr   r2   r2   rW   r3   r     s    r   c                       s`   e Zd ZU eed< dgZdZdZdZdZ	e
eedddgeedddgdZ fd	d
Z  ZS )EvollaSaProtPreTrainedModelrV   r   Tr#   r   )index
layer_namer   )r   
attentionscross_attentionsc                    sT   t  | t|tr(ddtjd|jdtjd |j   }t	
|j| d S d S )Nr   r   r   rs   r   )r>   _init_weights
isinstancer   r*   rN   r'   r   r_   initcopy_r   )rU   r   r   rW   r2   r3   r
  	  s
   
(z)EvollaSaProtPreTrainedModel._init_weights)rn   ro   rp   r%   r   _no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r!   r   _can_record_outputsr
  rr   r2   r2   rW   r3   r    s   
 r  c                
       sj   e Zd Zdef fddZdd Zdd Zee	dd	e	j
dB d
e	j
dB dee	j
 eB fddZ  ZS )EvollaSaProtProteinEncoderrV   c                    s,   t  | t|| _t|| _|   d S r   )r>   r?   r5   rd   r   encoder	post_initrT   rW   r2   r3   r?     s   

z#EvollaSaProtProteinEncoder.__init__c                 C   s   | j jS r   rd   rD   rU   r2   r2   r3   get_input_embeddings  s   z/EvollaSaProtProteinEncoder.get_input_embeddingsc                 C   s   || j _d S r   r  rU   r   r2   r2   r3   set_input_embeddings     z/EvollaSaProtProteinEncoder.set_input_embeddingsNr.   rb   r   c                 K   s   |  }|\}}|j}|d u rtj||f|d}| j||d}t| j||d}| j|fd|i|}	|	d }
t|
|	j	|	j
|	jdS )Nr   r.   rb   )rV   rc   rb   rb   r   )r   r   r  r	  )rj   ri   r*   onesrd   r   rV   r  r   r   r  r	  )rU   r.   rb   r   rk   r   r   ri   rc   encoder_outputssequence_outputr2   r2   r3   rh     s&   z"EvollaSaProtProteinEncoder.forwardr   )rn   ro   rp   r%   r?   r  r  r    r"   r*   r   r   r   rh   rr   r2   r2   rW   r3   r    s    r  c                       s&   e Zd Zd fdd	Zdd Z  ZS )!EvollaSequenceCompressorAttention@      c                    sx   t    |d | _|| _|| }t|| _t|| _tj||dd| _	tj||d dd| _
tj||dd| _d S )Nr   Fbiasrs   )r>   r?   scaleheadsr   rF   
norm_medianorm_latentsr   to_qto_kvto_out)rU   r'   dim_headr'  	inner_dimrW   r2   r3   r?   ?  s   

z*EvollaSequenceCompressorAttention.__init__c                 C   s  |  |}| |}| j}| |}tj||fdd}| |jddd\}}||	d|	d|d
dddd}||	d|	d|d
dddd}||	d|	d|d
dddd}|| j }t||dd}	|	|	jddd	  }	|	j\}
}}}t|||j}|d
d
d
d
d
d
f }|d
d
d
d
d
d
f }|| }|	d|  d}	|	jdd}t||}|
dddd}||	d|	dd}| |S )z
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latent (torch.Tensor): latent features
                shape (b, n2, D);  n2: num of latent tokens
        r{   r&   rs   r;   r   r#   r   Tr'   keepdimNg     )r(  r)  r'  r*  r*   ru   r+  rt   r   rj   permuter&  r   r   amaxdetachr^   r  r`   ri   r[   boolr   r   r,  )rU   rw   latentsr0   hr   kv_inputr   vsimbsnhskdokdr  mask_expones_expattnoutr2   r2   r3   rh   L  s2   




(((

z)EvollaSequenceCompressorAttention.forward)r"  r#  r   r2   r2   rW   r3   r!  >  s    r!  c                       s&   e Zd Zd fdd	Zdd Z  ZS )EvollaFeedForward   c                    sT   t    t|| }t|| _tj||dd| _t | _	tj||dd| _
d S NFr$  )r>   r?   r)   r   rF   normr   fc1GELUr  fc2)rU   r'   multr.  rW   r2   r3   r?   y  s   

zEvollaFeedForward.__init__c              	   C   s   |  | | | |S r   )rH  r  rF  rE  )rU   rw   r2   r2   r3   rh     s   zEvollaFeedForward.forward)rC  r   r2   r2   rW   r3   rB  x  s    	rB  c                       s*   e Zd Zdef fddZdd Z  ZS )!EvollaSequenceCompressorResamplerrV   c              
      s   t    |jj}|j| _tjt	| j|dd| _
tg | _t|jD ]}| jtt||j|jdt||jdg q%t|j| _t||j| _d S )NT)requires_grad)r'   r-  r'  )r'   rI  )r>   r?   protein_encoder_configrB   resampler_num_latentsnum_latentsr   	Parameterr*   randnr5  r   layersr   resampler_depthappendr!  resampler_dim_headresampler_headsrB  resampler_ff_multrF   rE  r   protein_projector)rU   rV   protein_repr_dimr   rW   r2   r3   r?     s"   

z*EvollaSequenceCompressorResampler.__init__c                 C   s   |j d }|j \}}t|| j|j}tj||fdd}t|| jj}| jd  |ddd }||j	}| j
D ]\}	}
|	|||| }|
|| }q=| |}| |S )Nr   r#   r&   r;   )r^   r*   r  rN  r`   ri   ru   r5  r   ra   rQ  rW  rE  )rU   embedsr0   br:  r   latent_maskr  r5  r@  fftransformed_featurer2   r2   r3   rh     s   



z)EvollaSequenceCompressorResampler.forward)rn   ro   rp   r$   r?   rh   rr   r2   r2   rW   r3   rJ    s    rJ  c                   @   sf   e Zd ZU dZejdB ed< dZejdB ed< dZe	ejdf dB ed< dZ
e	ejdf dB ed< dS )EvollaProteinEncoderModelOutputNsequence_compressor_outputr   .r   r  )rn   ro   rp   r_  r*   r   r   r   r   r   r  r2   r2   r2   r3   r^    s
   
 r^  c                       s<   e Zd Zdef fddZedejdejfddZ	  Z
S )EvollaProteinEncoderrV   c                    s(   t    t|jd| _t|d| _d S )Nr   )r>   r?   r  rL  modelrJ  sequence_compressor_resamplerrT   rW   r2   r3   r?     s   
zEvollaProteinEncoder.__init__r.   rb   c                 K   s.   | j ||d}|j}| ||}t||jdS )Nr  )r_  r   )ra  r   rb  r^  )rU   r.   rb   r   protein_outputprotein_embedssequence_reprr2   r2   r3   rh     s   zEvollaProteinEncoder.forward)rn   ro   rp   r$   r?   r   r*   
LongTensorr   rh   rr   r2   r2   rW   r3   r`    s     r`  c                       s^   e Zd Z			ddedB dedB dedB f fddZdd Z							dd	d
Z  ZS )#EvollaSequenceAlignerCrossAttentionNprotein_encoder_dimstructure_encoder_dimmsa_encoder_dimc                    st  t    |j| _|j| _| jd | _t| j| j | _| j| j | _|j}|j	}|j
}t| j| j| _|d urJt|| j| _t|| j| _nd | _d | _|d uret|| j| _t|| j| _nd | _d | _|d urt|| j| _t|| j| _nd | _d | _t| j| _t|| _tj| j| j|d| _t| j|| _ttdg| _ttdg| _d S )Nr   r$  rY   ) r>   r?   rB   r   r&  r)   r   r   $aligner_attention_probs_dropout_probaligner_enable_biasaligner_ffn_multr   r   r   key_proteinvalue_proteinkey_structurevalue_structurekey_msa	value_msaEvollaRMSNormattention_normrI   rK   out_projrB  r\  rO  r*   tensorgate_attentiongate_ffw)rU   rV   rh  ri  rj  r   enable_biasffn_multrW   r2   r3   r?     s>   
z,EvollaSequenceAlignerCrossAttention.__init__c	                 C   s  |||g}	dd |	D }	|	st dtj|	dd}	| |}
| |
}
| jdur=| jdur=||}| |}| |}nd}d}| jdur[| j	dur[||}| |}| 	|}nd}d}| j
dury| jdury||}| 
|}| |}nd}d}|||g}dd |D }tj|dd}|||g}dd |D }tj|dd}|
 dd	 | j| jf }|
j| d
ddd}
| dd	 | j| jf }|j| d
ddd}| dd	 | j| jf }|j| d
ddd}|
| j }
|du rt|d
|d|j}|ddddddf |	ddddddf  }t|
|d	d}||jd	dd  }|d|  t|jj}tjd	d|}t||}|d
ddd }| dd | j f }|j| }| !|}|S )z
        query_states: text
        key_value_states: protein
        query_states: [bs, query_seq_len, dim]
        key_value_states: [bs, kv_seq_len, dim]
        query_attn_mask: [bs, query_seq_len]
        kv_attn_mask: [bs, kv_seq_len]
        c                 S      g | ]}|d ur|qS r   r2   r   r2   r2   r3   r         zGEvollaSequenceAlignerCrossAttention.cross_attention.<locals>.<listcomp>z=At least one modality should be provided for cross attention.r#   r&   Nc                 S   r|  r   r2   r   r2   r2   r3   r   ;  r}  c                 S   r|  r   r2   r   r2   r2   r3   r   ?  r}  r;   r   rs   r   r{   Tr/  )"r   r*   ru   ru  r   rn  ro  r`   rp  rq  rr  rs  rj   r   r   r   r1  r&  r  ri   r   r   r2  r3  r[   r4  finfora   minr   Softmaxr   r   rv  )rU   query_statesprotein_key_value_statesstructure_key_value_statesmsa_key_value_statesquery_attn_maskprotein_kv_attn_maskstructure_kv_attn_maskmsa_kv_attn_maskkv_attn_maskr   key_layer_proteinvalue_layer_proteinkey_layer_structurevalue_layer_structurekey_layer_msavalue_layer_msar   r   new_query_layer_shapenew_key_layer_shapenew_value_layer_shaperb   r   attention_scoresattention_probscontext_layernew_context_layer_shaper2   r2   r3   cross_attention  s|   












 0

z3EvollaSequenceAlignerCrossAttention.cross_attentionc              
   C   s  |d ur&|j \}}}|d u r%t|||	j|	j||fdj |j}nd }|d urN|j \}}}|d u rMt|||	j|
j||fdj |j}nd }|d urv|j \}}}|d u rut|||	j|j||fdj |j}nd }|}|d ur| s|d ur| s|d ur| r|}| j||||||||d}t	| j
| }|| }|}| |t	| j }|| }|S )N)rj   )r  r  r  r  r  r  r  r  )r^   r*   r  r`   ri   rP   Tanyr  tanhrx  r\  ry  )rU   r  protein_kv_statesstructure_kv_statesmsa_kv_statesr  r  r  r  protein_batch_maskstructure_batch_maskmsa_batch_maskpast_key_valuesr:  protein_kv_seq_lenr'   structure_kv_seq_lenmsa_kv_seq_lenr   residualr2   r2   r3   rh   o  sf   z+EvollaSequenceAlignerCrossAttention.forwardr   )NNNNNNN)rn   ro   rp   r)   r?   r  rh   rr   r2   r2   rW   r3   rg    s(    3wrg  RMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )rt  ư>r7   r   Nc                    s&   t    tt|| _|| _dS )z<
        EvollaRMSNorm is equivalent to T5LayerNorm
        N)r>   r?   r   rO  r*   r  weightvariance_epsilon)rU   rB   r7   rW   r2   r3   r?     s   

zEvollaRMSNorm.__init__r   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nrs   r;   T)r0  )	ra   r`   r*   float32powmeanrsqrtr  r  )rU   r   input_dtypevariancer2   r2   r3   rh     s
   zEvollaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r  r^   r  r  r2   r2   r3   
extra_repr  s   zEvollaRMSNorm.extra_repr)r  )
rn   ro   rp   r_   r?   r*   r   rh   r  rr   r2   r2   rW   r3   rt    s    rt  c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )EvollaRotaryEmbeddingr   NrV   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   Fr<   original_inv_freq)r>   r?   rO   max_seq_len_cachedoriginal_max_seq_lenrV   rope_parametersr  compute_default_rope_parametersr   attention_scalingrM   clone)rU   rV   ri   rope_init_fnr   rW   r2   r3   r?     s   


zEvollaRotaryEmbedding.__init__ri   ztorch.devicer   r   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNr   r   rs   r   ri   ra   )	r  rL   rB   r   r*   rN   r   r`   r_   )rV   ri   r   baser'   attention_factorr   r2   r2   r3   r    s   
&z5EvollaRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r;   r#   mpscpuF)device_typeenabledrs   r&   r   )r   r_   rP   r^   r`   ri   r  typestrr   r   r*   ru   r|   r  r}   ra   )
rU   rw   r:   inv_freq_expandedposition_ids_expandedr  r   r   r|   r}   r2   r2   r3   rh     s   0&zEvollaRotaryEmbedding.forwardr   r   )rn   ro   rp   r*   r   r   r$   r?   staticmethodr   r)   r   r_   r  no_gradr   rh   rr   r2   r2   rW   r3   r    s&   
 

r  c                       r   )	EvollaMLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )Nr$  )r>   r?   rV   rB   r   r   r   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnrT   rW   r2   r3   r?     s   
zEvollaMLP.__init__c                 C   s$   |  | | || | }|S r   )r  r  r  r  )rU   rw   r  r2   r2   r3   rh     s    zEvollaMLP.forwardr   r2   r2   rW   r3   r    s    
r  c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr;   rs   r&   )r^   r*   ru   rv   r2   r2   r3   rotate_half  s   r  rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r\   r  )r   r   r|   r}   unsqueeze_dimq_embedk_embedr2   r2   r3   apply_rotary_pos_emb&  s
   

r  r   n_repr   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r#   N)r^   rP   r   )r   r  batchnum_key_value_headsslenr  r2   r2   r3   	repeat_kv@  s
   0r  c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejf fddZ  ZS )EvollaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrV   r   c                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr  r   Tr$  )r>   r?   rV   r   rL   rB   r   r  r  num_key_value_groupsr   attention_dropoutr   r   r   attention_biasq_projk_projv_projo_projrU   rV   r   rW   r2   r3   r?   P  s(   
zEvollaAttention.__init__Nr   rQ   rb   r  cache_positionr   r   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jd|\}}|jg |dR   }| |}||fS )Nr;   r#   rs   )r}   r|   r  rY   r   )r^   r  r  r   r   r  r  r  updater   r   r   rV   r   r   r   r  r   r   r   r  )rU   r   rQ   rb   r  r  r   rk   r   r  
key_statesvalue_statesr|   r}   cache_kwargsr   r   r   r2   r2   r3   rh   g  s8   	

zEvollaAttention.forwardrm   )rn   ro   rp   rq   r$   r)   r?   r*   r   r   r	   rf  r   r   rh   rr   r2   r2   rW   r3   r  L  s,    r  c                       s   e Zd Zdedef fddZ													ddejdeejejf dB d	ejdB d
ej	dB de
dB dedB dej	dB dejdB dejdB dejdB dejdB dejdB dejdB dejdB dejfddZ  ZS )EvollaDecoderLayerrV   r   c                    s   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
|d t|j|j d dkr@t||jd| _d S d S )NrV   r   r6   r#   r   )rh  )r>   r?   rB   r  	self_attnr  mlprt  rms_norm_epsinput_layernormpost_attention_layernormmaxr   aligner_num_add_layersrg  adapterr  rW   r2   r3   r?     s   

zEvollaDecoderLayer.__init__NFr   rQ   rb   r:   r  	use_cacher  r  r  r  r  r  r  r  r   c              
   K   s   |}|  |}| jd|||||||d|\}}|| }|}| |}| |}|| }t| dr?| j|||	|
||||d}|S )N)r   rb   r:   r  r  r  rQ   r  )r  r  r  r  r  r  r  r  r2   )r  r  r  r  r   r  )rU   r   rQ   rb   r:   r  r  r  r  r  r  r  r  r  r  r   r  r   r2   r2   r3   rh     s<   





zEvollaDecoderLayer.forward)NNNNFNNNNNNNN)rn   ro   rp   r$   r)   r?   r*   r   r   rf  r	   r4  rh   rr   r2   r2   rW   r3   r    s\    	
r  c                       sb   e Zd ZU eed< dZdZg dZdgZdZ	dZ
dZdZdZeedZe  fdd	Z  ZS )
EvollaPreTrainedModelrV   ra  T)r  rJ  rg  r  F)r   r  c                    sl   | j j}t | t|tr$t|j t|j	 t
|jj d S t|tr4tj|jd|d d S d S )NrY   )r  std)rV   initializer_ranger>   r
  r  rg  r  zeros_rx  ry  ones_ru  r  rJ  normal_r5  )rU   r   r  rW   r2   r3   r
    s   

z#EvollaPreTrainedModel._init_weights)rn   ro   rp   r$   r   base_model_prefixsupports_gradient_checkpointingr  _skip_keys_device_placementr  r  r  _can_compile_fullgraphr  r  r  r  r*   r  r
  rr   r2   r2   rW   r3   r    s    
 r  c                !       s   e Zd Zdef fddZdd Zdd Zeee														dd	e
jdB d
e
jdB de
jdB dedB de
jdB dedB de
jdB de
jdB de
jdB de
jdB de
jdB de
jdB de
jdB deeB fddZ  ZS )EvollaModelrV   c                    s   t     j| _ j| _t| j j| j| _t	 d| _
t fddt jD | _t j jd| _t dd| _t d| _|   d S )Nr   c                    s   g | ]}t  |d qS )r  )r  )r   r   r   r2   r3   r     s    z(EvollaModel.__init__.<locals>.<listcomp>r6   r   F)r>   r?   rC   r/   rA   r   r@   rB   embed_tokensr`  protein_encoderr   r   r   rQ  rt  r  rE  rL   r   r  
rotary_embr  rT   rW   r   r3   r?     s   

zEvollaModel.__init__c                 C   s   | j S r   r  r  r2   r2   r3   r    s   z EvollaModel.get_input_embeddingsc                 C   s
   || _ d S r   r  r  r2   r2   r3   r       
z EvollaModel.set_input_embeddingsNr.   rb   r:   r  rc   r  r  protein_input_idsprotein_attention_maskstructure_feats	msa_featsr  r  r   c                 K   sJ  |du |duA rt d|du r| |}|r!|du r!t| jd}|du r=|dur-| nd}tj|||jd  |jd}|du rF|	d}d}d}|duri|	duri| j
||	d}|j}tj|jd |jtjd}t| j||||d	}|}| j||d
}| jD ]}||f|||||||
||||||d|}q| |}t||d}|S )a;  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
        structure_feats (torch.FloatTensor):
            The input IDs for purely structure-based features. Should be of shape `(batch_size, structure_seq_length, structure_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        msa_feats (torch.FloatTensor):
            The input IDs for purely MSA-based features. Should be of shape `(batch_size, msa_seq_length, msa_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
        structure_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely structure-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `structure_feats`. Dummpy input for now.
        msa_batch_mask (torch.Tensor):
            The batch mask to decide which protein sequences are purely MSA-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `msa_feats`. Dummpy input for now.
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r#   r   r  r  )rV   rc   rb   r  r  )r:   )rb   r:   r  r  r  r  r  r  r  r  r  r  rQ   )r   r  )r   r  r
   rV   get_seq_lengthr*   rN   r^   ri   r\   r  r_  r  r4  r   r  rQ  rE  r   )rU   r.   rb   r:   r  rc   r  r  r  r  r  r	  r  r  r   past_seen_tokensprotein_featsr  protein_outputscausal_maskr   rQ   decoder_layerr   r2   r2   r3   rh     sz   "



zEvollaModel.forward)NNNNNNNNNNNNN)rn   ro   rp   r$   r?   r  r  r   r    r"   r*   rf  r   r	   r   r4  r   r   rh   rr   r2   r2   rW   r3   r     sb    	
r   c                       s   e Zd Z fddZdd Zdd Zee								dd	ej	dB d
ej
dB dejdB dej	dB dej	dB dej
dB dedB deej
B fddZ  ZS )EvollaForProteinText2Textc                    s@   t  | t|| _|j| _tj|j| jdd| _| 	  d S rD  )
r>   r?   r   ra  rA   r   r   rB   lm_headr  rT   rW   r2   r3   r?     s
   
z"EvollaForProteinText2Text.__init__c                 C   s
   | j  S r   )ra  r  r  r2   r2   r3   r    r  z.EvollaForProteinText2Text.get_input_embeddingsc                 C   s   | j |S r   )ra  r  r  r2   r2   r3   r    r  z.EvollaForProteinText2Text.set_input_embeddingsNr   r.   rb   rc   labelsr  r  r  logits_to_keepc	              	   K   s   | j d||||||d|	}
|
j}t|trt| dn|}| |dd|ddf }d}|dur@| jd||| jd|	}t|||
j	|
j
|
jd}|S )a,  
        protein_input_ids (torch.LongTensor):
            The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
        protein_attention_mask (torch.Tensor):
            The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.

        Example:

        ```python
        >>> from transformers import EvollaProcessor, EvollaForProteinText2Text
        >>> model = EvollaForProteinText2Text.from_pretrained("westlake/Evolla-10B-hf")
        >>> processor = EvollaProcessor.from_pretrained("westlake/Evolla-10B-hf")

        >>> protein_information = {
            "aa_seq": "your amino acid sequence",
            "foldseek": "your foldseek sequence",
        }
        >>> question = "What is the function of this protein?"
        >>> message = [
            {"role": "system", "content": "You are an AI expert that can answer any questions about protein."},
            {"role": "user", "content": question},
        ]

        >>> inputs = processor(proteins=[protein_information], messages_list=[message], return_tensors="pt", padding="longest")
        >>> outputs = model.generate(**inputs)

        >>> print(processor.batch_decode(outputs, skip_special_tokens=True))
        ```)r.   rb   rc   r  r  r  N)logitsr  rA   )lossr  r  r   r  r2   )ra  r   r  r)   slicer  loss_functionrA   r   r  r   r  )rU   r.   rb   rc   r  r  r  r  r  r   outputsr   slice_indicesr  r  
lm_outputsr2   r2   r3   rh     s0   *
z!EvollaForProteinText2Text.forward)NNNNNNNr   )rn   ro   rp   r?   r  r  r   r   r*   rf  r   r   r4  r)   rh   rr   r2   r2   rW   r3   r    s>    	r  )r  r   r  )NrY   )r#   )^r   collections.abcr   dataclassesr   typingr   r*   r    r   r  activationsr   cache_utilsr	   r
   
generationr   integrationsr   r   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r    utils.output_capturingr!   r"   configuration_evollar$   r%   r4   Moduler5   rz   r~   r   r   r_   r   r   r   r   r   r   r   r   r   r   r  r  r!  rB  rJ  r^  r`  rg  rt  r  r  r  r  r)   r  r  r  r  r   r  __all__r2   r2   r2   r3   <module>   s   a3
P7 .:* oAFH! V