o
    i,                     @   sb   d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
 g d	ZG d
d deZdgZdS )z
Processor class for EVOLLA.
    N)OptionalUnion   )BatchFeature)ProcessorMixin   )AutoTokenizer)aa_seqfoldseekmsac                
       s   e Zd ZdZddgZdgZdZdZdZd! fd	d
	Z	d"ddZ
	d#defddZ				d$deeee ef  deeeee  ee f  dee dee fddZdd Zdd Zdd Zdd Z fddZe fdd Z  ZS )%EvollaProcessoran  
    Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor.

    [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
    docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.

    Args:
        protein_tokenizer (`EsmTokenizer`):
            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
        tokenizer (`LlamaTokenizerFast`, *optional*):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text to be generated.
    protein_tokenizer	tokenizersequence_max_lengthr   N      c                    sF   |d u rt d|d u rt dt || d| j_|| _|| _d S )Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__r   	pad_tokenprotein_max_lengthtext_max_length)selfr   r   r   r   kwargs	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/evolla/processing_evolla.pyr   ;   s   
zEvollaProcessor.__init__c           	      C   s`   g }|D ]}| d}| d}ddd t||D }|| q| jj|dd|dd}|S )	Nr	   r
    c                 S   s    g | ]\}}|  |  qS r   )upperlower).0sfr   r   r   
<listcomp>L   s     z4EvollaProcessor.process_proteins.<locals>.<listcomp>ptT)return_tensors
truncation
max_lengthpadding)getjoinzipappendr   batch_encode_plus)	r   proteinsr   sa_sequencesproteinr	   r
   sa_sequence	sa_tokensr   r   r   process_proteinsG   s   


z EvollaProcessor.process_proteinsr   c                 C   sD   g }|D ]}| j j|ddd}|| q| j |dddd|d}|S )NFT)tokenizeadd_generation_promptr%   longest)add_special_tokensr&   r)   r'   r(   )r   apply_chat_templater-   )r   textsr   promptsmessagespromptprompt_inputsr   r   r   process_textT   s"   zEvollaProcessor.process_textr/   messages_listr   c           	      K   s  |du s|du rt d|dur|n| j}|dur|n| j}t|tr&|g}t|ttfr9t|d ttfs9|g}t|ttfrMtdd |D sMt dt|ttfrjtdd |D sjt dd	t	 d
| t|ttfr|D ]9}t|ttfst dt
| dtdd |D st dtdd |D stdd |D rt d| qsn
t dt
| d| ||}| ||}t|d |d |d |d ddS )av  This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by
        the model.

        Args:
            proteins (`Union[List[dict], dict]`):
                A list of dictionaries or a single dictionary containing the following keys:
                    - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
                    - `"foldseek"` (`str`) -- The foldseek string of the protein.
            messages_list (`Union[List[List[dict]], List[dict]]`):
                A list of lists of dictionaries or a list of dictionaries containing the following keys:
                    - `"role"` (`str`) -- The role of the message.
                    - `"content"` (`str`) -- The content of the message.
            protein_max_length (`int`, *optional*, defaults to 1024):
                The maximum length of the sequence to be generated.
            text_max_length (`int`, *optional*, defaults to 512):
                The maximum length of the text.

        Return:
            a dict with following keys:
                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
                - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
                - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
        Nz3You need to specify `messages_list` and `proteins`.r   c                 s       | ]}t |tV  qd S N
isinstancedictr!   pr   r   r   	<genexpr>       z+EvollaProcessor.__call__.<locals>.<genexpr>zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c                 s   s&    | ]}t d d | D V  qdS )c                 s   s    | ]}|t v V  qd S rB   )PROTEIN_VALID_KEYS)r!   kr   r   r   rH      s    z5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>N)allkeysrF   r   r   r   rH      s    
z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c                 s   rA   rB   rC   r!   mr   r   r   rH      rI   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c                 s   s     | ]}t | d kV  qdS )r   N)lenrM   rO   r   r   r   rH      s    c                 s   s$    | ]}t | d dhkV  qdS )rolecontentN)setrM   rO   r   r   r   rH      s    
zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)protein_input_idsprotein_attention_maskrU   rV   )data)r   r   r   rD   rE   listtuplerL   r+   rJ   typeanyr4   r?   r   )	r   r/   r@   r   r   r   r<   r3   text_tokensr   r   r   __call__l   sd   !
  zEvollaProcessor.__call__c                 O      | j j|i |S rB   )r   batch_decoder   argsr   r   r   r   ra         zEvollaProcessor.batch_decodec                 O   r`   rB   )r   decoderb   r   r   r   re      rd   zEvollaProcessor.decodec                 O   r`   rB   )r   ra   rb   r   r   r   protein_batch_decode   rd   z$EvollaProcessor.protein_batch_decodec                 O   r`   rB   )r   re   rb   r   r   r   protein_decode   rd   zEvollaProcessor.protein_decodec                    s   | j tj|| j d| jv }|r| jdnd }|r'|d ur'| jd t	 j|fi |}|r>|d ur>| j
|d |S )Nr   )r   save_pretrainedospathr+   protein_tokenizer_dir_name
attributesindexremover   insert)r   save_directoryr   protein_tokenizer_presentprotein_tokenizer_indexoutputsr   r   r   rh      s   
zEvollaProcessor.save_pretrainedc                    s@   t  j|fi |}t|tr|d }tj|| jd}||_|S )Nr   )	subfolder)r   from_pretrainedrD   r[   r   rk   r   )clspretrained_model_name_or_pathr   	processorr   r   r   r   ru      s   
zEvollaProcessor.from_pretrained)Nr   r   )r   )r   )NNNN)__name__
__module____qualname____doc__rl   valid_kwargsprotein_tokenizer_classtokenizer_classrk   r   r4   intr?   r   r   rZ   rE   r_   ra   re   rf   rg   rh   classmethodru   __classcell__r   r   r   r   r       sB    


Yr   )r|   ri   typingr   r   feature_extraction_utilsr   processing_utilsr   autor   rJ   r   __all__r   r   r   r   <module>   s    
X