o
    ei'                      @   sN   d Z ddlmZ ddlmZ ddlmZ g dZeG dd deZdgZ	dS )	z
Processor class for EVOLLA.
   )BatchFeature)ProcessorMixin)auto_docstring)aa_seqfoldseekmsac                       s   e Zd Zd fdd	ZdddZ	ddefd	d
Ze				ddee	 e	B dB deee	  ee	 B dB dedB dedB fddZ
dd Zdd Zdd Zdd Z  ZS )EvollaProcessorN      c                    sF   |du rt d|du rt dt || d| j_|| _|| _dS )a  
        protein_tokenizer (`EsmTokenizer`):
            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text to be generated.
        Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__	tokenizer	pad_tokenprotein_max_lengthtext_max_length)selfprotein_tokenizerr   r   r   kwargs	__class__ j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/evolla/processing_evolla.pyr      s   	
zEvollaProcessor.__init__c           	      C   s^   g }|D ]}| d}| d}ddd t||D }|| q| j|dd|dd}|S )	Nr   r    c                 S   s    g | ]\}}|  |  qS r   )upperlower).0sfr   r   r   
<listcomp>7   s     z4EvollaProcessor.process_proteins.<locals>.<listcomp>ptT)return_tensors
truncation
max_lengthpadding)getjoinzipappendr   )	r   proteinsr   sa_sequencesproteinr   r   sa_sequence	sa_tokensr   r   r   process_proteins2   s   


z EvollaProcessor.process_proteinsr   c                 C   sD   g }|D ]}| j j|ddd}|| q| j |dddd|d}|S )NFT)tokenizeadd_generation_promptr    longest)add_special_tokensr!   r$   r"   r#   )r   apply_chat_templater(   )r   textsr   promptsmessagespromptprompt_inputsr   r   r   process_text?   s"   zEvollaProcessor.process_textr)   messages_listr   c           	      K   s  |du s|du rt d|dur|n| j}|dur|n| j}t|tr&|g}t|ttfr9t|d ttfs9|g}t|ttfrMtdd |D sMt dt|ttfrjtdd |D sjt dd	t	 d
| t|ttfr|D ]9}t|ttfst
dt| dtdd |D st dtdd |D stdd |D rt d| qsn
t dt| d| ||}| ||}t|d |d |d |d ddS )a  
        proteins (`Union[List[dict], dict]`):
            A list of dictionaries or a single dictionary containing the following keys:
                - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
                - `"foldseek"` (`str`) -- The foldseek string of the protein.
        messages_list (`Union[List[List[dict]], List[dict]]`):
            A list of lists of dictionaries or a list of dictionaries containing the following keys:
                - `"role"` (`str`) -- The role of the message.
                - `"content"` (`str`) -- The content of the message.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text.

        Return:
            a dict with following keys:
                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
                - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
                - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
        Nz3You need to specify `messages_list` and `proteins`.    c                 s       | ]}t |tV  qd S N
isinstancedictr   pr   r   r   	<genexpr>       z+EvollaProcessor.__call__.<locals>.<genexpr>zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c                 s   s&    | ]}t d d | D V  qdS )c                 s   s    | ]}|t v V  qd S r=   )PROTEIN_VALID_KEYS)r   kr   r   r   rC      s    z5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>N)allkeysrA   r   r   r   rC      s    
z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c                 s   r<   r=   r>   r   mr   r   r   rC      rD   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c                 s   s     | ]}t | d kV  qdS )   N)lenrH   rJ   r   r   r   rC      s    c                 s   s$    | ]}t | d dhkV  qdS )rolecontentN)setrH   rJ   r   r   r   rC      s    
zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)protein_input_idsprotein_attention_maskrQ   rR   )data)r   r   r   r?   r@   listtuplerG   r&   rE   	TypeErrortypeanyr.   r9   r   )	r   r)   r:   r   r   r   r6   r-   text_tokensr   r   r   __call__W   sd   
  zEvollaProcessor.__call__c                 O      | j j|i |S r=   )r   batch_decoder   argsr   r   r   r   r^         zEvollaProcessor.batch_decodec                 O   r]   r=   )r   decoder_   r   r   r   rb      ra   zEvollaProcessor.decodec                 O   r]   r=   )r   r^   r_   r   r   r   protein_batch_decode   ra   z$EvollaProcessor.protein_batch_decodec                 O   r]   r=   )r   rb   r_   r   r   r   protein_decode   ra   zEvollaProcessor.protein_decode)Nr	   r
   )r	   )r
   )NNNN)__name__
__module____qualname__r   r.   intr9   r   rV   r@   r\   r^   rb   rc   rd   __classcell__r   r   r   r   r      s2    

Vr   N)
__doc__feature_extraction_utilsr   processing_utilsr   utilsr   rE   r   __all__r   r   r   r   <module>   s    
