o
    ॵi8                     @   s   d Z ddlZddlZddlmZ edejZejddG dd dZ	d	e
d
e	fddZde
d
ee
 fddZde
d
e	fddZdS )zDUtilities for extracting identifiers from MSA sequence descriptions.    N)Optionala  
        ^
        # UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
        (?:tr|sp)
        \|
        # A primary accession number of the UniProtKB entry.
        (?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
        # Occasionally there is a _0 or _1 isoform suffix, which we ignore.
        (?:_\d)?
        \|
        # TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
        # protein ID code.
        (?:[A-Za-z0-9]+)
        _
        # A mnemonic species identification code.
        (?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
        # Small BFD uses a final value after an underscore, which we ignore.
        (?:_\d+)?
        $
        T)frozenc                   @   s   e Zd ZU dZeed< dS )Identifiers 
species_idN)__name__
__module____qualname__r   str__annotations__ r   r   i/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/science/unifold/msa/msa_identifiers.pyr   0   s   
 r   msa_sequence_identifierreturnc                 C   s*   t t|  }|rt|ddS t S )a  Gets accession id and species from an msa sequence identifier.

    The sequence identifier has the format specified by
    _UNIPROT_TREMBL_ENTRY_NAME_PATTERN or _UNIPROT_SWISSPROT_ENTRY_NAME_PATTERN.
    An example of a sequence identifier: `tr|A0A146SKV9|A0A146SKV9_FUNHE`

    Args:
        msa_sequence_identifier: a sequence identifier.

    Returns:
        An `Identifiers` instance with a species_id. These
        can be empty in the case where no identifier was found.
    SpeciesIdentifier)r   )research_UNIPROT_PATTERNstripr   group)r   matchesr   r   r   _parse_sequence_identifier5   s   r   descriptionc                 C   s"   |   }|r|d dd S dS )zHExtracts sequence identifier from description. Returns None if no match.r   /N)split	partition)r   split_descriptionr   r   r   _extract_sequence_identifierI   s   r   c                 C   s   t | }|du rt S t|S )z1Computes extra MSA features from the description.N)r   r   r   )r   sequence_identifierr   r   r   get_identifiersR   s   r   )__doc__dataclassesr   typingr   compileVERBOSEr   	dataclassr   r
   r   r   r   r   r   r   r   <module>   s   
	