o
    ॵiF                     @   s|  d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ eZeeef ZejjZeZeee	e f Zejdd	G d
d dZejdd	G dd dZejdd	G dd dZejdd	G dd dZejdd	G dd dZejdd	G dd dZ G dd de!Z"dedede	eeef  fddZ#dedededeeeeef f fddZ$ej%d d!d"dd#d$ed%ed&e&de fd'd(Z'ej%d d!d"dd#d$ed%ed&e&de fd)d*Z(d+edefd,d-Z)d.Z*dedefd/d0Z+dedefd1d2Z,dede	e fd3d4Z-deeef deee	e f fd5d6Z.d7ede&fd8d9Z/dS ):zParses the mmCIF file format.    N)AnyMappingOptionalSequenceTuple)logging)PDB)SCOPData)MMCIFParserT)frozenc                   @   s   e Zd ZU eed< eed< dS )MonomeridnumN__name__
__module____qualname__str__annotations__int r   r   _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/science/unifold/msa/mmcif.pyr   "   s   
 r   c                   @   sN   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< d	S )
AtomSiteresidue_nameauthor_chain_idmmcif_chain_idauthor_seq_nummmcif_seq_numinsertion_codehetatm_atom	model_numNr   r   r   r   r   r   *   s   
 r   c                   @   s&   e Zd ZU eed< eed< eed< dS )ResiduePositionchain_idresidue_numberr   Nr   r   r   r   r   r!   7   s   
 r!   c                   @   s2   e Zd ZU ee ed< eed< eed< eed< dS )ResidueAtPositionpositionname
is_missinghetflagN)r   r   r   r   r!   r   r   boolr   r   r   r   r$   >   s
   
 r$   c                   @   sz   e Zd ZU dZeed< eed< eed< ee	e
f ed< ee	eeef f ed< eed< ee	e	f ed< ee	ef ed	< d
S )MmcifObjectap  Representation of a parsed mmCIF file.

    Contains:
        file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
            files being processed.
        header: Biopython header.
        structure: Biopython structure.
        chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
            {'A': 'ABCDEFG'}
        seqres_to_structure: Dict; for each chain_id contains a mapping between
            SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,  1: ResidueAtPosition, ...}}
        raw_string: The raw string used to construct the MmcifObject.
    file_idheader	structurechain_to_seqresseqres_to_structure
raw_stringmmcif_to_author_chain_idvalid_chainsN)r   r   r   __doc__r   r   	PdbHeaderPdbStructurer   ChainIdSeqResr   r$   r   r   r   r   r   r*   F   s   
 r*   c                   @   s6   e Zd ZU dZee ed< eee	e	f e
f ed< dS )ParsingResultzReturned by the parse function.

    Contains:
        mmcif_object: A MmcifObject, may be None if no chain could be successfully
            parsed.
        errors: A dict mapping (file_id, chain_id) to any exception generated.
    mmcif_objecterrorsN)r   r   r   r3   r   r*   r   r   r   r   r   r   r   r   r   r8   `   s   
 r8   c                   @   s   e Zd ZdZdS )
ParseErrorz;An error indicating that an mmCIF file could not be parsed.N)r   r   r   r3   r   r   r   r   r;   n   s    r;   prefixparsed_inforeturnc                    sp   g  g |  D ]\}}|| r | | qtfddD s-J d   fddt D S )an  Extracts loop associated with a prefix from mmCIF data as a list.

    Reference for loop_ in mmCIF:
        http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

    Args:
        prefix: Prefix shared by each of the data items in the loop.
            e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
            _entity_poly_seq.mon_id. Should include the trailing period.
        parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
            parser.

    Returns:
        Returns a list of dicts; each dict represents 1 entry from an mmCIF loop.
    c                    s    g | ]}t |t  d  kqS )r   )len.0xsdatar   r   
<listcomp>   s    z&mmcif_loop_to_list.<locals>.<listcomp>z2mmCIF error: Not all loops are the same length: %sc                    s   g | ]	}t t |qS r   )dictzipr@   )colsr   r   rE      s    )items
startswithappendallrG   )r<   r=   keyvaluer   )rH   rD   r   mmcif_loop_to_listr   s   



rO   indexc                    s   t | |} fdd|D S )ag  Extracts loop associated with a prefix from mmCIF data as a dictionary.

    Args:
        prefix: Prefix shared by each of the data items in the loop.
            e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
            _entity_poly_seq.mon_id. Should include the trailing period.
        index: Which item of loop data should serve as the key.
        parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
            parser.

    Returns:
        Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop,
        indexed by the index column.
    c                    s   i | ]}|  |qS r   r   )rA   entryrP   r   r   
<dictcomp>   s    z&mmcif_loop_to_dict.<locals>.<dictcomp>)rO   )r<   rP   r=   entriesr   rR   r   mmcif_loop_to_dict   s   
rU      F)typed)catch_all_errorsr+   mmcif_stringrX   c              
   C   s   i }zXt dd}|j}| D ]\}}t|ts|g||< qt|}t|d}	|	s3td| dfdiW S i }
t|D ]}|j	dkrAq9|j
|
|j< q9t| |ddd||
|	d}t||d	W S  ty{ } z||| df< |sk td|d	W  Y d}~S d}~ww )
  Entry point, parses an mmcif_string.

    Args:
        file_id: A string identifier for this file. Should be unique within the
            collection of files being processed.
        mmcif_string: Contents of an mmCIF file.
        catch_all_errors: If True, all exceptions are caught and error messages are
            returned as part of the ParsingResult. If False exceptions will be allowed
            to propagate.

    Returns:
        A ParsingResult.
    TQUIETr=   N %No protein chains found in this file.1r+   r,   r-   r.   r/   r0   r1   r2   r9   r:   )r
   _mmcif_dictrI   
isinstancelist_get_header_get_protein_chainsr8   _get_atom_site_listr    r   r   r*   	Exception)r+   rY   rX   r:   parserr=   rM   rN   r,   r2   r1   atomr9   er   r   r   
fast_parse   sP   




rm   c           !   
   C   s  i }z t jdd}t|}|d|}t|}|j}| D ]\}	}
t|
t	s.|
g||	< q t
|}t|d}|sDtd| dfdiW S dd | D }i }i }t|D ]]}|jd	kr]qU|j||j< |j|v rd
}|jdkr||jdv rwd}nd|j }|j}t|jsd
}t|jt|j|d}t|j||j  }||ji }t||jd|d||< |||j< qU| D ]$\}}|| }|| }t|D ]\}}||vrtd|jdd
d||< qqi }| D ].\}}|| }g }|D ]}tj|jd}|t |dkr|nd qd!|}|||< qt"| |||||||d}t||dW S  t#yF }  z| || df< |s6 td|dW  Y d} ~ S d} ~ ww )rZ   Tr[   r^   r]   Nr_   c                 S   s$   i | ]\}}|t d d |D qS )c                 S   s   g | ]}|j qS r   )r   rA   monomerr   r   r   rE     s    z$parse.<locals>.<dictcomp>.<listcomp>min)rA   r"   seqr   r   r   rS     s    zparse.<locals>.<dictcomp>r`    HETATM)HOHWATWH_)r"   r#   r   F)r%   r&   r'   r(   X   ra   rb   )$r   r
   ioStringIOget_structure_get_first_modelrc   rI   rd   re   rf   rg   r8   rh   r    r   r   r   r   r   _is_setr!   r   r   r   getr$   	enumerater   r	   protein_letters_3to1rK   r?   joinr*   ri   )!r+   rY   rX   r:   rj   handlefull_structurefirst_model_structurer=   rM   rN   r,   r2   seq_start_numr1   seq_to_structure_mappingsrk   r(   r   r%   seq_idxcurrentr"   seq_infoauthor_chaincurrent_mappingidxro   author_chain_to_sequencerr   coder9   rl   r   r   r   parse   s   



	










r   r-   c                 C   s   t |  S )z1Returns the first model in a Biopython structure.)next
get_models)r-   r   r   r   r~   n  s   r~      c                 C   s   | d }t |S )z!Returns the oldest revision date.*_pdbx_audit_revision_history.revision_daterp   )r=   revision_datesr   r   r   get_release_datev  s   r   c              	   C   s   i }t d| }ddd |D |d< d| v rt| |d< ntd| d	  d
|d< dD ]&}|| v rRz| | d }t||d< W q, tyQ   td| |  Y q,w q,|S )zFReturns a basic header containing method, release date and resolution.z_exptl.,c                 S   s   g | ]}|d    qS )z_exptl.method)lower)rA   
experimentr   r   r   rE     s    z_get_header.<locals>.<listcomp>structure_methodr   release_datez$Could not determine release_date: %sz	_entry.idg        
resolution)z_refine.ls_d_res_highz _em_3d_reconstruction.resolutionz_reflns.d_resolution_highr   zInvalid resolution format: %s)rO   r   r   r   warningfloat
ValueErrordebug)r=   r,   experimentsres_keyraw_resolutionr   r   r   rf   |  s.   
rf   c                 C   s@   dd t | d | d | d | d | d | d | d	 | d
 D S )zGReturns list of atom sites; contains data not present in the structure.c                 S   s   g | ]}t | qS r   )r   )rA   siter   r   r   rE     s    z'_get_atom_site_list.<locals>.<listcomp>z_atom_site.label_comp_idz_atom_site.auth_asym_idz_atom_site.label_asym_idz_atom_site.auth_seq_idz_atom_site.label_seq_idz_atom_site.pdbx_PDB_ins_codez_atom_site.group_PDBz_atom_site.pdbx_PDB_model_num)rG   r]   r   r   r   rh     s   rh   c                    s   t d| }tt}|D ]}||d  t|d t|d d qtdd|  t d| }tt}|D ]}|d	 }|d
 }|| | q4i }	| D ]\}}
|| }t	 fdd|
D rh|D ]}|
|	|< qaqL|	S )zExtracts polymer information for protein chains only.

    Args:
        parsed_info: _mmcif_dict produced by the Biopython parser.

    Returns:
        A dict mapping mmcif chain id to a list of Monomers.
    z_entity_poly_seq.z_entity_poly_seq.entity_idz_entity_poly_seq.mon_idz_entity_poly_seq.num)r   r   z_chem_comp.z_chem_comp.idz_struct_asym.z_struct_asym.idz_struct_asym.entity_idc                    s   g | ]}d  |j  d v qS )peptidez_chem_comp.type)r   rn   
chem_compsr   r   rE     s    z'_get_protein_chains.<locals>.<listcomp>)
rO   collectionsdefaultdictre   rK   r   r   rU   rI   any)r=   entity_poly_seqspolymersentity_poly_seqstruct_asymsentity_to_mmcif_chainsstruct_asymr"   	entity_idr2   r   	chain_idsr   r   r   rg     s8   





rg   rD   c                 C   s   | dvS )zFReturns False if data is a special mmCIF character indicating 'unset'.).?r   rC   r   r   r   r     s   r   )0r3   r   dataclasses	functoolsr{   typingr   r   r   r   r   abslr   Bior   Bio.Datar	   Bio.PDB.MMCIFParserr
   r   r6   r4   	Structurer5   r7   	MmCIFDict	dataclassr   r   r!   r$   r*   r8   ri   r;   rO   rU   	lru_cacher)   rm   r   r~   -_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDEr   rf   rh   rg   r   r   r   r   r   <module>   s   







B !
4