o
    ॵi[                  
   @   sV  d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
mZmZmZ eee  ZejddG dd dZejddG dd	 d	Zd
edeee ee f fddZdedefddZdedefddZdee dedee fddZ		dEdede
e dedefddZdedee defdd Zd!ededefd"d#Zd!edefd$d%Zd!edefd&d'Zd(ededee
e  fd)d*Zd+ed,ed-e	e fd.d/Zd0ee defd1d2Z d3edee fd4d5Z!d6edeee"f fd7d8Z#d+ed9ede	e fd:d;Z$ejddG d<d= d=Z%d>ede%fd?d@Z&	dFdAededBedee fdCdDZ'dS )Gz+Functions for parsing various file formats.    N)DictIterableListOptionalSequenceSetTupleT)frozenc                   @   sP   e Zd ZU dZee ed< eed< ee ed< dd Zdd Z	d	e
fd
dZdS )Msaz%Class representing a parsed MSA file.	sequencesdeletion_matrixdescriptionsc                 C   sZ   t | jt | j  krt | jks+n tdt | j dt | j dt | j dd S )Nz5All fields for an MSA must have the same length. Got z sequences, z! rows in the deletion matrix and z descriptions.)lenr   r   r   
ValueErrorself r   a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/science/unifold/msa/parsers.py__post_init__!   s   
zMsa.__post_init__c                 C   s
   t | jS N)r   r   r   r   r   r   __len__*   s   
zMsa.__len__max_seqsc                 C   s,   t | jd | | jd | | jd | dS )Nr   r   r   )r
   r   r   r   )r   r   r   r   r   truncate-   s
   zMsa.truncateN)__name__
__module____qualname____doc__r   str__annotations__DeletionMatrixr   r   intr   r   r   r   r   r
      s   
 	r
   c                   @   s^   e Zd ZU dZeed< eed< eed< ee ed< eed< eed< e	e ed< e	e ed	< d
S )TemplateHitz"Class representing a template hit.indexnamealigned_cols	sum_probsqueryhit_sequenceindices_queryindices_hitN)
r   r   r   r   r!   r   r   r   floatr   r   r   r   r   r"   5   s   
 r"   fasta_stringreturnc                 C   sp   g }g }d}|   D ])}| }|dr(|d7 }||dd  |d q
|s+q
||  |7  < q
||fS )a]  Parses FASTA string and returns list of strings with amino-acid sequences.

    Arguments:
        fasta_string: The string contents of a FASTA file.

    Returns:
        A tuple of two lists:
        * A list of sequences.
        * A list of sequence descriptions taken from the comment lines. In the
            same order as the sequences.
    >   N )
splitlinesstrip
startswithappend)r,   r   r   r#   liner   r   r   parse_fastaC   s   

r7   stockholm_stringc                    s,  t  }|  D ]$}| }|r|drq| \} ||vr$d||< ||   7  < qg }g }d}g }t| D ]O\} |dkrN }dd t|D }d fdd|D }	|	|	 g }
d}t
 |D ]\}}|dkst|dkr|dkr}|d7 }qh|
	| d}qh|	|
 q;t||t| d	S )
a  Parses sequences and deletion matrix from stockholm format alignment.

    Args:
        stockholm_string: The string contents of a stockholm file. The first
            sequence in the file should be the query sequence.

    Returns:
        A tuple of:
            * A list of sequences that have been aligned to the query. These
                might contain duplicates.
            * The deletion matrix for the alignment as a list of lists. The element
                at `deletion_matrix[i][j]` is the number of residues deleted from
                the aligned sequence i at residue position j.
            * The names of the targets matched, including the jackhmmer subsequence
                suffix.
    #//r1   r   c                 S   s   g | ]
\}}|d kr|qS -r   ).0iresr   r   r   
<listcomp>       z#parse_stockholm.<locals>.<listcomp>c                    s   g | ]} | qS r   r   r>   csequencer   r   rA          r=   r0   r   )collectionsOrderedDictr2   r3   r4   split	enumeratevaluesjoinr5   zipr
   listkeys)r8   name_to_sequencer6   r$   msar   r'   keep_columns	seq_indexaligned_sequencedeletion_vecdeletion_countseq_res	query_resr   rE   r   parse_stockholm`   sD   



rZ   
a3m_stringc           	         s   t | \}}g }|D ] }g }d}|D ]}| r|d7 }q|| d}q|| q
tddtj  fdd|D }t|||dS )a  Parses sequences and deletion matrix from a3m format alignment.

    Args:
        a3m_string: The string contents of a a3m file. The first sequence in the
            file should be the query sequence.

    Returns:
        A tuple of:
            * A list of sequences that have been aligned to the query. These
                might contain duplicates.
            * The deletion matrix for the alignment as a list of lists. The element
                at `deletion_matrix[i][j]` is the number of residues deleted from
                the aligned sequence i at residue position j.
            * A list of descriptions, one per sequence, from the a3m file.
    r   r0   r1   c                    s   g | ]}|  qS r   )	translate)r>   sdeletion_tabler   r   rA      s    zparse_a3m.<locals>.<listcomp>r   )r7   islowerr5   r   	maketransstringascii_lowercaser
   )	r[   r   r   r   msa_sequencerV   rW   jaligned_sequencesr   r^   r   	parse_a3m   s$   

rg   query_non_gapssto_seqc                 c   s8    t | |D ]\}}|r|V  q|dkr| V  qd S )Nr=   )rN   lower)rh   ri   is_query_res_non_gapsequence_resr   r   r   _convert_sto_seq_to_a3m   s   
rm   stockholm_formatmax_sequencesremove_first_row_gapsc                    sz  i i }d}|   D ].}|ot||k}| r8|ds8|jdd\}}||vr0|r,q
d||< ||  |7  < q
|   D ]>}|dd dkr{|jd	d}|dd	 \}}	t|dkr_|d	 nd}
|	d
krfq=|rm||vrmq=|
|< tt|kr{ nq=i  |rtt| }dd |D }| D ]\}}|	dd}|rd
t||}| |< q fdd D }d
|d S )z3Converts MSA in Stockholm format to the A3M format.Fr9   r0   maxsplitr1   N   #=GS   DEc                 S      g | ]}|d kqS r<   r   )r>   r@   r   r   r   rA      rG   z,convert_stockholm_to_a3m.<locals>.<listcomp>.c                 3   s2    | ]}d | d |d d |  V  qdS )r/    r1   
N)get)r>   ka3m_sequencesr   r   r   	<genexpr>  s    *z+convert_stockholm_to_a3m.<locals>.<genexpr>rz   )r2   r   r3   r4   rJ   nextiterrL   itemsreplacerM   rm   )rn   ro   rp   r   reached_max_sequencesr6   seqnamealigned_seqcolumnsfeaturevaluequery_sequencerh   sto_sequenceout_sequencefasta_chunksr   r}   r   convert_stockholm_to_a3m   sZ   
r   r6   seqnamesc                 C   s   |   sdS |   dkrdS | drdS | drdS | dd dkr1| jdd	\}}}||v S | d
r8dS | dd }||v S )z'Function to decide which lines to keep.Tr;   z# STOCKHOLM#=GC RFNrs   rt      rq   r:   Fry   r   )r3   r4   rJ   	partition)r6   r   _r   r   r   r   
_keep_line  s   


r   stockholm_msac                 C   s   t  }g }|  D ]}| r(|ds(|dd }|| t||kr( nq	|  D ]}t||r9|| q-d	|d S )z<Truncates a stockholm file to a maximum number of sequences.r9   ry   r   rz   )
setr2   r3   r4   r   addr   r   r5   rM   )r   ro   r   filtered_linesr6   r   r   r   r   truncate_stockholm_msa  s   


r   c                    sP  i  i }t |  D ]\}}|dr|}|}|d\}}}g }tt|D ]%}	| D ]\}}
|
d\}}}||	 dkrF|d  nq-|d q'|||< t|s_|D ]}d |< qWn#| D ]\}}
|
d\}}}d	t
||}| d|  |< qci }q
| r|ds|||< q
| |< q
d	 fd	d
tt D S )z9Removes empty columns (dashes-only) from a Stockholm MSA.r   ry   r=   TFr1   r9   rz   c                 3   s    | ]} | V  qd S r   r   )r>   r?   processed_linesr   r   r   Y  s    z:remove_empty_columns_from_stockholm_msa.<locals>.<genexpr>)rK   r2   r4   
rpartitionranger   r   r5   anyrM   	itertoolscompressr3   )r   unprocessed_linesr?   r6   reference_annotation_ireference_annotation_liner   first_alignmentmaskre   unprocessed_lineprefix	alignment
line_indexmasked_alignmentr   r   r   'remove_empty_columns_from_stockholm_msa/  sR   






 r   c                 C   s   t t}|  D ]}| r&|ds&| }| \}}||  |7  < q	t }t }tt	|
 }dd |D }| D ]\}}dt||}	|	|v rRq@||	 || q@g }
|  D ]}t||ro|
| qcd|
d S )z;Remove duplicate sequences (ignoring insertions wrt query).r9   c                 S   rw   r<   r   rC   r   r   r   rA   m  rG   z-deduplicate_stockholm_msa.<locals>.<listcomp>r1   rz   )rH   defaultdictr   r2   r3   r4   rJ   r   r   r   rL   r   rM   r   r   r   r   r5   )r   sequence_dictr6   r   r   seen_sequencesr   query_alignr   r   r   r   r   r   deduplicate_stockholm_msa\  s.   



r   regex_patternc                 C   s*   t | |}|d u rtd| | S )NzCould not parse query line )rematchRuntimeErrorgroups)r   r6   r   r   r   r   _get_hhr_line_regex_groups  s   r   rF   start_indexindices_listc                 C   s8   |}| D ]}|dkr| d q| | |d7 }qdS )zUComputes the relative indices for each residue with respect to the original sequence.r=   r.   r0   N)r5   )rF   r   r   countersymbolr   r   r    _update_hhr_residue_indices_list  s   

r   detailed_linesc              
   C   s  t | d  d }| d dd }d}t|| d }|du r*td| | d f dd	 | D \}}}}}}}}d
}d
}	g }
g }d}| dd D ]}|dr|ds|ds|dsd}t||dd }t |d d }|d }t |d }tdd	 |D }|| | }|t|ksJ ||7 }t	|||
 qK|dr|ds|ds|dsd}t||dd }t |d d }|d }|t|ksJ |	|7 }	t	||| qKt
||t ||||	|
|dS )a  Parses the detailed HMM HMM comparison section for a single Hit.

    This works on .hhr files generated from both HHBlits and HHSearch.

    Args:
        detailed_lines: A list of lines from a single comparison section between 2
            sequences (which each have their own HMM's)

    Returns:
        A dictionary with the information from that detailed comparison section

    Raises:
        RuntimeError: If a certain line cannot be processed
    r   r.   r0   NzProbab=(.*)[	 ]*E-value=(.*)[	 ]*Score=(.*)[	 ]*Aligned_cols=(.*)[	 ]*Identities=(.*)%[	 ]*Similarity=(.*)[	 ]*Sum_probs=(.*)[	 ]*Template_Neff=(.*)r   zCCould not parse section: %s. Expected this: 
%s to contain summary.c                 S   s   g | ]}t |qS r   )r+   r>   xr   r   r   rA     rG   z"_parse_hhr_hit.<locals>.<listcomp>r1   ru   zQ z	Q ss_dsspz	Q ss_predzQ Consensusz1[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)   c                 S   s   g | ]}|d kr|qS r<   r   r   r   r   r   rA     s    zT z	T ss_dsspz	T ss_predzT Consensusz/[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)r#   r$   r%   r&   r'   r(   r)   r*   )r!   rJ   r   r   r   r   r4   r   r   r   r"   )r   number_of_hitname_hitpatternr   r   r%   r&   r'   r(   r)   r*   length_blockr6   pattr   startdelta_queryendnum_insertionsdelta_hit_sequencer   r   r   _parse_hhr_hit  sz   


r   
hhr_stringc              	   C   sl   |   }dd t|D }g }|r4|t| tt|d D ]}|t||| ||d    q |S )z)Parses the content of an entire HHR file.c                 S   s   g | ]\}}| d r|qS )zNo )r4   )r>   r?   r6   r   r   r   rA     s
    zparse_hhr.<locals>.<listcomp>r0   )r2   rK   r5   r   r   r   )r   linesblock_startshitsr?   r   r   r   	parse_hhr  s   r   tbloutc                 C   sL   ddi}dd |   D }|D ]}| }|d }|d }t|||< q|S )zDParse target to e-value mapping parsed from Jackhmmer tblout string.r'   r   c                 S   s   g | ]
}|d  dkr|qS )r   r:   r   )r>   r6   r   r   r   rA     rB   z.parse_e_values_from_tblout.<locals>.<listcomp>rs   )r2   rJ   r+   )r   e_valuesr   r6   fieldse_valuetarget_namer   r   r   parse_e_values_from_tblout	  s   r   r   c                 C   sN   g }|}| D ]}|dkr| d q| r|d7 }q| | |d7 }q|S )zHReturns indices for non-gap/insert residues starting at the given index.r=   r.   r0   )r5   r`   )rF   r   indicesr   r   r   r   r   _get_indices  s   


r   c                   @   s>   e Zd ZU eed< eed< eed< eed< eed< eed< dS )HitMetadatapdb_idchainr   r   lengthtextN)r   r   r   r   r   r!   r   r   r   r   r   *  s   
 r   descriptionc                 C   s\   t d|  }|std|  dt|d |d t|d t|d t|d |d	 d
S )z3Parses the hmmsearch A3M sequence description line.zF^>?([a-z0-9]+)_(\w+)/([0-9]+)-([0-9]+).*protein length:([0-9]+) *(.*)$zCould not parse description: "z".r0   r   ru   rs         )r   r   r   r   r   r   )r   r   r3   r   r   r!   )r   r   r   r   r   _parse_hmmsearch_description4  s   


r   r   
skip_firstc                 C   s   t tt| }|r|dd }t| dd}g }t|ddD ];\}\}}d|vr)qt|}	tdd |D }
t||	jd d}t||	j	 d|	j
 |
d| | ||d	}|| q|S )
a&  Parses an a3m string produced by hmmsearch.

    Args:
        query_sequence: The query sequence.
        a3m_string: The a3m string produced by hmmsearch.
        skip_first: Whether to skip the first sequence in the a3m string.

    Returns:
        A sequence of `TemplateHit` results.
    r0   Nr   )r   zmol:proteinc                 S   s   g | ]
}|  o|d kqS r<   )isupper)r>   rr   r   r   rA   d  rB   z'parse_hmmsearch_a3m.<locals>.<listcomp>r   r   )rO   rN   r7   r   rK   r   sumr   r"   r   r   upperr5   )r   r[   r   
parsed_a3mr)   r   r?   r(   hit_descriptionmetadatar%   r*   hitr   r   r   parse_hmmsearch_a3mJ  s.   
r   )NT)T)(r   rH   dataclassesr   r   rb   typingr   r   r   r   r   r   r   r!   r    	dataclassr
   r"   r   r7   rZ   rg   boolrm   r   r   r   r   r   r   r   r   r   r+   r   r   r   r   r   r   r   r   r   <module>   st   $

">
'

9-#



a
	