o
    ॵi|-                     @   s   d Z ddlZddlmZmZmZmZmZmZ ddl	Z
ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ eee
jf Zeejejf Zded	ed
edefddZdeej  defddZ!dededede"deeef f
ddZ#G dd dZ$dS )z@Functions for building the input features for the unifold model.    N)AnyMappingMutableMappingOptionalSequenceUnion)logging)residue_constants)msa_identifiersparsers	templates)hhblitshhsearch	hmmsearch	jackhmmersequencedescriptionnum_resreturnc                 C   s   i }t j| t jdd|d< tj|ftjd|d< tj|dgtjd|d< tjt	|tjd|d< tj|g| tjd|d	< tj| dgtjd|d
< |S )z/Constructs a feature dict of sequence features.T)r   mappingmap_unknown_to_xaatypedtypebetween_segment_residuesutf-8domain_nameresidue_index
seq_lengthr   )
r	   sequence_to_onehotrestype_order_with_xnpzerosint32arrayencodeobject_range)r   r   r   features r)   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/science/unifold/msa/pipeline.pymake_sequence_features    s$   



r+   msasc                 C   s*  | st dg }g }g }t }t| D ]E\}}|s!t d| dt|jD ]1\}}||v r/q&|| |dd |D  ||j|  t|j	| }	||	j
d q&qt| d jd }
t|}i }tj|tjd|d	< tj|tjd|d
< tj|g|
 tjd|d< tj|tjd|d< |S )z*Constructs a feature dict of MSA features.z"At least one MSA must be provided.zMSA z$ must contain at least one sequence.c                 S   s   g | ]}t j| qS r)   )r	   HHBLITS_AA_TO_ID).0resr)   r)   r*   
<listcomp>F   s    z%make_msa_features.<locals>.<listcomp>r   r   r   deletion_matrix_intmsanum_alignmentsmsa_species_identifiers)
ValueErrorset	enumerate	sequencesaddappenddeletion_matrixr
   get_identifiersdescriptions
species_idr%   lenr!   r$   r#   r&   )r,   int_msar;   species_idsseen_sequences	msa_indexr2   sequence_indexr   identifiersr   r3   r(   r)   r)   r*   make_msa_features4   sH   



rF   input_fasta_pathmsa_out_path
msa_formatuse_precomputed_msasc                 C   s   |rt j|s.| |d }t|d}|||  W d   |S 1 s'w   Y  |S td| t|ddd}|| i}W d   |S 1 sMw   Y  |S )z:Runs an MSA tool, checking if output already exists first.r   wNzReading MSA from file %srr   encoding)	ospathexistsqueryopenwriter   warningread)
msa_runnerrG   rH   rI   rJ   resultfr)   r)   r*   run_msa_toolX   s   

rZ   c                   @   s   e Zd ZdZ			ddedededed	ee d
ee dee dee dedejde	de
de
de	fddZdededefddZdededefddZdS )DataPipelinez:Runs the alignment tools and assembles the input features.  '  Fjackhmmer_binary_pathhhblits_binary_pathuniref90_database_pathmgnify_database_pathbfd_database_pathuniclust30_database_pathsmall_bfd_database_pathuniprot_database_pathtemplate_searchertemplate_featurizeruse_small_bfdmgnify_max_hitsuniref_max_hitsrJ   c                 C   s   || _ tj||d| _|rtj||d| _n
tj|||gd| _tj||d| _tj||d| _	|	| _
|
| _|| _|| _|| _dS )zInitializes the data pipeline.)binary_pathdatabase_path)rk   	databasesN)_use_small_bfdr   	Jackhmmerjackhmmer_uniref90_runnerjackhmmer_small_bfd_runnerr   HHBlitshhblits_bfd_uniclust_runnerjackhmmer_mgnify_runnerjackhmmer_uniprot_runnerrf   rg   ri   rj   rJ   )selfr^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rJ   r)   r)   r*   __init__n   s6   

zDataPipeline.__init__rG   msa_output_dirr   c                 C   s  t |dd}| }W d   n1 sw   Y  t|\}}t|dkr/td| d|d }|d }t|}	tj|d}
t	| j
||
d	| j}tj|d
}t	| j||d	| j}|d	 }tj|| jd}t|}t|}| jjd	kr| j|}n| jjdkrt|}| j|}n	td| jj tj|d| jj }t |d}|| W d   n1 sw   Y  t|d	 }|j| jd}t|d	 }|j| jd}| jj||d}| jrtj|d}t	| j||d	| j}t|d	 }ntj|d}t	| j||d| j}t|d }| jj ||d}t!|||	d}t"|||f}t#$dt| t#$dt| t#$dt| t#$d|d d  t#$d|j%d j&d  i |||j%S )z@Runs alignment tools on the input sequence and creates features.r   rM   N   z&More than one input sequence found in .r   zuniref90_hits.stostozmgnify_hits.sto)max_sequencesa3mz$Unrecognized template input format: z	pdb_hits.rK   max_seqs)output_stringinput_sequencezsmall_bfd_hits.stozbfd_uniclust_hits.a3m)query_sequencehits)r   r   r   z Uniref90 MSA size: %d sequences.zBFD MSA size: %d sequences.zMGnify MSA size: %d sequences.z,Final (deduplicated) MSA size: %d sequences.r3   zbTotal number of templates (NB: this can include bad templates and is later filtered to top 4): %d.template_domain_names)'rS   rV   r   parse_fastar?   r5   rO   rP   joinrZ   rp   rJ   rt   truncate_stockholm_msarj   deduplicate_stockholm_msa'remove_empty_columns_from_stockholm_msarf   input_formatrR   convert_stockholm_to_a3moutput_formatrT   parse_stockholmtruncateri   get_template_hitsrn   rq   rs   	parse_a3mrg   get_templatesr+   rF   r   infor(   shape)rv   rG   rx   rY   input_fasta_str
input_seqsinput_descsr   input_descriptionr   uniref90_out_pathjackhmmer_uniref90_resultmgnify_out_pathjackhmmer_mgnify_resultmsa_for_templatespdb_templates_resultuniref90_msa_as_a3mpdb_hits_out_pathuniref90_msa
mgnify_msapdb_template_hitsbfd_out_pathjackhmmer_small_bfd_resultbfd_msahhblits_bfd_uniclust_resulttemplates_resultsequence_featuresmsa_featuresr)   r)   r*   process   s   


zDataPipeline.processc                 C   sJ   t j|d}t| j||d| j}t|d }|jdd}t	|g}|S )Nzuniprot_hits.stor{   iP  r~   )
rO   rP   r   rZ   ru   rJ   r   r   r   rF   )rv   rG   rx   uniprot_pathuniprot_resultr2   all_seq_dictr)   r)   r*   process_uniprot  s   
zDataPipeline.process_uniprotN)r\   r]   F)__name__
__module____qualname____doc__strr   TemplateSearcherr   TemplateHitFeaturizerboolintrw   FeatureDictr   r   r)   r)   r)   r*   r[   k   sX    	

+
tr[   )%r   rO   typingr   r   r   r   r   r   numpyr!   abslr   &modelscope.models.science.unifold.datar	   %modelscope.models.science.unifold.msar
   r   r   +modelscope.models.science.unifold.msa.toolsr   r   r   r   r   ndarrayr   HHSearch	Hmmsearchr   r   r+   MsarF   r   rZ   r[   r)   r)   r)   r*   <module>   s8    

$

