o
    ॵiT                  
   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZ d dl m!Z! d d	l"m#Z# d d
l$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 dgZ2dZ3dZ4				d%deee5 ee5 f fddZ6	d&deee5 e5f de7dee5ef fddZ8de5de5de5dee5ef fdd Z9e/j:e1j;ej<d!G d"d de-Z=e>d#kr	e= Z?d$Z@e?Ae@\ZBZCd dlDZDeDE  dS dS )'    N)Path)AnyDictListOptionalSequenceTupleUnion)result)tqdm)Preprocessors)proteinresidue_constants)PDB_CHAIN_IDS)compress_features)parserspipeline	templates)hhsearch)divide_multi_chains)Preprocessor)PREPROCESSORS)FieldsUniFoldPreprocessorzM{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]https://api.colabfold.comTFreturnc           *   
      sn  |rdnddAfdd	}fdd}fdd	}t | tr"| gn| }	d
}
|r.d}
d}d}| }tj|s<t| | d|
 d}d\ }g fdd|	D   fdd|	D }tj|s:dt }t|t	d}|r%|
d ||
 }|d dv rdtdd }t| ||
 }|d dv s|d dkrd}|d }t||d dkrtd|d d}}|
|d  |d d v rdtdd }t| ||}|
|d  |d d!kr||7 }|j|d" |d d v s|d d#kr||k r|j|| d" d}|d dkr#d}d}|d }t||st||| W d    n	1 s5w   Y  |rD| d$g}n| d%g}|rU|| d& td'd( |D rzt|}|| W d    n	1 suw   Y  |ri }t| d)d*>}| }|D ]/}|  }|d |d+ |d, |d- f\}}} } t|}||vrg ||< || | qW d    n	1 sw   Y  i }!| D ]I\}"}#| d.|" }$tj|$st|$ d/|#d d0 }%td1 d2|% d3|$ d4 td5|$ d6|$ d7 td8|$ d9 |$|!|"< qi |D ]b}&d:\}'}t|&d*d;d<K}| }|D ]<}t|dkrqd=|v rL|d=d}d>}'|d?rj|'rjt|d+d   }d}'|vrjg |< | | q6W d    n	1 s~w   Y  q"fd@d|D |rg }(|D ]})|)|!vr|(d  q|(|!|)  q|(}!|r|!fS S )BNzticket/pairz
ticket/msae   c                    s|   |d}}| D ]}|d| d| d7 }|d7 }qt j  d ||dd}z| }W |S  ty=   dd	i}Y |S w )
N >
   /)qmode)datastatusERROR)requestspostjson
ValueError)seqsr#   Nnqueryseqresout)host_urlsubmission_endpoint ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/science/uni_fold.pysubmit3   s"   



zrun_mmseqs2.<locals>.submitc                    s@   t   d|  }z| }W |S  ty   ddi}Y |S w )Nz/ticket/r%   r&   )r'   getr)   r*   )IDr0   r1   r2   r4   r5   r%   E   s   

zrun_mmseqs2.<locals>.statusc                    sP   t   d|  }t|d}||j W d    d S 1 s!w   Y  d S )Nz/result/download/wb)r'   r7   openwritecontent)r8   pathr0   r1   r9   r4   r5   downloadM   s   "zrun_mmseqs2.<locals>.downloadenvr   Fz/out_z.tar.gz)r   Tc                       g | ]}| vr  |qS r4   append.0x)seqs_uniquer4   r5   
<listcomp>g   s    zrun_mmseqs2.<locals>.<listcomp>c                    s   g | ]	}  | qS r4   )indexrE   r/   )r,   rG   r4   r5   rH   h          )total
bar_formatSUBMITr%   )UNKNOWN	RATELIMIT   r   r&   zTMMseqs2 API is giving errors. Please confirm your input is a valid protein sequence.z2If error persists, please try again an hour later.MAINTENANCEzIMMseqs2 API is undergoing maintenance. Please try again in a few minutes.id)rP   RUNNINGPENDINGrU   )r-   COMPLETEz	/pair.a3mz/uniref.a3mz"/bfd.mgnify30.metaeuk30.smag30.a3mc                 s   s    | ]
}t j| V  qd S N)osr>   isfile)rE   a3m_filer4   r4   r5   	<genexpr>   s    zrun_mmseqs2.<locals>.<genexpr>z	/pdb70.m8rr       
   z/templates_,   zcurl -s -L z
/template/z | tar xzf - -C r!   zcp z/pdb70_a3m.ffindex z/pdb70_cs219.ffindexztouch z/pdb70_cs219.ffdata)TNzutf-8)encoding Tr   c                    s   g | ]	}d   | qS )r   )join)rE   r-   )	a3m_linesr4   r5   rH      rK   )r   )
isinstancestrrY   r>   isdirmkdirrZ   lenr   TQDM_BAR_FORMATset_descriptionrandomrandinttimesleep	ExceptionupdaterC   anytarfiler;   
extractall	readlinesrstripsplitintitemsrd   systemreplace
startswith)*rF   prefixuse_envuse_templatesuse_pairingr2   r6   r%   r?   r+   r#   r>   tar_gz_fileREDOMsTIME_ESTIMATEpbarr1   
sleep_timeerrorr8   TIMEt	a3m_filestar_gzr   flineslinepMpdb_template_pathskTMPL	TMPL_PATH	TMPL_LINEr[   update_Mtemplate_paths_r-   r4   )r,   re   r2   rG   r3   r5   run_mmseqs2*   s   





/3$





r   r    query_sequencenum_tempc              	   C   s   t | tr	t| ntdd | D }d| }t|tjjdf}t|tjjf}tj	|tjj
}t|d  |dddgt|d  |ddgd g| tt|d  |ddgd g| tj|gtjdd}|S )	Nc                 s   s    | ]}t |V  qd S rX   rj   )rE   sr4   r4   r5   r\      s    
z$get_null_template.<locals>.<genexpr>A   r    none)dtype)template_all_atom_positionstemplate_all_atom_maskstemplate_sequencetemplate_aatypetemplate_domain_namestemplate_sum_probs)rf   rg   rj   sumnpzerosr   r   atom_type_numsequence_to_onehotHHBLITS_AA_TO_IDtileencodearrayfloat32)r   r   lnoutput_templates_sequencetemplates_all_atom_positionstemplates_all_atom_maskstemplates_aatypetemplate_featuresr4   r4   r5   get_null_template   s.   
r   re   template_pathc                 C   sZ   t j|dddd d d}tjd| dgd}|| }tj|}|j||d}t	|j
S )	Nz
2100-01-01ra   kalign)	mmcif_dirmax_template_datemax_hitskalign_binary_pathrelease_dates_pathobsolete_pdbs_pathr   z/pdb70)binary_path	databases)r   hits)r   HhsearchHitFeaturizerr   HHSearchr.   r   r   	parse_hhrget_templatesdictfeatures)re   r   r   template_featurizerhhsearch_pdb70_runnerhhsearch_resulthhsearch_hitstemplates_resultr4   r4   r5   get_template	  s"   	

r   )module_namec                   @   s   e Zd Zdd ZdedededefddZd	ee d
ededededeee e	f fddZ
dd Zdefdedeeee f dedede	dededeeee  eee  ee ee eeeef  f fddZdeeef fddZdS )r   c                 K   sH   |d | _ | j sd | _ d| _d| _d| _d| _d| _tj| jdd d S )Nsymmetry_group   i  unifoldz./unifold-predictionsT)exist_ok)r   MIN_SINGLE_SEQUENCE_LENGTHMAX_SINGLE_SEQUENCE_LENGTHMAX_MULTIMER_LENGTHjobnameoutput_dir_baserY   makedirs)selfcfgr4   r4   r5   __init__"  s   
zUniFoldPreprocessor.__init__input_sequence
min_length
max_lengthr   c                 C   s   | tddd }ttj}t||s$tdt||  dt	||k r6tdt	| d| t	||krItdt	| d| d	|S )
Nr   z 
	z0Input sequence contains non-amino acid letters: z<. AlphaFold only supports 20 standard amino acids as inputs.zInput sequence is too short: z# amino acids, while the minimum is zInput sequence is too long: z# amino acids, while the maximum is zr. You may be able to run it with the full Uni-Fold system depending on your resources (system memory, GPU memory).)
	translaterg   	maketransuppersetr   restypesissubsetr*   rj   )r   r   r   r   clean_sequenceaatypesr4   r4   r5   clean_and_validate_sequence-  s2   

z/UniFoldPreprocessor.clean_and_validate_sequenceinput_sequencesr   max_multimer_lengthc           
      C   s  g }|D ]}|  r| j|||d}|| q|durH|dkrH|dr@|dd  r@td| d t|dk}|||fS td| d	t|dkrWtd
 |ddfS t|dkrtdd |D }	|	|krutd|	 d| dtdt| d |ddfS td)zGValidates and cleans input sequences and determines which model to use.)r   r   r   NC1Cr    zUsing UF-Symmetry with group zp. If you do not want to use UF-Symmetry, please use `C1` and copy the AU sequences to the count in the assembly.z,UF-Symmetry does not support symmetry group z2 currently. Cyclic groups (Cx) are supported only.zUsing the single-chain model.Fc                 S   s   g | ]}t |qS r4   r   rJ   r4   r4   r5   rH   e  s    z6UniFoldPreprocessor.validate_input.<locals>.<listcomp>z4The total length of multimer sequences is too long: z, while the maximum is z:. Please use the full AlphaFold system for long multimers.zUsing the multimer model with z sequences.TzLNo input amino acid sequence provided, please provide at least one sequence.)	stripr   rC   r}   	isnumericprintrj   r*   r   )
r   r   r   r   r   r   	sequencesr   is_multimertotal_multimer_lengthr4   r4   r5   validate_inputC  sX   




z"UniFoldPreprocessor.validate_inputc                 C   s"   |d t |  d d  S )Nr   rR   )hashlibsha1r   	hexdigest)r   rF   yr4   r4   r5   add_hasht  s   "zUniFoldPreprocessor.add_hashr    r   query_seqs_unique
result_dirmsa_moder   homooligomers_numr2   c                 C   s  |dk}g }	|rit |t|||d|d\}
}|d u r2tdt|D ]}t|| }|	| q#nLtdt|D ].}|| d ur\t|
| || || }t|d dkr[t|| }nt|| }|	| q9ntdt|D ]}t|| }|	| qp|dkrg }d}t|D ]\}}|dt||  d	 |  qnt |t|||d
|d}t|dkrt |t|||d|d}nd}g }td|D ]}|dt||  d	 |d  d	  q|||	fS )NMMseqs2T)r   r2   r   r   single_sequencer   r   r   F)r   r2   r    )	r   rg   joinpathrangerj   r   rC   r   	enumerate)r   r   r   r   r   r   r   r2   r   r   a3m_lines_mmseqs2r   rI   template_featurere   numir/   paired_a3m_linesr4   r4   r5   get_msa_and_templatesw  s   
 z)UniFoldPreprocessor.get_msa_and_templatesr$   c           !         s  t |tr|  }t|dk r|dgdt|   }d|}tdd|}| | j	| | j
|| j| j| j| jd\}}} fddtt|D }|rWt | j|| g }t||D ]
\}}	|||	g7 }q^g fdd|D  tdkrt|}
nd}
t| j	 d	d
}|d| W d    n1 sw   Y  t| j}tj| j }d}d}| j ||||
d\}}}g }g }tD ]\}}	t| }tj|	d| j	 d| t|	d}t|| }t|g}|| }i |||}t |}tj|d!|}t"j#|t$%|ddd |&| |rIt|| }t|g}t |}tj|d!|} t"j#|t$%| ddd |&| q|| |dS )N   r   z\W+)r   r   r   r   r   c                    s    g | ]}d   d t | qS )> z seq)rg   )rE   ii)	target_idr4   r5   rH     s    z0UniFoldPreprocessor.__call__.<locals>.<listcomp>c                    rA   r4   rB   rD   )unique_sequencesr4   r5   rH     s
    r    z.fastawr   r   T)r   r   r   r   r  z seq )sequencedescriptionnum_resz{}.feature.pkl.gzr:   )protocolz{}.uniprot.pkl.gz)r   pair_featuresr  r   )'rf   rg   r   rx   rj   rd   resubr   r   r   r   r   r   r   r  r   r   zipr;   r<   r   rY   r>   r
  r  r   r   make_sequence_featuresr   	parse_a3mmake_msa_featuresr   formatpickledumpgzipGzipFilerC   )!r   r$   basejobnamer   r   r   descriptionsr   desr/   r   r   r   
output_dirr   r   unpaired_msa
paired_msatemplate_resultsr   pair_features_listidxchain_idsequence_featuresmonomer_msamsa_featuresr   feature_dictfeatures_output_pathmultimer_msar  pair_feature_dictuniprot_output_pathr4   )r  r  r5   __call__  s   













zUniFoldPreprocessor.__call__N)__name__
__module____qualname__r   rg   ry   r   r   r   boolr   r   DEFAULT_API_SERVERr	   r   r   r   r   r   r
  r3  r4   r4   r4   r5   r     sR    



1
"	
T__main__ڜLILNLRGGAFVSNTQITMADKQKKFINEIQEGDLVRSYSITDETFQQNAVTSIVKHEADQLCQINFGKQHVVCTVNHRFYDPESKLWKSVCPHPGSGISFLKKYDYLLSEEGEKLQITEIKTFTTKQPVFIYHIQVENNHNFFANGVLAHAMQVSI)TFFr   )r    )Fr  r   loggingrY   r  rm   r  rt   ro   pathlibr   typingr   r   r   r   r   r   r	   unittestr
   r)   numpyr   r'   torchr   modelscope.metainfor   &modelscope.models.science.unifold.datar   r   .modelscope.models.science.unifold.data.proteinr   ,modelscope.models.science.unifold.data.utilsr   %modelscope.models.science.unifold.msar   r   r   +modelscope.models.science.unifold.msa.toolsr   +modelscope.models.science.unifold.msa.utilsr   modelscope.preprocessors.baser    modelscope.preprocessors.builderr   modelscope.utils.constantr   __all__rk   r8  rg   r   ry   r   r   register_modulescienceunifold_preprocessorr   r4  procprotein_exampler3  r   r  ipdb	set_tracer4   r4   r4   r5   <module>   s~   $
 C




  
