o
    ߥi2K                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZmZm Z m!Z!m"Z" ee Z#eZ$e	e%ee#e$f f Z&eeeee  f Z'eeeee  f Z(e)e*Z+d	ej,d
e%de-deej,ee% f fddZ.dej/de&dej/fddZ0ej1ddd		d5de%de%dee% de2def
ddZ3	d6de%de%dee& defd d!Z4					d7d"ee% de%dee% d#eee%  dee% d$eee&  de2de'fd%d&Z5		 			d8d	ej,d
e%d'ed(eee  d)e-d*ee- d+ee- d,e2de(fd-d.Z6	 			d9d	ej,d
e%d)e-d*ee- d+ee- d,e2fd/d0Z7G d1d2 d2eZ8G d3d4 d4e8Z9dS ):    N)DictIterableListOptionalTupleUnion)UnicoreDataset
data_utils)utils   )	NumpyDict	TorchDict)process_featuresprocess_labels)add_assembly_featuresconvert_monomer_features
merge_msaspair_and_mergepost_processconfigmodenum_resreturnc                 C   s   t | }|| }|  |jd u r||_W d    n1 s w   Y  |jj|jj }|jjr7||jj7 }|jj	rA||jj
7 }|| jrL||jj7 }||fS N)copydeepcopyunlocked	crop_sizecommonunsupervised_featuresrecycling_featuresuse_templatestemplate_featuresis_multimermultimer_features
supervisedsupervised_features)r   r   r   cfgmode_cfgfeature_names r*   ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/science/unifold/dataset.pymake_data_config!   s   



r,   all_atom_positions	operationc                 C   sD   |dkr| S |\}}t |dd}t |d}| |j | S )NI   )nparrayreshapeT)r-   r.   rottransr*   r*   r+   process_label6   s   r7      T)maxsizer   Fsequence_idmonomer_feature_diruniprot_msa_dir
is_monomerc                 C   s   t tj||  d}t|}i |}|d urOt tj||  d}|r?t|d |d |d |d \|d< |d< |S t |}dD ]}|| ||< qF|S )Nz.feature.pkl.gzz.uniprot.pkl.gzmsadeletion_matrix)msa_all_seqmsa_species_identifiers_all_seqdeletion_matrix_all_seq)r
   load_pickleospathjoinr   r   convert_all_seq_feature)r:   r;   r<   r=   monomer_featurechain_featureall_seq_featurekeyr*   r*   r+   load_single_feature@   s0   

rL   label_id	label_dirsymmetry_operationc                 C   sJ   t tj||  d}|d urt|d ||d< dd | D }|S )Nz.label.pkl.gzr-   c                 S   s   i | ]\}}|d v r||qS )aatyper-   all_atom_mask
resolutionr*   .0kvr*   r*   r+   
<dictcomp>n   s    z%load_single_label.<locals>.<dictcomp>)r
   rC   rD   rE   rF   r7   items)rM   rN   rO   labelr*   r*   r+   load_single_labeld   s   r[   sequence_ids	label_idssymmetry_operationsc           
         s    fdd| D }|d ur@t |t | ksJ d usJ |d u r*dd |D }fddt||D }dd t||D  t|}|d urPdd |D }nd }tjdd |D tjd}	 rf|d	 }nt|}t|}|	|d
< ||fS )Nc                    s   g | ]	}t | qS r*   )rL   rU   s)r=   r;   r<   r*   r+   
<listcomp>   s    zload.<locals>.<listcomp>c                 S   s   g | ]}d qS )r/   r*   )rU   _r*   r*   r+   ra      s    c                    s   g | ]
\}}t | |qS r*   )r[   )rU   llo)rN   r*   r+   ra      s    
c                 S   s   g | ]	\}}| |qS r*   )update)rU   frc   r*   r*   r+   ra      s    c                    s   g | ]  fd ddD qS )c                    s   i | ]}| | qS r*   r*   rU   rV   rf   r*   r+   rX      s    z#load.<locals>.<listcomp>.<dictcomp>rP   r*   )rU   r*   rh   r+   ra      s
    

c                 S      g | ]}|d  qS )
seq_lengthr*   )rU   cr*   r*   r+   ra          )dtyper   asym_len)lenzipr   r1   r2   int64r   r   )
r\   r;   r<   r]   rN   r^   r=   all_chain_featuresall_chain_labelsrn   r*   )r=   rN   r;   r<   r+   loadv   s6   


rt   featureslabelsseed	batch_idxdata_idxis_distillationc              	   C   s  |dkr8|d us
J t j||dd tjd| jjd }tj | | jk }	W d    n1 s2w   Y  n| jj}d}	t	||d< t	|	|d< t	||d< |r[d	|v r[|
d	 t	|d
 }
t| ||
d\}}|d ury|d d d|d< t j||dd> tjdd|d< tj||d}dd | D }t  t||j|| }W d    n1 sw   Y  W d    n1 sw   Y  |d urdd |D }t  t|}W d    ||fS 1 sw   Y  ||fS )Ntrain	recyclingrK   r   r   num_recycling_itersuse_clamped_faperz   
msa_chainsrj   )r   r   rS   protein_featurei{  crop_and_fix_size_seed)desired_keysc                 S      i | ]
\}}|t |qS r*   torchtensorrT   r*   r*   r+   rX          zprocess.<locals>.<dictcomp>c                 S   s   g | ]}d d |  D qS )c                 S   r   r*   r   rT   r*   r*   r+   rX      r   z&process.<locals>.<listcomp>.<dictcomp>rY   )rU   rc   r*   r*   r+   ra      s    zprocess.<locals>.<listcomp>)r	   
numpy_seedr1   randomrandintr   max_recycling_itersranduse_clamped_fape_probintpopr,   r3   r
   filterrY   r   no_gradr   r   )r   r   ru   rv   rw   rx   ry   rz   	num_itersr   r   r'   r)   r*   r*   r+   process   sN   




r   c           
   	   K   sR   d|vr|n| d}tdi |d|i\}}	t| |||	||||\}}	||	fS )Nr=   r*   )r   rt   r   )
r   r   rw   rx   ry   rz   load_kwargsr=   ru   rv   r*   r*   r+   load_and_process   s   r   c                   @   sj   e Zd Z				dddZdd Zdd	d
Zdd Zdd Zedd Z	ede
eee f fddZdS )UnifoldDatasetr{   NF c	                 C   s  || _ dd }	|	tj | j || d }
|	tj | j || d | _| | j| _i | _| jD ]}| j| }|
| | j|< q1|
| _t	d
t| jt| j tj | j d| _tj | j d| _tj | j |d }|d	krtj |r|s|	|| _t	d

t| j tj | j d| _tj | j d| _nd | _|jt  |jd  | _|d ur|| j nt| j| _|| _| | j\| _| _| _| | j\| _| _| _| jd ur| | j\| _| _ | _!|j"| _#|| _$|j%| _%d S )Nc                 S   s   t t| dddS )Nrutf-8encoding)jsonrt   open)filenamer*   r*   r+   	load_json   s   z*UnifoldDataset.__init__.<locals>.load_jsonz_sample_weight.jsonz_multi_label.jsonz$load {} chains (unique {} sequences)pdb_features
pdb_labelszsd_train_sample_weight.jsonr{   z"load {} self-distillation samples.sd_features	sd_labelsr   )&rE   rD   rF   multi_label_inverse_mapinverse_multi_labelsample_weightseq_sample_weightloggerinfoformatro   feature_path
label_pathisfilesd_sample_weightsd_feature_pathsd_label_path
batch_sizedistributed_utilsget_data_parallel_world_sizeupdate_freqdata_lenr   cal_sample_weightnum_seqseq_keysseq_sample_prob	num_chain
chain_keyssample_probsd_num_chainsd_chain_keyssd_sample_probdatar   rw   sd_prob)selfargsrw   r   	data_pathr   max_step
disable_sdjson_prefixr   r   chainentitysd_sample_weight_pathr*   r*   r+   __init__   sx   








zUnifoldDataset.__init__c                    s>   t   }t   fdd|D }t|}|||fS )Nc                    s   g | ]} |  qS r*   r*   rg   r   
sum_weightr*   r+   ra   1  s    z4UnifoldDataset.cal_sample_weight.<locals>.<listcomp>)listkeyssumvaluesro   )r   r   	prot_keysr   num_protr*   r   r+   r   .  s
   
z UnifoldDataset.cal_sample_weightc                 C   s
  d}| j dkrvtj| j|dd] | jd ur!tjdd | jk nd}|r7tjj	| j
| jd}| j| }|}n/|sNtjj	| j| jd}| j| }| j| }ntjj	| j| jd}| j| }tj	| j| }W d    n1 spw   Y  n
| j| }| j| }|||fS )NFr{   data_sampler}   r   r   )p)r   r	   r   rw   r   r1   r   r   r   choicer   r   r   r   r   r   r   r   r   r   r   )r   idxsample_by_seqrz   prot_idx
label_nameseq_nameseq_idxr*   r*   r+   sample_chain5  s@   







zUnifoldDataset.sample_chainc           	      C   sn   | j |dd\}}}|s| j| jfn| j| jf\}}t| j| j| j|| j	 |||g|d |g|d dd\}}|S )NT)r   
rx   ry   rz   r\   r;   r<   r]   rN   r^   r=   )
r   r   r   r   r   r   r   r   rw   r   )	r   r   r:   rM   rz   feature_dirrN   ru   rb   r*   r*   r+   __getitem__R  s2   

zUnifoldDataset.__getitem__c                 C   s   | j S r   )r   r   r*   r*   r+   __len__i  s   zUnifoldDataset.__len__c                 C   s   t j| ddS )Nr   dim)r	   collate_dict)samplesr*   r*   r+   collaterl  s   zUnifoldDataset.collatermappingc              
   C   s`   i }|   D ]'\}}|D ] }||v r(|| }||ks(J d| d| d| d|||< qq|S )Nzmultiple entities (z, z) exist for reference .r   )r   inverse_mappingentrefsrefent_2r*   r*   r+   r   q  s   

zUnifoldDataset._inverse_mapr{   NFr   )F)__name__
__module____qualname__r   r   r   r   r   staticmethodr   r   strr   r   r*   r*   r*   r+   r      s    
?

 r   c                       s   e Zd Z				ddejdedejded	ed
ee dedef fddZ	dd Z
edd Zedd Zedd Zedd Z  ZS )UnifoldMultimerDatasetr{   NFr   r   rw   r   r   r   r   r   r   c	           
   
      s   t  |||||||| || _tttj| j|d dd| _	| 
| j| _tj| jd| _tj| jd| _tj| jd| _|j| _| jdkrk| | j| j	| j| j\| _| _| | j\| _| _| _d S d S )Nzpdb_assembly.jsonr   r   r   pdb_uniprotsr   r{   )superr   r   r   rt   r   rD   rE   rF   pdb_assembly
get_chainsr   
pdb_chainsmonomer_feature_pathuniprot_msa_pathr   
max_chainsr   filter_pdb_by_max_chainsr   r   r   r   r   )
r   r   rw   r   r   r   r   r   r   kwargs	__class__r*   r+   r     s8   


zUnifoldMultimerDataset.__init__c                    s    |\}}}|r|g}|g}jd j}}}	d }
nE|  jv rGjdkrG fddj  d D }dd j  d D }
nj  }d }
fdd|D }jjj	}}}	t
jjj|j |||||||	|
dd	S )
Nr{   c                    s   g | ]} d  | qS )rb   r*   )rU   id)pdb_idr*   r+   ra     s    
z6UnifoldMultimerDataset.__getitem__.<locals>.<listcomp>chainsc                 S   s   g | ]}|qS r*   r*   )rU   tr*   r*   r+   ra     s    opersc                    s   g | ]} j | qS r*   )r   )rU   chain_idr   r*   r+   ra     s    
Fr   )r   r   r   get_pdb_namer   r   r   r   r   r   r   r   rw   r   )r   r   seq_idrM   rz   r]   r\   r   r   r   r^   r*   )r  r   r+   r     sR   





z"UnifoldMultimerDataset.__getitem__c                 C   sh   t | dkrd S dd | D }dd | D }z	tj|dd}W n ty+   td|w |s0d }||fS )Nr   c                 S   ri   )r   r*   r_   r*   r*   r+   ra     rl   z3UnifoldMultimerDataset.collater.<locals>.<listcomp>c                 S   s    g | ]}|d  dur|d  qS )r   Nr*   r_   r*   r*   r+   ra     s     r   r   zcannot collate features)ro   r	   r   BaseException
ValueError)r   featslabsr*   r*   r+   r     s   
zUnifoldMultimerDataset.collaterc                 C   s   |  dd S )Nrb   r   )split)r   r*   r*   r+   r    s   z#UnifoldMultimerDataset.get_pdb_namec                 C   s:   i }| D ]}t |}||vrg ||< || | q|S r   )r   r  append)canon_chain_mapr   r   pdbr*   r*   r+   r     s   
z!UnifoldMultimerDataset.get_chainsc                    s   i  | D ])}||v rt || d }||kr| |  |< qt | | }|dkr-| |  |< q fddD }tdt | t    dt |  dt t |  dt  d| 
  |fS )	Nr  r   c                    s$   i | ]}t | v r|| qS r*   )r   r  rg   new_pdb_chainsr   r*   r+   rX     s
    zCUnifoldMultimerDataset.filter_pdb_by_max_chains.<locals>.<dictcomp>zfiltered out z / z PDBs (z chains) by max_chains )ro   r   r   )r   r   r   r  r   sizenew_sample_weightr*   r  r+   r    s0   z/UnifoldMultimerDataset.filter_pdb_by_max_chainsr   )r   r   r   mlc
ConfigDictr   r   r   boolr   r   r   r   r  r   r  __classcell__r*   r*   r  r+   r     s>    	!1


	r   )NFr   )NNNNF)Nr   NNF)r   NNF):r   loggingrD   typingr   r   r   r   r   r   r   ml_collectionsr  numpyr1   r   unicore.datar   r	   unicore.distributedr
   r   r   data.data_opsr   r   data.processr   r   data.process_multimerr   r   r   r   r   RotationTranslationr   	OperationNumpyExampleTorchExample	getLoggerr   r   r  r   r,   ndarrayr7   	lru_cacher  rL   r[   rt   r   r   r   r   r*   r*   r*   r+   <module>   s    



&



8
	
4
 