o
    ߥi9                     @   st  d Z ddlZddlmZmZmZ ddlZddlm	Z	m
Z
 ddlmZ eeejf Zeh dZdZd	Zd
ee defddZdeeef defddZdee dedededee f
ddZdededededef
ddZdedefddZdd Zdd  Zdedefd!d"Zdeeef fd#d$Zd%d& Zd'edefd(d)Z d*edefd+d,Z!d-d. Z"d/d0 Z#d1d2 Z$d3d4 Z%dS )5z+Feature processing logic for multimer data     N)IterableListMutableMapping)msa_pairingresidue_constants   )correct_template_restypes>!   msaaatypesym_idasym_idnum_symasym_lenmem_peakmsa_maskseq_mask	bert_mask	entity_id
msa_chains
queue_size
resolution
seq_lengthentity_maskall_atom_maskdeletion_meannum_templatesresidue_indexnum_alignmentsdeletion_matrixtemplate_aatypecluster_bias_maskall_atom_positionstemplate_sum_probsassembly_num_chainsall_chains_entity_idstemplate_all_atom_maskall_crops_all_chains_masktemplate_all_atom_positionsall_crops_all_chains_positions all_crops_all_chains_residue_ids   i   chainsreturnc                 C   s&   t ttdd | D }|dkS )z@Checks if a list of chains represents a homomer/monomer example.c                 S   s&   g | ]}t |d  |d  dk qS )r   r   )npunique).0chain r1   k/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/science/unifold/data/process_multimer.py
<listcomp>I   s    z*_is_homomer_or_monomer.<locals>.<listcomp>r   )lenr-   r.   concatenate)r+   num_unique_chainsr1   r1   r2   _is_homomer_or_monomerD   s   
r7   all_chain_featuresc                 C   s\   t |  | }t| }|rtj|d}t|}t|t|td}tj||td}t	|}|S )zRuns processing on features to augment, pair and merge.

    Args:
        all_chain_features: A MutableMap of dictionaries of features for each chain.

    Returns:
        A dictionary of features.
    )r+   msa_crop_sizepair_msa_sequencesmax_templates)np_chains_listr;   r<   )
process_unmerged_featuresr7   r   create_paired_featuresdeduplicate_unpaired_sequencescrop_chainsMSA_CROP_SIZEMAX_TEMPLATESmerge_chain_featuresprocess_final)r8   r=   r;   
np_exampler1   r1   r2   pair_and_mergeP   s.   
rG   chains_listr:   r;   r<   c                 C   s,   g }| D ]}t ||||d}|| q|S )ak  Crops the MSAs for a set of chains.

    Args:
        chains_list: A list of chains to be cropped.
        msa_crop_size: The total number of sequences to crop from the MSA.
        pair_msa_sequences: Whether we are operating in sequence-pairing mode.
        max_templates: The maximum templates to use per chain.

    Returns:
        The chains cropped.
    r9   )_crop_single_chainappend)rH   r:   r;   r<   cropped_chainsr0   cropped_chainr1   r1   r2   rA   u   s   rA   r0   c                 C   s  | d }|r@| d }t ||d }| d d|ddf }t t j|tjkdd}t ||}t || d}	t ||	}nt ||}d	| v oK|}
|
r[| d	 jd }t ||}| D ]E}|d
d }|tj	v rz| | d|ddf | |< q]|tj
v rd
|v r|r| | d|ddf | |< q]| | d|ddf | |< q]t j|t jd| d< |
rt j|t jd| d< |rt j|t jd| d< | S )z'Crops msa sequences to `msa_crop_size`.r   num_alignments_all_seq   msa_all_seqNr   axisr   r   _all_seqdtyper   )r-   minimumsumanyr   MSA_GAP_IDXmaximumshapesplitTEMPLATE_FEATURESMSA_FEATURESasarrayint32)r0   r:   r;   r<   msa_sizemsa_size_all_seqmsa_crop_size_all_seqrO   num_non_gapped_pairsmax_msa_crop_sizeinclude_templatesr   templates_crop_sizekk_splitr1   r1   r2   rI      sN   



rI   rF   c                 C   s   t | } t| } t| } | S )zCFinal processing steps in data pipeline, after merging and pairing.)_make_seq_mask_make_msa_mask_filter_featuresrF   r1   r1   r2   rE      s   rE   c                 C   s   | d dk tj| d< | S )Nr   r   r   )astyper-   float32rl   r1   r1   r2   ri      s   ri   c                 C   sD   t j| d t jd| d< | d dkt j}| d  |d 9  < | S )z:Mask features are all ones, but will later be zero-padded.r	   rS   r   r   r   N)r-   	ones_likeint8rm   )rF   r   r1   r1   r2   rj      s   rj   c                 C   s   dd |   D S )z4Filters features of example to only those requested.c                 S   s   i | ]\}}|t v r||qS r1   )REQUIRED_FEATURES)r/   rg   vr1   r1   r2   
<dictcomp>   s    z$_filter_features.<locals>.<dictcomp>)itemsrl   r1   r1   r2   rk      s   rk   c                 C   s   t | }| D ]R}d|v rtj|dtjd|d< d|v r*tj|dtjd|d< tj|d dd|d< d	|vrQtj|d
  }||d< tt	|j
dg |d	< t||d< q| D ]}|d dktj|d< q[dS )z;Postprocessing stage for per-chain features before merging.deletion_matrix_intrS   r   deletion_matrix_int_all_seqdeletion_matrix_all_seqr   rP   r   r!   r
   r      r#   r   r   N)r4   r-   r^   poprn   meanr   STANDARD_ATOM_MASKzeroslistrZ   rm   r_   )r8   
num_chainschain_featuresr   r1   r1   r2   r>      s<   


r>   c                 C   sX   t d| ft jt d| ddft jt dt jt d| dft jdS )Nr   %   rx   )r   r   )r   r'   r"   r%   )r-   r|   rm   int64rn   )n_resr1   r1   r2   empty_template_feats	  s
   r   monomer_featuresc                 C   s  | d j d dkr| t| d j d  i }h d}|  D ]S\}}||v r2tj|d |jd}n1|dkrBtj|ddtj	}n!|dkrR|j d dkrQt
|}n|dkrYd	}n
|d
krc|tj}|drn|tj}|||< qd| v r| dtj| d< |d |S )z;Reshapes and modifies monomer features for multimer models.r   r   r
   >   sequencer   domain_namer   rS   rP   template_all_atom_masksr%   r	   _maskru   r   r"   )rZ   updater   rt   r-   r^   rT   argmaxrm   r_   r   uint8endswithrn   ry   )r   	convertedunnecessary_leading_dim_featsfeature_namefeaturer1   r1   r2   convert_monomer_features  s@   

r   numc                 C   sd   | dkrt d|  d| d } g }| dkr-|t| d td  | d d } | dksd|S )a`  Encodes a number as a string, using reverse spreadsheet style naming.

    Args:
        num: A positive integer.

    Returns:
        A string that encodes the positive integer using reverse spreadsheet style,
        naming e.g. 1 = A, 2 = B, ..., 27 = AA, 28 = BA, 29 = CA, ... This is the
        usual way to encode chain IDs in mmCIF files.
    r   z$Only positive integers allowed, got .r      A )
ValueErrorrJ   chrordjoin)r   outputr1   r1   r2   int_id_to_str_id@  s   
r   c                 C   s   i }t t}| D ]#}d|v sJ t|d }||vr#t|d ||< |||  | q	g }d}| D ]D\}}t|}	t|ddD ]5\}
}|d }|t	| |d< |
t	| |d< |t	| |d< |	t	| |d< |d7 }|| qCq5|S )	a  Add features to distinguish between chains.

    Args:
        all_chain_features: A dictionary which maps chain_id to a dictionary of
            features for each chain.

    Returns:
        all_chain_features: A dictionary which maps strings of the form
            `<seq_id>_<sym_id>` to the corresponding chain features. E.g. two
            chains from a homodimer would have keys A_1 and A_2. Two chains from a
            heterodimer would have keys A_1 and B_1.
    r   r   )startr   r   r   r   r   )
collectionsdefaultdictr}   strr4   rJ   rt   	enumerater-   ones)r8   seq_to_entity_idgrouped_chainsr   seqnew_all_chain_featureschain_idr   group_chain_featuresr   r   r   r1   r1   r2   add_assembly_featuresV  s,   
	r   c                 C   sj   t | } | d jd }||k r3dD ]}t| | d|| fdf| |< qt| d d|| ff| d< | S )Nr	   r   )r	   r   r   r   r   )r   r   r    )dictrZ   r-   pad)rF   min_num_seqnum_seqfeatr1   r1   r2   pad_msa}  s   

r   c                 C   s:   t | d} g d}|D ]}|| v r| | d| |< q| S )Ni   )r   r#   r   r   r   r   )r   reshape)rF   no_dim_keysrg   r1   r1   r2   post_process  s   
r   c           
      C   sr   t dd | D }g }t|D ]\}}t||vr|| qtj| || gdd}tj||| gdd}	||	fS )Nc                 S   s   g | ]}t |qS r1   )tuple)r/   mr1   r1   r2   r3     s    zmerge_msas.<locals>.<listcomp>r   rP   )setr   r   rJ   r-   r5   )
r	   del_matnew_msanew_del_matcur_msa_setnew_rowsisret_msaret_del_matr1   r1   r2   
merge_msas  s   
r   )&__doc__r   typingr   r   r   numpyr-   &modelscope.models.science.unifold.datar   r   utilsr   r   ndarrayFeatureDict	frozensetrq   rC   rB   boolr7   rG   intrA   rI   rE   ri   rj   rk   r>   r   r   r   r   r   r   r   r1   r1   r1   r2   <module>   s`   %

%

 
4#*'