o
    ϯimZ                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZ dd Zd	d
 Z		d.ddZ	d/ddZG dd dejZG dd deZ G dd deZ!G dd de!Z"G dd de!Z#dd Z$G dd deZ%G dd dejZ&G d d! d!e&Z'e d"d# Z(d$d% Z)d&d' Z*d(d) Z+G d*d+ d+e,Z-G d,d- d-eZ.dS )0    N)	lru_cache)CallableListOptionalTuple)	g_pathmgr)trunc_normal_)VerboseNNModulecast_if_src_dtypec                    s    fddt fddt| D }t |dddddf |dddddf< t |dddddf |dddddf< t|dS )	z Sinusoid position encoding tablec                    s    fddt D S )Nc              	      s(   g | ]}t d d|d     qS )i'     )nppower).0hid_j)d_hidposition ]/home/ubuntu/.local/lib/python3.10/site-packages/imagebind/models/multimodal_preprocessors.py
<listcomp>   s    zOget_sinusoid_encoding_table.<locals>.get_position_angle_vec.<locals>.<listcomp>)ranger   )r   r   r   get_position_angle_vec   s   z;get_sinusoid_encoding_table.<locals>.get_position_angle_vecc                    s   g | ]} |qS r   r   )r   pos_i)r   r   r   r   %       z/get_sinusoid_encoding_table.<locals>.<listcomp>Nr   r      )r   arrayr   sincostorchFloatTensor	unsqueeze)
n_positionr   sinusoid_tabler   )r   r   r   get_sinusoid_encoding_table   s   ..r#   c              	   C   s   |j d }|| kr|S |j d }t|tjtj\}}tjj|dt	t
|t	t
||ddddt
| | dd}|rJt|tjtj\}}|dddddd|}|S )Nr   r      r   bicubic)scale_factormode)shaper
   r   bfloat16float32nn
functionalinterpolatereshapeintmathsqrtpermuteview)target_spatial_size	pos_embedNdimupdated_r   r   r   interpolate_pos_encoding_2d-   s    

$r;   r   c           	      C   s
  |dks|dksJ d|j d | }| |kr|S |d |d ks%J d|d d d |f }|d d |d f }|d u sC|d dkrIt| |}n3|d dkrxt|dksYJ d|d }|d |d	  }|d||d}t| |d
 d}ntdtj||fddS )Nr   r   zthere is 1 CLS token or noner$   z?Interpolation of pos embed not supported for non-square layouts   z$temporal interpolation not supportedr   )r   r   .z,This type of interpolation isn't implementedr8   )r)   r;   lenr4   r    
ValueErrorr   cat)	npatch_per_imgr6   patches_layoutinput_shapefirst_patch_idxr7   	class_emb
num_framesnum_spatial_tokensr   r   r   interpolate_pos_encodingA   s*   rI   c                 C   s   t | ||||d}|S )N)rD   rE   )rI   )rB   r6   rC   rD   rE   r   r   r   _get_pos_embeddingh   s   rJ   c                       s>   e Zd ZdZd
deej f fddZdd Zdd	 Z	  Z
S )PatchEmbedGenericz
    PatchEmbed from Hydra
    N
norm_layerc                    s8   t    t|dkrtj| | _n|d | _|| _d S Nr   r   )super__init__r?   r,   
SequentialprojrL   )self	proj_stemrL   	__class__r   r   rO   ~   s
   


zPatchEmbedGeneric.__init__c                 C   sr   t   t dg| }| |}W d    n1 sw   Y  |jd }t|jdd  }t|}|||fS )Nr   r   )r   no_gradzerosrQ   r)   tupler   prod)rR   img_size	dummy_img	dummy_out	embed_dimrC   num_patchesr   r   r   get_patch_layout   s   



z"PatchEmbedGeneric.get_patch_layoutc                 C   s4   |  |}|ddd}| jd ur| |}|S )Nr   r   )rQ   flatten	transposerL   rR   xr   r   r   forward   s
   


zPatchEmbedGeneric.forwardN)__name__
__module____qualname____doc__r   r,   ModulerO   r_   rd   __classcell__r   r   rT   r   rK   y   s
    rK   c                       s>   e Zd Zdedededededdf fdd	Zd
d Z  ZS ) SpatioTemporalPosEmbeddingHelperrC   r^   num_cls_tokensr]   	learnablereturnNc                    st   t    || _|| _|| _|| | _|| _| jr.tt	
d| j|| _t| jdd d S | dt| j| d S )Nr   {Gz?stdr6   )rN   rO   rm   rC   r^   
num_tokensrn   r,   	Parameterr   rW   r6   r   register_bufferr#   )rR   rC   r^   rm   r]   rn   rT   r   r   rO      s   

z)SpatioTemporalPosEmbeddingHelper.__init__c                 C   s.   |j }t|d| j | j| j|| jd}|S )Nr   )r6   rC   rD   rE   )r)   rJ   sizerm   r6   rC   )rR   vision_inputall_vision_tokensrD   r6   r   r   r   get_pos_embedding   s   z2SpatioTemporalPosEmbeddingHelper.get_pos_embedding)	rf   rg   rh   r   r0   boolrO   ry   rk   r   r   rT   r   rl      s    rl   c                       st   e Zd Z					ddedee ded	ed
ee dede	ddf fddZ
e dd Zdd ZdddZ  ZS )RGBDTPreprocessorr%      r}   r   NFopenclip	rgbt_stem
depth_stemrZ   rm   pos_embed_fnuse_type_embedinit_param_stylero   c           	         s   t    |d ur|n|}||\| _| _| _|| _|| _|d u| _|| _	|| _
| jr8|| j|| j| jd| _| j
dkrJttd| j
| j| _| j	rYttdd| j| _| | d S )N)rC   rm   r^   r]   r   r   )rN   rO   r_   rC   r^   r]   r   r   use_pos_embedr   rm   pos_embedding_helperr,   rt   r   rW   	cls_token
type_embedinit_parameters)	rR   r   r   rZ   rm   r   r   r   stemrT   r   r   rO      s4   



zRGBDTPreprocessor.__init__c                 C   s   |dkr0| j d }| jrtj| jj | j j|9  _| jdkr/tj| j |  j|9  _n|dkr<| jj	
d ntd| | jrOtj| j d S d S )Nr~         r   vitUnknown init )r]   r   r,   initnormal_r   r6   rm   r   datafill_r@   r   r   rR   r   scaler   r   r   r      s   

z!RGBDTPreprocessor.init_parametersc                 C   s   ||}|j dksJ |jd | jksJ |jd }| jdkr0| j|dd}tj||fdd}| jr>| j	
||}|| }| jrK|| j|dd }|S Nr%   r   r   r$   r   r>   )ndimr)   r]   rm   r   expandr   rA   r   r   ry   r   r   )rR   inputr   masktokensBclass_tokensr6   r   r   r   tokenize_input_and_cls_pos   s   

z,RGBDTPreprocessor.tokenize_input_and_cls_posc                 C   sz   |d urt  |d ur| || j|}|d ur| || j|}|d ur,|d ur,|| }n|d ur2|n|}d|ii d}|S )Nr   trunkhead)NotImplementedErrorr   r   r   )rR   visiondepth
patch_maskvision_tokensdepth_tokensfinal_tokensreturn_dictr   r   r   rd     s$   
zRGBDTPreprocessor.forward)r|   r   NFr~   )NNN)rf   rg   rh   rK   r   r   r0   r   rz   strrO   r   rV   r   r   rd   rk   r   r   rT   r   r{      s6    	'
r{   c                       4   e Zd Zdeddf fddZd fdd	Z  ZS )	AudioPreprocessor
audio_stemro   Nc                       t  jd|d d| d S N)r   r   r   rN   rO   )rR   r   kwargsrT   r   r   rO   .     zAudioPreprocessor.__init__c                       t  j|dS N)r   rN   rd   )rR   audiorT   r   r   rd   1     zAudioPreprocessor.forwardre   rf   rg   rh   rK   rO   rd   rk   r   r   rT   r   r   -      r   c                       r   )	ThermalPreprocessorthermal_stemro   Nc                    r   r   r   )rR   r   r   rT   r   r   rO   6  r   zThermalPreprocessor.__init__c                    r   r   r   )rR   thermalrT   r   r   rd   9  r   zThermalPreprocessor.forwardre   r   r   r   rT   r   r   5  r   r   c                 C   s,   t j| | dd}|td |d |S )NF)requires_gradz-infr   )r   emptyr   floattriu_)context_lengthr   r   r   r   build_causal_attention_mask=  s   
r   c                       s`   e Zd Z			ddededededed	ed
eddf fddZe dddZ	dd Z
  ZS )TextPreprocessorTr   r~   
vocab_sizer   r]   causal_maskingsupply_seq_len_to_headrm   r   ro   Nc           	         s   t    || _|| _t||| _tt	d| j| || _
|| _| jr1t| j}| d| || _|| _|| _|dkrS| jdu sGJ dttd| j|| _| | d S )Nr   r   r   Fz%Masking + CLS token isn't implemented)rN   rO   r   r   r,   	Embeddingtoken_embeddingrt   r   r   r6   r   r   ru   r   rm   r]   rW   r   r   )	rR   r   r   r]   r   r   rm   r   r   rT   r   r   rO   G  s(   


zTextPreprocessor.__init__c                 C   s   t jj| jjdd t jj| jdd |dkr3| jd }| jdkr1t j| j |  j|9  _d S d S |dkr@| jj	
d d S td| )	Nrp   rq   {Gz?r~   r   r   r   r   )r,   r   r   r   weightr6   r]   rm   r   r   r   r@   r   r   r   r   r   i  s   

z TextPreprocessor.init_parametersc                 C   s   |  |}| jdkr |jd }| j|dd}tj||fdd}|| j }d|ii d}| jr;|j	dd}d|i|d< | j
rH|d	 d
| ji |S )Nr   r$   r   r>   r   r   seq_lenr   r   	attn_mask)r   rm   r)   r   r   r   rA   r6   r   argmaxr   updater   )rR   texttext_tokensr   r   r   text_lengthsr   r   r   rd   z  s&   



zTextPreprocessor.forward)Tr   r~   )r~   )rf   rg   rh   r0   rz   r   rO   r   rV   r   rd   rk   r   r   rT   r   r   F  s0    	"r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )Im2Videoz&Convert an image into a trivial video.r   c                    s   t    || _d S re   )rN   rO   time_dim)rR   r   rT   r   r   rO     s   

zIm2Video.__init__c                 C   s4   |j dkr|| jS |j dkr|S td|j )Nr=      zDimension incorrect )r   r    r   r@   r)   rb   r   r   r   rd     s
   

zIm2Video.forwardr   )rf   rg   rh   ri   rO   rd   rk   r   r   rT   r   r     s    r   c                       s*   e Zd Zd fdd	Z fddZ  ZS )PadIm2Videor   c                    s6   t  j|d |dksJ |dv sJ || _|| _d S )N)r   r   )zerorepeat)rN   rO   ntimespad_type)rR   r   r   r   rT   r   r   rO     s
   
zPadIm2Video.__init__c                    s   t  |}|j| j dkrM| jdkr(dgt|j }| j|| j< ||}|S | jdkrMddgt|j }| j|j| j  |d| j d < tj	
||}|S )Nr   r   r   r   r   )rN   rd   r)   r   r   r?   r   r   r,   r-   pad)rR   rc   	new_shapepadargrT   r   r   rd     s   


 zPadIm2Video.forwardr   )rf   rg   rh   rO   rd   rk   r   r   rT   r   r     s    r   c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a9  
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr)r   nr   r   r   r     r   z$bytes_to_unicode.<locals>.<listcomp>)listr   ordappenddictzip)bscsr   br   r   r   bytes_to_unicode  s    
r   c                 C   s6   t  }| d }| dd D ]}|||f |}q|S )zReturn set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charcharr   r   r   	get_pairs  s   r   c                 C   s"   t | } tt| } |  S re   )ftfyfix_texthtmlunescapestripr   r   r   r   basic_clean  s   
r   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr   r   r   r   r   whitespace_clean  s   r   c                   @   s>   e Zd ZddefddZdd Zdd Zd	d
 ZdddZdS )SimpleTokenizerM   bpe_pathc                 C   s@  t  | _dd | j D | _t|d}t| }t	| 
dd}W d    n1 s4w   Y  |dd }dd	 |D }tt   }|d
d	 |D  }|D ]
}|d| qX|ddg tt|tt|| _dd | j D | _tt|tt|| _ddd| _tdtj| _|| _d S )Nc                 S      i | ]\}}||qS r   r   r   kvr   r   r   
<dictcomp>      z,SimpleTokenizer.__init__.<locals>.<dictcomp>rbutf-8
r   i  c                 S   s   g | ]}t | qS r   )rX   split)r   merger   r   r   r     s    z,SimpleTokenizer.__init__.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )</w>r   )r   r  r   r   r   r     r    <|startoftext|><|endoftext|>c                 S   r  r   r   r  r   r   r   r    r  )r  r  z[<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+)r   byte_encoderitemsbyte_decoderr   openioBytesIOreadgzipdecoder  r   valuesr   joinextendr   r   r   r?   encoderdecoder	bpe_rankscacher   compile
IGNORECASEpatr   )rR   r  r   fh	bpe_bytesmergesvocabr  r   r   r   rO     s0   
zSimpleTokenizer.__init__c           
         sj  | j v r
 j | S t|d d |d d f }t|}|s#|d S 	 t| fddd}| jvr4nu|\}}g }d}|t|k rz|||}	||||	  |	}W n   |||d   Y n3|| |kr|t|d k r||d  |kr|||  |d	7 }n|||  |d7 }|t|k sBt|}|}t|dkrnt|}q$d
	|}| j |< |S )Nr$   r  Tc                    s    j | tdS )Ninf)r  getr   )pairrR   r   r   <lambda>  s    z%SimpleTokenizer.bpe.<locals>.<lambda>)keyr   r   r   r   )
r   rX   r   minr  r?   indexr  r   r  )
rR   tokenr   r   bigramfirstsecondnew_wordijr   r+  r   bpe  sH   


,


zSimpleTokenizer.bpec                    sn   g }t t| }t j|D ]#}d fdd|dD }| fdd 	|
dD  q|S )Nr  c                 3       | ]} j | V  qd S re   )r  )r   r   r+  r   r   	<genexpr>:  s    z)SimpleTokenizer.encode.<locals>.<genexpr>r	  c                 3   r8  re   )r  )r   	bpe_tokenr+  r   r   r9  ;  s    

r   )r   r   lowerr   findallr#  r  encoder  r7  r  )rR   r   
bpe_tokensr0  r   r+  r   r=  6  s   
zSimpleTokenizer.encodec                    sD   d  fdd|D }t fdd|D jddddd	}|S )
Nr  c                       g | ]} j | qS r   )r  )r   r0  r+  r   r   r   A  r  z*SimpleTokenizer.decode.<locals>.<listcomp>c                    r?  r   )r  )r   cr+  r   r   r   C  r  r	  replace)errorsr  r   )r  	bytearrayr  rA  )rR   r   r   r   r+  r   r  @  s   zSimpleTokenizer.decodeNc                    s   |s j }t|tr|g} jd  jd } fdd|D }tjt||tjd}t|D ]\}}|d |d  |g }t	|||d t|f< q0t|dkrW|d S |S )Nr  r  c                    s   g | ]
}g  | qS r   )r=  )r   r   rR   	sot_tokenr   r   r   R  s    z,SimpleTokenizer.__call__.<locals>.<listcomp>)dtyper   r   )
r   
isinstancer   r  r   rW   r?   long	enumeratetensor)rR   textsr   	eot_token
all_tokensresultr5  r   r   rD  r   __call__I  s   


zSimpleTokenizer.__call__)r   re   )	rf   rg   rh   r   rO   r7  r=  r  rO  r   r   r   r   r     s    )
	r   c                       sl   e Zd Z				ddedededed	ed
ee deddf fddZ	e
 dd Zdd Zdd Z  ZS )IMUPreprocessor   i  r   Nr~   kernel_sizeimu_stemr]   rZ   rm   r   r   ro   c                    s   t    || _|| _|d u| _|| _|| _tt	
d|d | | || _| jdkr9tt	d| j| j| _| | d S rM   )rN   rO   rT  r]   r   rm   rS  r,   rt   r   r   r6   rW   r   r   )rR   rS  rT  r]   rZ   rm   r   r   rT   r   r   rO   _  s   



zIMUPreprocessor.__init__c                 C   sz   t jj| jdd |dkr)| jd }| jdkr't j| j |  j|9  _d S d S |dkr6| jjd d S t	d| )Nr   rq   r~   r   r   r   r   )
r,   r   r   r6   r]   rm   r   r   r   r@   r   r   r   r   r   z  s   

zIMUPreprocessor.init_parametersc                 C   s|   | ||}|jdksJ |jd | jksJ |jd }| jdkr4| j|dd}tj	||fdd}| j
r<|| j }|S r   )rL   rQ   r   r)   r]   rm   r   r   r   rA   r   r6   )rR   r   r   r   r   r   r   r   r   r     s   


z*IMUPreprocessor.tokenize_input_and_cls_posc                 C   sX   | d| j| jdddd}||d|dd}| || j}d|ii d}|S )Nr$   r   r   r   r%   r   r   )unfoldrS  r3   r/   rv   r   rT  )rR   imu
imu_tokensr   r   r   r   rd     s"   zIMUPreprocessor.forward)rQ  r   Nr~   )rf   rg   rh   r0   rK   r   r   r   r   rO   r   rV   r   r   rd   rk   r   r   rT   r   rP  ^  s4    	
rP  )Nr   )r   )/r  r   r  r1   	functoolsr   typingr   r   r   r   r   numpyr   regexr   r   torch.nnr,   iopath.common.file_ior   timm.layersr   imagebind.models.helpersr	   r
   r#   r;   rI   rJ   rj   rK   rl   r{   r   r   r   r   r   r   r   r   r   r   objectr   rP  r   r   r   r   <module>   sJ   
,
'#j	P
m