o
    Si                     @   s   d dl mZmZ d dlmZmZmZmZmZm	Z	 d dl
Zd dlZd dlmZmZ d dlmZmZmZ eG dd dZeG dd	 d	eZdS )
    )asdict	dataclass)AnyDictListOptionalSequenceUnionN)FeatureExtractorregister_extractor)Secondscompute_num_frames_from_samplesis_module_availablec                   @   s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< deeef fddZedeeef dd fddZdS )S3PRLSSLConfiga  
    In general, the output feature dimension of base model (e.g., wav2vec2) and
    large model (e.g., wav2vec2_large_ll60k) are 768 and 1024, repectively. The
    frame shift (stride) is 0.02s (20ms).

    Please check
        https://github.com/s3prl/s3prl/blob/main/s3prl/upstream/README.md and
        https://s3prl.github.io/s3prl/tutorial/upstream_collection.html
    for details of available self-supervised models.
    >  sampling_ratewav2vec2_large_ll60k	ssl_modellayerg{Gz?frame_shifti   feature_dimcpudevicereturnc                 C   s   t | S N)r   self r   G/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/features/ssl.pyto_dict   s   zS3PRLSSLConfig.to_dictdatac                 C   s   t di | S )Nr   )r   )r!   r   r   r   	from_dict"   s   zS3PRLSSLConfig.from_dictN)__name__
__module____qualname____doc__r   int__annotations__r   strr   r   floatr   r   r   r   r    staticmethodr"   r   r   r   r   r      s   
  r   c                       s(  e Zd ZdZeZddee f fddZe	de
fddZe	defd	d
ZdedefddZdejdedejfddZ	ddeejejeej eej f dedeeejejf  deejejeej eej f fddZdeejejeej eej f dedeejeej f fddZ  ZS )S3PRLSSLz	s3prl-sslNconfigc                    s   t  | tdsJ ddd lm} | jjt|v s&J d| jj d| jjdks0J dt	|| jj }|
| jj| _d S )Nz	s3prl.hubz8To use s3prl ssl extractors, please install s3prl first.r   zS3PRL dose not suport model: .r   ?All the upstream models in S3PRL now only support 16 kHz audio.)super__init__r   	s3prl.hubhubr-   r   dirr   getattrtor   )r   r-   r3   r   	__class__r   r   r1   ,   s"   zS3PRLSSL.__init__r   c                 C      | j jS r   )r-   r   r   r   r   r   r   =      zS3PRLSSL.frame_shiftc                 C   r9   r   )r-   r   r   r   r   r   r   A   r:   zS3PRLSSL.sampling_rater   c                 C   s   |dksJ d| j jS )Nr   r/   )r-   r   )r   r   r   r   r   r   E   s   
zS3PRLSSL.feature_dimfeatsnum_samplesc                 C   s`   |j \}}t|| j| jd}t|| }|dksJ |dkr.td|g}tj||gdd}|S )N)r<   r   r      r   axis)shaper   r   r   absnpzerosconcatenate)r   r;   r<   
num_framesnum_featuresexpected_num_framesnum_frames_diffpadr   r   r   fix_off_by_one_errorK   s   
zS3PRLSSL.fix_off_by_one_errorsampleslengthsc                 C   s*   |d urdd t ||D }| j||dS )Nc                 S   s   g | ]
\}}|d | qS r   r   ).0xlr   r   r   
<listcomp>e       z*S3PRLSSL.extract_batch.<locals>.<listcomp>)rK   r   )zipextract)r   rK   r   rL   r   r   r   extract_batch[   s   	zS3PRLSSL.extract_batchc                    s  |dksJ dt |t}|rdd |D }n| }|s"|jdkr*dd |D }nt |tjr5t|n|g}fdd|D }dd |D }j	  t
  |d	 jj  W d    n1 sgw   Y      jd
kr     |d  |r g  S fdd|D }dd t |D  fddt |D  t fdd dd  D rtj dd  S )Nr   r/   c                 S   s   g | ]}|  qS r   )squeezerM   sr   r   r   rP   x   s    z$S3PRLSSL.extract.<locals>.<listcomp>r=   c                 S   s&   g | ]}t |tjrt|n|qS r   )
isinstancerB   ndarraytorch
from_numpyrV   r   r   r   rP   ~   s    c                    s   g | ]	}|  jjqS r   )r6   r-   r   rV   r   r   r   rP      s    c                 S   s   g | ]}|j d  qS )r   r@   rV   r   r   r   rP      s    hidden_states   r   c                    s    g | ]}t | jj jjqS r   )r   r-   r   r   )rM   r<   r   r   r   rP      s    c                 S   s$   g | ]\}}|d |    qS r   )r   numpyrM   frO   r   r   r   rP      s   $ c                    s   g | ]
\}}  ||qS r   )rJ   r`   r   r   r   rP      rQ   c                 3   s     | ]}|j  d  j kV  qdS )r   Nr\   )rM   item)r;   r   r   	<genexpr>   s    z#S3PRLSSL.extract.<locals>.<genexpr>r>   )rX   listrU   ndimrB   rY   rZ   r[   r   evalno_gradr-   r   r   r_   rJ   rR   allstack)r   rK   r   input_is_listrL   out_lensr   )r;   r   r   rS   h   sH   






zS3PRLSSL.extractr   )r#   r$   r%   namer   config_typer   r   r1   propertyr   r   r'   r   r   rB   rY   rJ   r	   rZ   Tensorr   r   rT   rS   __classcell__r   r   r7   r   r,   '   s>    
r,   )dataclassesr   r   typingr   r   r   r   r   r	   r_   rB   rZ   lhotse.features.baser
   r   lhotse.utilsr   r   r   r   r,   r   r   r   r   <module>   s     