o
    ix1                     @   s   d dl mZmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
m	  mZ d dlZd dlmZ d dlmZmZmZ d dlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ G dd de	j edddZ!G dd de!Z"G dd dedddZ#dS )    )OptionalDict)PathN)
transforms)PyTorchModelHubMixinModelHubMixinhf_hub_download)AutoFeatureExtractorHubertModelWav2Vec2BertModel   )CodecEncoder)DistillCodecEncoder)CodecDecoderVocos)SemanticEncoderc                       s   e Zd Zdedef fddZedd Zedddddddd	d
d	dede	e de	e de
de	e de
de
de	e dede
fddZdejeB eB fddZdejeB eB dejfddZdejdejfdd Z  ZS )!NeuCodecsample_rate
hop_lengthc                    s   t    || _|| _tjddd| _| jjjd d  t	d| _
tddd| _t | _t|d| _tdd| _tdd| _d S )Nzfacebook/w2v-bert-2.0Toutput_hidden_states      r      )super__init__r   r   r   from_pretrainedsemantic_modelencoderlayersr	   feature_extractorr   SemanticEncoder_moduler   CodecEncr   	generatornnLinearfc_prior	fc_post_aselfr   r   	__class__ B/home/ubuntu/.local/lib/python3.10/site-packages/neucodec/model.pyr      s   
zNeuCodec.__init__c                 C   s   t |  jS N)next
parametersdevice)r)   r,   r,   r-   r1   ,   s   zNeuCodec.deviceNFcpuT	revision	cache_dirforce_downloadproxiesresume_downloadlocal_files_onlytokenmap_locationstrictmodel_idr4   r5   r6   r7   r8   r9   r:   r;   r<   c       
            s   |dv sJ |dkrddgn|dkrg t |d|||||||d	}t |d|||||||d	}| d	d
}t||	}dd   fdd| D }|j|dd |S )N)neuphonic/neucodecneuphonic/distill-neucodecr>   	fc_post_sSemanticDecoderr?   zpytorch_model.bin	repo_idfilenamer4   r5   r6   r7   r8   r9   r:   	meta.yaml]  i  c                    s   t  fdd|D S )Nc                 3   s    | ]}| v V  qd S r.   r,   ).0isr,   r-   	<genexpr>f   s    z>NeuCodec._from_pretrained.<locals>.<lambda>.<locals>.<genexpr>)any)rJ   lr,   rI   r-   <lambda>f   s    z+NeuCodec._from_pretrained.<locals>.<lambda>c                    s    i | ]\}} |s||qS r,   r,   )rG   kvcontains_listignore_keysr,   r-   
<dictcomp>g   s    z-NeuCodec._from_pretrained.<locals>.<dictcomp>F)r<   )r   torchloaditemsload_state_dict)clsr=   r4   r5   r6   r7   r8   r9   r:   r;   r<   model_kwargs	ckpt_path_model
state_dictr,   rQ   r-   _from_pretrained0   sF   

zNeuCodec._from_pretrainedaudio_or_pathc                 C   s   t |ttfr&t|\}}|dkr%t|d|d}}|d d d f }nt |tjr@|}t	|j
dkr8|}ntd|j
 d|j
d d  }tjj|d|f}|S )N>     zPNeuCodec expects tensor audio input to be of shape [B, 1, T] -- received shape: i@  r   )
isinstancer   str
torchaudiorV   TResamplerU   Tensorlenshape
ValueErrorr$   
functionalpad)r)   r`   ysrpad_for_wavr,   r,   r-   _prepare_audioq   s    
zNeuCodec._prepare_audioreturnc                 C   sJ  |  |}g }t|dD ]}| j||ddf  dddj| j}|| qt	
|}| || j}|dd}| |dd jd dd}| |}|jd	 |jd	 krt|jd	 |jd	 }	|ddddd|	f }|ddddd|	f }t	j||gdd
}
| |
dddd}
| j|
dd\}}}|S )
        Args:
            audio_or_path: torch.Tensor [B, 1, T] | Path | str, input audio

        Returns:
            fsq_codes: torch.Tensor [B, 1, F], 50hz FSQ codes
        r   Nra   ptsampling_ratereturn_tensorsr      r   rc   dimTvq)rr   rangesizer    r2   input_featurestor1   appendrU   vstackr"   	transposer   hidden_statesr!   rk   mincatr&   r#   )r)   r`   ro   all_semantic_featuresrH   semantic_featuresacoustic_embsemantic_outputsemantic_encodedmin_len
concat_embr\   	fsq_codesr,   r,   r-   encode_code   s2   




zNeuCodec.encode_coder   c                 C   sZ   | j j|dd}|dd}| |dddd}| j |ddddd }|S )z
        Args:
            fsq_codes: torch.Tensor [B, 1, F], 50hz FSQ codes

        Returns:
            recon: torch.Tensor [B, 1, T], reconstructed 24kHz audio
        r   ry   Fr|   r   )r#   	quantizerget_output_from_indicesr   r'   )r)   r   fsq_post_embreconr,   r,   r-   decode_code   s
   	zNeuCodec.decode_code)__name__
__module____qualname__intr   propertyr1   classmethodre   r   boolr   r_   rU   ri   r   rr   r   r   __classcell__r,   r,   r*   r-   r      sL    
	
@.r   z%https://github.com/neuphonic/neucodecz
apache-2.0)repo_urllicensec                   @   s<   e Zd ZdedefddZdejeB eB dejfddZ	d	S )
DistillNeuCodecr   r   c                 C   s   t j|  || _|| _tjddd| _td| _	t
ddd| _t | _t|d| _t dd| _t d	d| _t dd| _d S )
Nzntu-spml/distilhubertTr   i   r   r   i   r   i   )r$   Moduler   r   r   r
   r   r   r	   r    r   r!   r   codec_encoderr   r#   r%   r&   fc_sq_priorr'   r(   r,   r,   r-   r      s$   zDistillNeuCodec.__init__r`   rs   c                 C   sR  |  |}g }t|dD ]#}| jt||ddf  ddddj| j	
d}|| qt|}| | || j	}|dd}| |jdd}| |}|jd	 |jd	 krt|jd	 |jd	 }|ddddd|f }|ddddd|f }tj||gdd
}	| |	dddd}	| j|	dd\}
}}
|S )rt   r   N)   r   ra   ru   rv   r   ry   rc   rz   Tr|   )rr   r~   r   r    Frn   r2   input_valuesr   r1   squeezer   rU   r   r   r   r   r   last_hidden_stater!   rk   r   r   r&   r#   )r)   r`   ro   r   rH   r   fsq_embsemantic_targetr   r   r\   r   r,   r,   r-   r      s:   


	


zDistillNeuCodec.encode_codeN)
r   r   r   r   r   rU   ri   r   re   r   r,   r,   r,   r-   r      s    "r   c                   @   s   e Zd Zdd Zedddddddddd	ded	ee d
ee dedee dededee dedefddZ	dd Z
dejdejfddZdS )NeuCodecOnnxDecoderc              
   C   s\   zdd l }W n ty } ztd|d }~ww | }|jj|_|j||d| _d| _d S )Nr   z[Failed to import `onnxruntime`. Install with the following command: pip install onnxruntime)sess_optionsrF   )	onnxruntimeImportErrorSessionOptionsGraphOptimizationLevelORT_ENABLE_ALLgraph_optimization_levelInferenceSessionsessionr   )r)   	onnx_pathr   esor,   r,   r-   r     s   


zNeuCodecOnnxDecoder.__init__NFr2   Tr3   r=   r4   r5   r6   r7   r8   r9   r:   r;   r<   c       
         K   sP   t |d|||||||d	}t |d|||||||d	}| |}|	dkr&td|S )Nz
model.onnxrB   rE   r2   z6The onnx decoder currently only supports CPU runtimes.)r   rl   )rY   r=   r4   r5   r6   r7   r8   r9   r:   r;   r<   rZ   r   r\   r]   r,   r,   r-   r_   $  s4   z$NeuCodecOnnxDecoder._from_pretrainedc                 O   s   t d)NzfThe onnx decoder has no functionality to encode codes, as it only contains the compiled decoder graph.)NotImplementedError)r)   argskwargsr,   r,   r-   r   X  s   zNeuCodecOnnxDecoder.encode_codecodesrs   c                 C   sZ   t |tjs
tdt|jdkr|jd dkrtd| jdd|id tj	}|S )z
        Args:
            fsq_codes: np.array [B, 1, F], 50hz FSQ codes

        Returns:
            recon: np.array [B, 1, T], reconstructed 24kHz audio
        z`Codes` should be an np.array.rb   r   z%`Codes` should be of shape [B, 1, F].Nr   r   )
rd   npndarrayrl   rj   rk   r   runastypefloat32)r)   r   r   r,   r,   r-   r   ]  s   
zNeuCodecOnnxDecoder.decode_code)r   r   r   r   r   re   r   r   r   r_   r   r   r   r   r,   r,   r,   r-   r     sF    	
3r   )$typingr   r   pathlibr   numpyr   rU   torch.nnr$   torch.nn.functionalrm   r   rf   r   rg   huggingface_hubr   r   r   transformersr	   r
   r   r   r   codec_encoder_distillr   codec_decoder_vocosr   moduler   r   r   r   r   r,   r,   r,   r-   <module>   s6    
 7
E
