o
    i*                     @   s   d dl mZ d dlmZ d dlmZ d dlmZmZ ddl	Z	ddl
Z
ddlmZ ddlmZ dd	lmZ ddlZddlZG d
d dZdS )   )SopranoDecoder)
clean_text)split_and_recombine_text)select_deviceselect_backend    N)	unidecode)wavfile)hf_hub_downloadc                   @   sp   e Zd ZdZ					dddZdd	d
Zdd Z					dddZ					dddZ				dddZ	dS )
SopranoTTSa  
    Soprano Text-to-Speech model.
    
    Args:
        backend: Backend to use for inference. Options:
            - 'auto' (default): Automatically select best backend. Tries lmdeploy first (fastest),
                               falls back to transformers. CPU always uses transformers.
            - 'lmdeploy': Force use of LMDeploy (fastest, CUDA only)
            - 'transformers': Force use of HuggingFace Transformers (slower, all devices)
        device: Device to run inference on ('auto', 'cuda', 'cpu', 'mps')
        cache_size_mb: Cache size in MB for lmdeploy backend
        decoder_batch_size: Batch size for decoder
    autod   r   Nc           	      C   s   t |d}t||d}|dkrddlm} ||||d| _n|dkr/ddlm} |||d	| _|| _|| _t	 
|| _|rFtj|d
}ntdd
d}| jtj||d || _d| _d| _| d d S )Ndevice)backendr   lmdeployr   )LMDeployModel)r   cache_size_mb
model_pathtransformers)TransformersModel)r   r   zdecoder.pthzekwek/Soprano-1.1-80M)repo_idfilename)map_location   i   zHello world!)r   r   backends.lmdeployr   pipelinebackends.transformersr   r   r   r   todecoderospathjoinr
   load_state_dicttorchloaddecoder_batch_sizeRECEPTIVE_FIELD
TOKEN_SIZEinfer)	selfr   r   r   r&   r   r   r   decoder_path r,   ?/home/ubuntu/.local/lib/python3.10/site-packages/soprano/tts.py__init__   s&   
zSopranoTTS.__init__   c              	   C   s  g }t |D ]\}}| }t|}t|}g }|D ]
}	||	|d q|dkrt|dkrg }
d}|t|k r|| }t|d |k r|
rZ|
d d d |d   |
d d< n*|d t|k ry|d d ||d  d   ||d  d< n|
| n|
| |d7 }|t|k s9|
}i }|D ],}|d |vrd||d < |d|d  d	|d ||d  f ||d   d7  < qq|S )
z
        adds prompt format and sentence/part index
        Enforces a minimum sentence length by merging short sentences.
        )texttext_idxr   r   r0    r1   z[STOP][TEXT]z[START])	enumeratestripr   r   appendlen)r*   texts
min_lengthresr1   r0   cleaned_text	sentences	processedsentencemergedicursentence_idxesitemr,   r,   r-   _preprocess_text:   s>   
*>
	(zSopranoTTS._preprocess_textc           
      C   s   d}d}t ||krdS d}tt |d D ]1}|| }||d  }t|| }|jdd}	|	|k r8|d7 }n|dkr@|d8 }||krG dS qdS )zP
        Analyzes hidden states to find long runs of similar sequences.
        i,     Fr   r   dimT)r7   ranger$   abssum)
r*   hidden_stateDIFF_THRESHOLDMAX_RUNLENGTHaah_runlengthr@   current_sequencesnext_sequencesdiffs
total_diffr,   r,   r-   hallucination_detector_   s$   
z!SopranoTTS.hallucination_detectorffffff?        333333?r   c                 C   s:   | j |g|||d |dd }|rt|d|   |S )N)top_ptemperaturerepetition_penaltyout_dirretriesr    }  )infer_batchr	   writecpunumpy)r*   r0   out_pathrW   rX   rY   r[   resultsr,   r,   r-   r)   u   s   zSopranoTTS.inferc                    s&  |  |}ttdd | d gt  }ttdt }	dtd| }
|
dkr|	r fdd|	D }| jj||||d}g }t|D ],\}}|d }|||	| < |d	 d
kr\t	d |dkrp| 
|rpt	d ||	|  qD|stn|}	|
d8 }
|
dkrt	dt|	 d |
dkr|	s+tt||}|jdd d t| \}}t|}dd t|D }|D ]}||d  d  qtdt|| jD ]}g }ttdd |||| j  }t|}t|D ]0}|tjtjdd|d ||  f| jd|||  ddd| jtjgdd qt|}t  | |}W d    n	1 s.w   Y  t|D ]*}|||  d }|||  d }||  || | j | j  d  || |< q7qdd |D }|rtj|dd tt|D ]}t| d| dd||     qz|S )Nc                 S   s   | d S Nr   r,   xr,   r,   r-   <lambda>   s    z(SopranoTTS.infer_batch.<locals>.<lambda>r   r   c                    s   g | ]} | qS r,   r,   ).0r@   promptsr,   r-   
<listcomp>   s    z*SopranoTTS.infer_batch.<locals>.<listcomp>rW   rX   rY   rK   finish_reasonstopzMWarning: A sentence did not complete generation, likely due to hallucination.z.Warning: A sentence contained a hallucination.z	Warning: z! sentence(s) will be regenerated.c                 S   s   | d  d S rc   sizerd   r,   r,   r-   rf          )keyc                 S   s   g | ]}g qS r,   r,   )rg   _r,   r,   r-   rj      rp   c                 S   s
   |  dS rc   rn   rd   r,   r,   r-   rf      s   
 i   r      rF   c                 S   s   g | ]	}t | qS r,   )r$   catr_   )rg   re   r,   r,   r-   rj      s    T)exist_ok/z.wavr\   )!rD   listmapr7   rH   maxr   r)   r4   printrS   r6   zipsortr&   r$   rt   zerosr   	unsqueeze	transposer   float32no_gradr   squeezer(   r    makedirsr	   r^   r_   r`   )r*   r8   rZ   rW   rX   rY   r[   sentence_datahidden_statespending_indices
tries_leftcurrent_prompts	responsesbad_indicesidxresponserK   combined	num_textsaudio_concatr>   batch_hidden_stateslengthsNr@   audiotext_idsentence_idr,   rh   r-   r]      s|   
  (


2*zSopranoTTS.infer_batchc              
   c   s   t   }| |g}d}|D ]\}	}
}
| jj|	|||d}g }|}|D ]}|d d u}|s7||d d  |d| j |  d  }|sNt|| j| kr|sT||krt|}|	d
dd| jtj}t  | |d }W d    n1 sw   Y  |r|| j| d | j | j  d  }n|| j| | j | j  | j| j | j   }d}|rtd	d
t   |  dd d}| V  |d7 }q$qd S )NTrk   rl   rK   r2   rs   r   r   zStreaming latency: i  z.2fz msF)timerD   r   stream_inferr6   r'   r7   r$   stackr~   r   r   r   r   r   r   r(   rz   r_   )r*   r0   
chunk_sizerW   rX   rY   
start_timer   first_chunkr>   rr   r   hidden_states_bufferchunk_countertokenfinishedr   inpr   audio_chunkr,   r,   r-   infer_stream   sF   
"
&0
zSopranoTTS.infer_stream)r   r   r   r   N)r/   )NrT   rU   rV   r   )r   rT   rU   rV   )
__name__
__module____qualname____doc__r.   rD   rS   r)   r]   r   r,   r,   r,   r-   r      s6    

%

Er   )vocos.decoderr   utils.text_normalizerr   utils.text_splitterr   utils.auto_selectr   r   r$   rer   scipy.ior	   huggingface_hubr
   r    r   r   r,   r,   r,   r-   <module>   s    