o
    㥵i0                     @   s   d dl mZmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZmZmZ ddefddZeG d	d
 d
ZeddG dd deZeddG dd deZeddG dd deZeddG dd dZeG dd dZdS )    )	dataclassfield)ListLiteralUnionN)IM_END_TOKENMODALITY_TOKENSFishTokenizerF	to_tensorc                 C   sT   t | trd| v rtj| d | d d| d } |r(t | tjr(t|  } | S )N__ndarray__datadtyper   shape)	
isinstancedictnp
frombufferreshapendarraytorch
from_numpycopy)objr
    r   P/home/ubuntu/.local/lib/python3.10/site-packages/fish_speech/content_sequence.pyrestore_ndarray   s
    r   c                   @   s.   e Zd ZU dZed dB ed< dZeed< dS )BasePartN)textvqaudiotypeFcal_loss)__name__
__module____qualname__r!   r   __annotations__r"   boolr   r   r   r   r      s   
 r   T)kw_onlyc                   @   &   e Zd ZU dZejed< dddZdS )VQPartr   codesselfc                 C      d| _ t| jdd| _d S )Nr   Tr
   )r!   r   r+   r,   r   r   r   __post_init__#      zVQPart.__post_init__N)r,   r*   r#   r$   r%   r!   r   Tensorr&   r0   r   r   r   r   r*         
 
r*   c                   @   s@   e Zd ZU dZdZedB ed< dZee	 dB ed< dddZ
dS )TextPartr   Ntokensr,   c                 C   s*   d| _ | jd u r| jd u rtdd S d S )Nr   z&Either text or tokens must be provided)r!   r   r6   
ValueErrorr/   r   r   r   r0   .   s   zTextPart.__post_init__)r,   r5   )r#   r$   r%   r!   r   strr&   r6   listintr0   r   r   r   r   r5   (   s
   
 r5   c                   @   r)   )	AudioPartr    featuresr,   c                 C   r-   )Nr    Tr.   )r!   r   r<   r/   r   r   r   r0   9   r1   zAudioPart.__post_init__N)r,   r;   r2   r   r   r   r   r;   4   r4   r;   c                   @   s   e Zd ZU ejed< ejed< dZejdB ed< dZejdB ed< eej ed< dZ	ejdB ed< eej ed< dZ
ejdB ed	< dZedB ed
< dS )EncodedMessager6   labelsNvq_mask_tokensvq_mask_labelsvq_partsvq_require_lossesaudio_partsaudio_masksmetadata)r#   r$   r%   r   r3   r&   r?   r@   r9   rB   rD   rE   r   r   r   r   r   r=   >   s   
 

r=   c                   @   s4  e Zd ZU dZeedZee ed< dZ	e
d dB ed< dZedB ed< 			ddd deeeB  dB de
d dB dedB fd	d
Z		ddd deeee f dedeeef dB fddZdg fdd dededee def
ddZdd dededeejejejf fddZg dfdd dedee defddZdS ) ContentSequencez
    Flexible sequence of content parts that supports interleaved multimodal format.
    Example format: <|interleave|><|speaker:1|> TEXT AUDIO <|im_end|><|speaker:2|> TEXT AUDIO <|im_end|>
    )default_factorypartsN)r   voice
interleavemodalityrE   r,   c                 C   s,  || _ |pi | _g }|pg D ]?}t|trH|d dkr#td	i |}n%|d dkr1td	i |}n|d dkr?td	i |}n	td|d  || q|| _	| j rt
| j	dkrt| j	d tdu rt| j	d tr| j	d jd ur| j	d jt| j  st| j  }| j	dt|d d S d S d S )
Nr!   r   r    r   Unsupported part type: r   Fr   r   )rK   rE   r   r   r*   r;   r5   r7   appendrH   lenr   
startswithr   insert)r,   rH   rK   rE   fixed_partspartmodality_tokenr   r   r   __init__V   s4   


zContentSequence.__init__Fpart_or_partsadd_endspeakerc                 C   sn   t |ts|gn|}|durd| d}| jt|d | j| |r5| jtt| jd jd dS dS )a.  
        Append a part or list of parts to the sequence.

        Args:
            part_or_parts: A single part or list of parts to add
            add_end: Whether to add the IM_END_TOKEN after these parts
            speaker: Optional speaker identifier (name or ID) to add before the parts
        Nz
<|speaker:z|>rM   )r   r"   )r   r9   rH   rN   r5   extendr   r"   )r,   rV   rW   rX   parts_to_addspeaker_tokenr   r   r   rN   y   s   zContentSequence.appendT	tokenizer	add_shiftignore_loss_tokensreturnc                    s  g }g }g }g }g }g }	g }
 fdd|D }| j D ]}t|tr@|jdu r4|jdus-J  |j}n|j}tj|tjd}n6t|t	rm|j
 tj}tj fdd|d  D tjd}|| ||j n	tdt| || t|t	r|tj|tjd |
tj|tjd n<t|tr|tj|tjd tj|tjd}d|d< d|d	< |
| n|tj|tjd |
tj|tjd |jrt|ts||  q|t|d
 qtj|dd}tj|dd}tj|dd}tj|
dd}
tj|tjd}|}|}|r<|dd	 }|dd }|dd	 }|dd	 }|dd }|
dd	 }
|D ]}|d
krJ|dusLJ d
|||k< q>|jtjtjfv sfJ d|j t|||||||	|
| jd	S )an  
        Encode the sequence parts into tokens for the model.

        Args:
            tokenizer: The tokenizer to use
            add_shift: Whether to shift tokens for next-token prediction
            ignore_loss_tokens: List of token strings to ignore when calculating loss

        Returns:
            EncodedMessage with tensors ready for the model
        c                    s   g | ]}  |qS r   )get_token_id.0ir]   r   r   
<listcomp>   s    z*ContentSequence.encode.<locals>.<listcomp>Nr   c                    s   g | ]} j t|  qS r   )semantic_id_to_token_idr:   itemrb   re   r   r   rf      s    r   rL   FrY   dim   zInvalid dtype: )	r6   r>   rA   r?   r@   rB   rC   rD   rE   )rH   r   r5   r6   r   encoder   tensorr:   r*   r+   clonetorN   r"   r7   r!   	ones_liker'   
zeros_liker;   	full_likecatr   longr=   rE   )r,   r]   r^   r_   
all_tokens
all_labelsrA   vq_masksrB   rC   rD   ignore_loss_token_idsrS   r6   
curr_codes
audio_maskr>   r?   r@   rd   r   re   r   rm      s   











zContentSequence.encodenum_codebooksc           	      C   s  | j |dd}|j}tj|d t|ftjd}||d< |jd u s(t|jdkr9|jd u s4t|jdkr9|d d fS d  }}|jd urht|jdkrh|j}tj|dd}|d |j	 |d|j
f< ||dd |j
f< |jd urt|jdkrtj|jdd}|jd d d f }|||fS )NF)r^   rl   r   r   rj   )rm   r6   r   zerosrO   r:   rA   rC   rt   semantic_begin_idr?   rD   )	r,   r]   r|   encodedr6   valuesrC   rD   rA   r   r   r   encode_for_inference  s$   

z$ContentSequence.encode_for_inferencemerge_semantic_tokensc                    s"  | j |d|d}dddddd d fd	d
fddfdd}d}d}t|j|jD ]M\}}	t| }
|ri|j|
  krK|jkr\n n|du sU||	kr\|d7 }|	}q3|dkri||| d}d}|t| g}|	dkr|| q3| q3|r|dkr||| t	  dS )z
        Visualize the encoded sequence with color-coded tokens.
        Blue/cyan tokens contribute to loss, green tokens do not.
        F)r^   r_   z[94mz[96mz[92mz[32m)bluecyangreen
dark_greenr   c                    s>    d dkr
d nd }t | |  ddd  d7  d S )	N   r   r   r   [0m endrl   printxcolor)blue_idxcolorsr   r   print_in_blueD     z0ContentSequence.visualize.<locals>.print_in_bluec                    s>   d dkr
 d n d }t | |  ddd d7 d S )	Nr   r   r   r   r   r   r   rl   r   r   )r   	green_idxr   r   print_in_greenJ  r   z1ContentSequence.visualize.<locals>.print_in_greenc                    s,   d| d}| dkr| d S  | d S )Nz[<|semantic|>x]ri   r   )r   countval)r   r   r   r   print_semantic_tokenP  s   z7ContentSequence.visualize.<locals>.print_semantic_tokenNrl   ri   )
rm   zipr6   r>   r:   rh   r~   semantic_end_iddecoder   )r,   r]   r_   r   r   r   count_semantic_tokenssemantic_labeltoklabtoken_idr   r   )r   r   r   r   r   r   	visualize,  sD   





zContentSequence.visualize)NNN)FN)r#   r$   r%   __doc__r   r9   rH   r   r&   rK   r   rE   r   rU   r   r   r'   r8   r:   rN   r	   r=   rm   tupler   r3   r   r   r   r   r   r   rF   K   sz   
 

&
$
t
!rF   )F)dataclassesr   r   typingr   r   r   numpyr   r   fish_speech.tokenizerr   r   r	   r'   r   r   r*   r5   r;   r=   rF   r   r   r   r   <module>   s$    
		