o
    i57                     @   s   d Z ddlZddlmZmZ G dd dZG dd deZG dd	 d	eZG d
d deZ		dde	de	de
de
de	f
ddZdS )a  
Text Chunking Utility for Long Inputs

Splits long texts into manageable chunks with smart boundary detection.
Supports English, Hindi, Telugu, and mixed-language content.

Why Text-Based Chunking?
- Preserves semantic meaning (sentences, paragraphs)
- Natural boundaries for better prosody
- Simpler than token-based (model handles tokenization internally)
- Works with SNAC token sliding window (separate layer)
    N)ListTuplec                   @   sp   e Zd ZdZddefddZdedee fdd	Zdedefd
dZ		ddededee
eeef  fddZdS )TextChunkerz
    Smart text chunking for long inputs.
    
    Splits text at natural boundaries (paragraphs, sentences, commas, spaces)
    to avoid mid-word/mid-sentence splits.
      max_chunk_lengthc                 C   s
   || _ dS )z
        Initialize text chunker.
        
        Args:
            max_chunk_length: Maximum characters per chunk (default: 1000)
        N)r   )selfr    r   >/home/ubuntu/veenaModal/veena3modal/processing/text_chunker.py__init__   s   
zTextChunker.__init__textreturnc                 C   s   t || jkr
|gS g }|}t || jkrB|d| j }| |}|dkr(| j}||d|   ||d  }t || jks|rI|| |S )a3  
        Split text into chunks at natural boundaries.
        
        Priority:
        1. Paragraph breaks (\n\n)
        2. Sentence ends (. ! ?)
        3. Commas
        4. Spaces
        
        Args:
            text: Text to chunk
        
        Returns:
            List of text chunks
        Nr   )lenr   _find_split_pointappendstrip)r   r   chunks	remainingchunksplit_pointr   r   r	   
chunk_text#   s   

zTextChunker.chunk_textc                 C   sz   t d|}|r| S d}tt ||}|r|d  S tt d|}|r.|d  S |d}|dkr;|d S dS )z
        Find the best split point in text.
        
        Returns index of split point (0 if none found).
        
        Enhanced to handle:
        - Abbreviations (Dr., Mr., Mrs., etc.)
        - Decimals (3.14)
        - Ellipsis (...)
        \n\nz)[.!?](?!\.\.)(?![0-9])(?![A-Z][a-z]\.)\s+,\s+ r      researchendlistfinditerrfind)r   r   
para_matchsentence_patternsentence_matchescomma_matchesspace_matchr   r   r	   r   L   s   
zTextChunker._find_split_point2   overlap_charsc                 C   s   t || jkr|dt |fgS g }d}|t |k r^t|| j t |}||| }|t |k rB| |}|dkrB|| }||| }|| ||f |t |k rV|| n|}|t |k s|S )a  
        Chunk text with overlap for crossfade stitching.
        
        Args:
            text: Text to chunk
            overlap_chars: Number of characters to overlap (default: 50)
        
        Returns:
            List of (chunk_text, start_idx, end_idx) tuples
        r   )r   r   minr   r   r   )r   r   r(   r   	start_idxend_idxr   r   r   r   r	   chunk_with_overlapr   s    
zTextChunker.chunk_with_overlapN)r   )r'   )__name__
__module____qualname____doc__intr
   strr   r   r   r   r,   r   r   r   r	   r      s    	))r   c                       s*   e Zd ZdZdedef fddZ  ZS )SentenceBoundaryChunkerz
    Enhanced chunker that prioritizes sentence boundaries.
    
    Better handles abbreviations, decimals, and edge cases.
    r   r   c                    s   d}t t||}g }|D ]1}| }|td|d |d   t fdddD r-qd v s9d	 v s9d
 v r:q|| q|rH|d  S t 	|S )z
        Find split point prioritizing sentence boundaries.
        
        Enhanced detection to avoid splitting on:
        - Common abbreviations (Dr., Mr., Mrs., Prof., etc.)
        - Decimal numbers (3.14)
        - Ellipsis (...)
        [.!?](?!\.\.)(?![0-9])\s+r      r   c                 3   s    | ]}| v V  qd S )Nr   ).0abbrlookbackr   r	   	<genexpr>   s    z<SentenceBoundaryChunker._find_split_point.<locals>.<genexpr>)zDr.zMr.zMrs.zMs.zProf.zSr.zJr.ze.g.zi.e.zetc.r   )
r   r   r    startmaxanyr   r   superr   )r   r   r#   r$   valid_splitsmatchpos	__class__r8   r	   r      s   z)SentenceBoundaryChunker._find_split_point)r-   r.   r/   r0   r2   r1   r   __classcell__r   r   rB   r	   r3      s    r3   c                   @   s4   e Zd ZdZdedefddZdedefddZdS )	IndicSentenceChunkeru  
    Enhanced chunker with Indic language support (Hindi, Telugu, multilingual).
    
    Handles:
    - Hindi: Devanagari danda (।), double danda (॥)
    - Telugu: Sentence terminators (same Unicode as Hindi)
    - English: Standard punctuation (. ! ?)
    - Mixed language content
    
    Why this matters:
    - Indic languages use different punctuation (। instead of .)
    - Better prosody when splitting at natural language boundaries
    - Preserves meaning across languages
    r   r   c                 C   s   t d|}|r| S t d|}|r| S t d|}|r$| S tt d|}|r4|d  S |d}|dkrA|d S dS )	u   
        Find split point with Indic language awareness.
        
        Priority:
        1. Paragraph breaks (

)
        2. Indic sentence markers (। ॥)
        3. English sentence markers (. ! ?)
        4. Commas
        5. Spaces
        r   u   [।॥]\s*r4   r   r   r   r   r   r   )r   r   r"   indic_sentenceenglish_sentencer%   r&   r   r   r	   r      s&   
z&IndicSentenceChunker._find_split_pointc           
      C   s   t td|}t td|}t td|}|| | }|dkr)dddddS || d }|| d }|| d }|t||krEd}	n|t||krOd	}	n|t||krYd
}	nd}	|	t|dt|dt|ddS )z
        Detect language composition in text.
        
        Returns:
            dict with language percentages and primary language
        z[\u0900-\u097F]z[\u0C00-\u0C7F]z[a-zA-Z]r   unknown)primaryhinditeluguenglishd   rJ   rK   rL   mixedr   )r   r   findallr<   round)
r   r   devanagari_charstelugu_charsenglish_charstotal_chars	hindi_pct
telugu_pctenglish_pctrI   r   r   r	   detect_language_mix  s*   z(IndicSentenceChunker.detect_language_mixN)	r-   r.   r/   r0   r2   r1   r   dictrX   r   r   r   r	   rE      s    /rE   c                       s.   e Zd ZdZdedee f fddZ  ZS )ParagraphChunkerz
    Chunker that splits primarily on paragraph boundaries.
    
    Useful for document-style text with clear paragraph structure.
    r   r   c                    s   t || jkr
|gS td|}g }d}|D ];}| }|sq|r'|d | n|}t || jkr3|}q|r:|| t || jkrOt |}|| d}q|}q|rY|| |r]|S |gS )z#Split text on paragraph boundaries.z\n\n+ z

)	r   r   r   splitr   r   r>   r   extend)r   r   
paragraphsr   current_chunkpara
test_chunkpara_chunksrB   r   r	   r   A  s,   


zParagraphChunker.chunk_text)r-   r.   r/   r0   r2   r   r   rD   r   r   rB   r	   rZ   :  s    "rZ     ]  audio1audio2crossfade_samplessample_rater   c                 C   s   ddl }|j| |jd}|j||jd}|dkr| | S t||k s(t||k r,| | S |dd|}|dd|}|| d }	|d| }
|	| |
|  |j}||d|  |||d g}| S )a'  
    Crossfade two audio segments for seamless stitching.
    
    Uses linear crossfade with smooth amplitude transition to avoid
    pops, clicks, or audible artifacts at chunk boundaries.
    
    Args:
        audio1: First audio segment (int16 PCM)
        audio2: Second audio segment (int16 PCM)
        crossfade_samples: Number of samples to crossfade (default: 1200 = 50ms @ 24kHz)
        sample_rate: Sample rate (default: 24000 Hz)
    
    Returns:
        Stitched audio with crossfade
        
    Example:
        >>> audio1 = b'\x00\x01' * 24000  # 1 second
        >>> audio2 = b'\x00\x01' * 24000  # 1 second
        >>> result = crossfade_audio(audio1, audio2, crossfade_samples=1200)
        >>> len(result) < len(audio1) + len(audio2)  # Overlap reduces total length
        True
    r   N)dtypeg      ?g        )numpy
frombufferint16r   linspaceastypeconcatenatetobytes)re   rf   rg   rh   nparr1arr2fade_outfade_intailhead
crossfadedresultr   r   r	   crossfade_audiol  s$   
rz   )rc   rd   )r0   r   typingr   r   r   r3   rE   rZ   bytesr1   rz   r   r   r   r	   <module>   s*     3j5