o
    㥵i{                  	   @   s
  d dl Z d dlZd dlZd dlmZ d dl mZ d dlmZ d dlm	Z	 d dl
Z
d dlmZ d dlmZ d dlmZ d dl
mZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZm Z  de!de!de!fddZ"eG dd dZ#eG dd de#Z$eG dd de#Z%G dd dej&Z'eG dd dZ(eG dd dZ)G d d! d!ej&Z*G d"d# d#e*Z+G d$d% d%e*Z,G d&d' d'ej&Z-G d(d) d)ej&Z.G d*d+ d+ej&Z/G d,d- d-ej&Z0d8d/e!d0e!d1e!defd2d3Z1d4ed5edefd6d7Z2dS )9    N)OrderedDict)	dataclass)Path)Optional)	rearrange)logger)Tensor)
functional)
SDPBackendsdpa_kernel)
checkpoint)AutoTokenizer)
LoraConfig
setup_lora)SEMANTIC_TOKENSFishTokenizernkreturnc                 C   s    | | dkr| S | | | |  S )Nr    )r   r   r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/fish_speech/models/text2semantic/llama.pyfind_multiple   s   r   c                   @   s>  e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d Zeed!< dZeed"< d#Zeed$< dZeed%< dZeed&< d'd( Zed)efd*d+Z d)efd,d-Z!d
S ).BaseModelArgsbase
model_typei }  
vocab_size    n_layern_headi   dimNintermediate_sizen_local_heads@   head_dim'  	rope_baseh㈵>norm_epsi   max_seq_len        dropoutTtie_word_embeddingsFattention_qkv_biasattention_o_biasattention_qk_norm   codebook_size   num_codebooksuse_gradient_checkpointingg{Gz?initializer_rangeis_reward_modelscale_codebook_embeddingsc                 C   sb   | j dkr	| j| _ | jd u r!d| j }td| d }t|d| _| jd u r/| j| j | _d S d S )Nr!   r2            )r"   r   r    r   intr   r$   )self
hidden_dimn_hiddenr   r   r   __post_init__?   s   



zBaseModelArgs.__post_init__pathc                 C   s   t | } |  r| d } t| ddd}t|}W d    n1 s#w   Y  |d  dkr3 t}ndkr9t}n
	 td|d  |d	i |S )
Nconfig.jsonrzutf-8)encodingr   naivedual_arUnknown model type: r   )r   is_diropenjsonloadNaiveModelArgsDualARModelArgs
ValueError)r@   fdataclsr   r   r   from_pretrainedI   s   
zBaseModelArgs.from_pretrainedc                 C   sF   t |d}tj| j|dddd W d    d S 1 sw   Y  d S )Nwr2   TF)indent	sort_keysensure_ascii)rH   rI   dump__dict__)r<   r@   rN   r   r   r   save]   s   "zBaseModelArgs.save)"__name__
__module____qualname__r   str__annotations__r   r;   r   r   r   r    r"   r$   r&   floatr(   r)   r+   r,   boolr-   r.   r/   r1   r3   r4   r5   r6   r7   r?   staticmethodrQ   rX   r   r   r   r   r      s6   
 
r   c                   @   s   e Zd ZU dZeed< dS )rK   rD   r   N)rY   rZ   r[   r   r\   r]   r   r   r   r   rK   b   s   
 rK   c                       s   e Zd ZU dZeed< dZeed< dZedB ed< dZ	edB ed< dZ
edB ed< dZedB ed	< dZedB ed
< dZedB ed< dZedB ed< dZedB ed<  fddZ  ZS )rL   rE   r   r2   n_fast_layerNfast_dimfast_n_headfast_n_local_headsfast_head_dimfast_intermediate_sizefast_attention_qkv_biasfast_attention_qk_normfast_attention_o_biasc                    s   t    | jp
| j| _| jp| j| _| jp| j| _| jp| j	| _| j
p&| j| _
| jd ur0| jn| j| _| jd ur<| jn| j| _| jd urK| j| _d S | j| _d S N)superr?   rb   r   rc   r   rd   r"   re   r$   rf   r    rg   r-   rh   r/   ri   r.   )r<   	__class__r   r   r?   t   s(   




zDualARModelArgs.__post_init__)rY   rZ   r[   r   r\   r]   ra   r;   rb   rc   rd   re   rf   rg   r_   rh   ri   r?   __classcell__r   r   rl   r   rL   g   s   
 rL   c                       s*   e Zd Zejf fdd	Zdd Z  ZS )KVCachec                    sF   t    ||||f}| dtj||d | dtj||d d S )Nk_cachedtypev_cache)rk   __init__register_buffertorchzeros)r<   max_batch_sizer)   n_headsr$   rr   cache_shaperl   r   r   rt      s   
zKVCache.__init__c                 C   sX   |j d |j d ksJ | j}| j}||d d d d |f< ||d d d d |f< ||fS )Nr   r8   )shaperp   rs   )r<   	input_posk_valv_valk_outv_outr   r   r   update   s   zKVCache.update)rY   rZ   r[   rv   bfloat16rt   r   rn   r   r   rl   r   ro      s    ro   c                   @      e Zd ZU eed< eed< dS )TransformerForwardResulttoken_logitscodebook_logitsNrY   rZ   r[   r   r]   r   r   r   r   r         
 r   c                   @   r   )BaseTransformerForwardResultlogitshidden_statesNr   r   r   r   r   r      r   r   c                       s  e Zd Z	d'dedededdf fddZejfd	e	d
e	dej
fddZdedefddZ	d(dedee defddZ				d)dedee dee dee dedefddZdd Ze				d*dedede	dB d edB d!e	dB dd fd"d#Zd+ded$efd%d&Z  ZS ),BaseTransformerTconfig	tokenizerinit_weightsr   Nc              	      s  t     | _|| _t|j | _t	 j
 j| _t	 j j  j| _t fddt jD | _t j jd| _| jjdu rRtj j j
dd| _| jdt j j jdd | jdt tj! j jtj"d	dd d
| _#d
| _|r| $| j% d S d S )Nc                 3       | ]	}t  d dV  qdS )Tuse_sdpaNTransformerBlock.0_r   r   r   	<genexpr>   s    
z+BaseTransformer.__init__.<locals>.<genexpr>epsFbias	freqs_cis
persistentcausal_maskrq   r!   )&rk   rt   r   r   listsemantic_id_to_token_idvaluessemantic_token_idsnn	Embeddingr   r   
embeddingsr1   r3   codebook_embeddings
ModuleListranger   layersRMSNormr(   normr,   Linearoutputru   precompute_freqs_cisr)   r$   r&   rv   trilonesr_   rx   apply_init_weights)r<   r   r   r   rl   r   r   rt      s^   


	zBaseTransformer.__init__rx   r)   rr   c                 C   s\   | j |kr| j|krd S t|d}|| _ || _| jD ]}t||| jj| jj|d|j_	qd S )N   rq   )
r)   rx   r   r   ro   r   r"   r$   	attentionkv_cacher<   rx   r)   rr   br   r   r   setup_caches   s   

zBaseTransformer.setup_cachesinpc                 C   s   g }t j| j|j|jd}t| jjD ]}| |d d |d f || jj	  }|
| qt j|ddjdd}d|t |d d df | < | |d d df | }|S )Ndevicerr      r   r   )rv   tensorr   r   rr   r   r   r3   r   r1   appendstacksumisinr   )r<   r   embedssemantic_token_ids_tensoriembvq_embeds_sumxr   r   r   embed   s   zBaseTransformer.embedkey_padding_maskc                 C   s   | d}| |}| jd | }d }|d ur4| jd |d |f }t|d}t|d}| }||@ }| jD ]}	| jjrJ| j	rJt
|	|||dd}q7|	|||}q7| |}
| jjrct|
| jj}n| |
}t||dS )Nr8   zq k -> 1 1 q kzb s -> b 1 1 sTuse_reentrantr   r   )sizer   r   r   r   logical_notr   r   r4   trainingr   r   r,   Flinearr   weightr   r   )r<   r   r   seq_lenr   r   maskcausal
atten_masklayerslow_outr   r   r   r   forward  s,   






zBaseTransformer.forwardFr|   audio_masksaudio_parts
return_allc                 C   s  g }t | jjD ]}| |d d |d f || jj  }|| qtj|ddjdd}	|d d df | j	j
k|d d df | j	jk@ }
d|	|
 < | |d d df |	 }| jjrt|
d|}t||t| jjd  |}|d ur| |}| jjr|td ||< n|||< |d u rtj|jd |jd}|jd }n| j}| jd d |d |f }| j| }| jD ]
}|||||d}q|ddkr|s|d d dd f }| |}| jjr| |}n| jjrt !|| jj"}n| #|}t$||dS )	Nr   r   r   r!   r8   r   r|   r   )%r   r   r3   r   r1   r   rv   r   r   r   semantic_begin_idsemantic_end_idr   r7   	unsqueeze	expand_aswheremathsqrtaudio_projectoraranger{   r   r)   r   r   r   r   r   r6   score_outputr,   r   r   r   r   r   )r<   r   r|   r   r   r   r   r   r   r   vq_masksr   vq_masks_expandedaudio_embedsr)   r   r   r   r   r   r   r   r   forward_generate?  sT   





z BaseTransformer.forward_generatec                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rA|jjjd|d |jd urC|jj|j 	  d S d S d S )Nr*   )meanstd)r   r5   
isinstancer   r   r   rO   normal_r   zero_r   padding_idx)r<   moduler   r   r   r   r     s   

zBaseTransformer._init_weightsr@   load_weights
max_lengthlora_configr&   c              
   C   s  t t| }|d ur||_td|  |d ur%||_td|  |j dkr/ t}ndkr5t	}n		 t
d|j t| }td|  d|  |||d}|d uret|| td	|  |d
u rptd |S dtt| v rtd ddlm}	 |	|}
|
 }dtt| v rtd | jd}|d dsJ t|d dd  }ddlm} |||}
|
 }tjt| d dddd}d|v rtd |d }tt| drtd t }| D ]\}}|||dd< q|}t| D ]}d |v r| | q|! D ]/\}}||vr,td!|  q|j"|| j"krGtd"| d#|j" d$|| j"  q|j#|d
dd%}td&|  |S )'NzOverride max_seq_len to zOverride rope_base to rD   rE   rF   zLoading model from z
, config: )r   zLoRA setup: FzRandomly initialized modelint8z$Using int8 weight-only quantization!r   )WeightOnlyInt8QuantHandlerint4zUsing int4 quantization!-gr   )WeightOnlyInt4QuantHandler	model.pthcpuT)map_locationmmapweights_only
state_dictziUsing a TextToSemantic LightningModule checkpoint, please make sure it is a full model, not a LoRA model.zmodel.zJRemove prefix 'model.' created by TextToSemantic LightningModule from keys audio_zNo weight for zShape mismatch for z: z vs )strictassignzLoaded weights with error: )$r   rQ   r\   r)   r   infor&   r   NaiveTransformerDualARTransformerrM   r   r   r   tools.llama.quantizer   convert_for_runtimenamesplit
startswithr;   r  rv   rJ   warningnextiterkeysr   itemsreplacer   popnamed_parametersr{   load_state_dict)r@   r   r   r   r&   r   	model_clsr   modelr   simple_quantizer
path_comps	groupsizer  weightsnew_weightsr   verrr   r   r   rQ     s   



<






zBaseTransformer.from_pretrained	drop_lorac                 C   s   t |}|jddd | j|d  |  }|r4t| D ]}d|vr&q|| t	d|  qt
||d  | j| d S )NT)parentsexist_okrA   lorazDrop LoRA parameter: r  )r   mkdirr   rX   r  r   r  r  r   r  rv   r   save_pretrained)r<   r@   r&  r  keyr   r   r   r+    s   
zBaseTransformer.save_pretrainedTrj   )NNNF)FNNN)F)rY   rZ   r[   r   r   r_   rt   rv   r   r;   rr   r   r   r   r   r   r   r   r   r`   r\   r   rQ   r+  rn   r   r   rl   r   r      s    =

0
K`r   c                       s   e Zd Zdededdf fddZdedefdd	Z	dd
e	de
e	 def fddZ	dde	de
e	 def fddZ  ZS )r  r   r   r   Nc                    sP   t  j|d|d t|j|jd| _tj|j|j|j	 dd| _
| | j d S )NFr   r   r   r   )rk   rt   r   r   r(   codebook_normr   r   r1   r3   codebook_outputr   r   r<   r   r   rl   r   r   rt   
  s   
zNaiveTransformer.__init__resultc                 C   s:   |j }|j}| | |}t|d| jjd}t||dS )Nzb n (c d) -> b n c d)cr   r   )r   r   r0  r/  r   r   r3   r   )r<   r2  r   r   r   r   r   r   decode  s   
zNaiveTransformer.decoder   r   c                    s   t  j||d}| |S )N)r   r   )rk   r   r5  )r<   r   r   r2  rl   r   r   r   %  s
   
zNaiveTransformer.forwardr   r|   c                    s   t  ||}| |S rj   )rk   r   r5  )r<   r   r|   r2  rl   r   r   r   0  s   
z!NaiveTransformer.forward_generaterj   )rY   rZ   r[   rK   r   rt   r   r   r5  r   r   r   r   rn   r   r   rl   r   r  	  s&    r  c                       s   e Zd Zdededdf fddZejfdeded	ej	f fd
dZ
							ddedee dee dee dee dee dee dee def fddZ	ddedee defddZ			d dedee dee dee def
 fddZ  ZS )!r  r   r   r   Nc                    s   t  j|d|d |jd ur|j|jkrt|j|j| _nt | _t|j	|j| _
tj||j|j|j|j|j|j|j|jd	 t fddt|jD | _t|j|jd| _tj|j|j	dd| _| jdt|j|j|jdd	 |  | j! d S )
NFr.  )r   r   r"   r$   r    r-   r/   r.   c                 3   r   )Fr   Nr   r   override_configr   r   r   Q  s
    

z-DualARTransformer.__init__.<locals>.<genexpr>r   r   fast_freqs_cisr   )"rk   rt   rb   r   r   r   fast_project_inIdentityr   r1   fast_embeddingsdataclassesr  rc   rd   re   rf   rg   rh   ri   r   r   ra   fast_layersr   r(   	fast_normfast_outputru   r   r3   r&   r   r   r1  rl   r6  r   rt   8  sF   

	zDualARTransformer.__init__rx   r)   rr   c                    sB   t  ||| | jD ]}t|| jj| jj| jj|d|j_	qd S )Nrq   )
rk   r   r=  ro   r   r3   rd   re   r   r   r   rl   r   r   r   g  s   
zDualARTransformer.setup_cachesr   labelsr   vq_partsr   vq_require_losses	mel_parts	mel_masksc	                    sv  t  j||||||d}	|	j}
|	j}| jj}| jd d d |d |f }| jd | }|| jk}|| }|j	d dkr\t
jd| jjf|j|jd}t
j|j	d | jjd f|jt
jd}n|dd df | ||  }| |}| |}t
j|d d d f |gdd}| jD ]}| jjr| jrt||||d	d
}q||||}q| |}| |}|j	d | jjksJ t|
|dS )N)r   r   rA  r   rC  rD  r   r2   r   r   .r!   r   Tr   r4  )rk   r   r   r   r   r3   r   r8  semantic_token_idr{   rv   rw   r   r   rr   r;   r9  r;  catr=  r4   r   r   r>  r?  r   )r<   r   r@  r   rA  r   rB  rC  rD  parent_resultr   r   fast_seq_len	fast_maskr8  codebook_mask	codebooksr   r   fast_outr   rl   r   r   r   w  sZ   






zDualARTransformer.forwardr   r|   c                 C   sl   | |jd dd}| jd d |d | jjf }| j| }| jD ]
}|||||d}q| |}| |}|S )Nr   r   r!   r   )	viewr{   r   r   r3   r8  r=  r>  r?  )r<   r   r|   rI  r8  r   rL  r   r   r   r   forward_generate_fast  s   



z'DualARTransformer.forward_generate_fastr   r   c                    s$   t  ||||}| |j|_|S rj   )rk   r   r9  r   )r<   r   r|   r   r   rl   r   r   r     s   z"DualARTransformer.forward_generate)NNNNNNNrj   )NNN)rY   rZ   r[   rK   r   rt   rv   r   r;   rr   r   r   r   r   r   rN  r   rn   r   r   rl   r   r  7  st    0	
I
r  c                       sN   e Zd Zddededdf fddZ	dded	ed
ededef
ddZ  ZS )r   Tr   r   r   Nc                    sF   t    t||d| _t|| _t|j|j| _	t|j|j| _
d S )Nr   )rk   rt   	Attentionr   FeedForwardfeed_forwardr   r   r(   ffn_normattention_norm)r<   r   r   rl   r   r   rt     s
   

zTransformerBlock.__init__r   r   r   r|   c                 C   s2   ||  | |||| }|| | | }|S rj   )r   rS  rQ  rR  )r<   r   r   r   r|   houtr   r   r   r     s   zTransformerBlock.forwardr-  rj   )	rY   rZ   r[   r   r_   rt   r   r   rn   r   r   rl   r   r     s    r   c                       sl   e Zd Zddedef fddZdd Z	dd	ed
ededee def
ddZ			dde
jfddZ  ZS )rO  Tr   r   c                    s   t    |j|j dksJ |jd|j  |j }tj|j||jd| _	tj|j|j |j|j
d| _d | _|jrLt|j|j| _t|j|j| _|j| _|j| _|j| _|j| _|j| _|| _|j| _|| _| | j d S )Nr   r8   r   )rk   rt   r   r   r"   r$   r   r   r-   wqkvr.   wor   r/   r   r(   q_normk_normr+   r   r   "_register_load_state_dict_pre_hook	load_hook)r<   r   r   total_head_dimrl   r   r   rt     s,   

zAttention.__init__c                 G   sV   |d |v r)| |d }| |d }| |d }t|||g||d < d S d S )Nz	wq.weightz	wk.weightz	wv.weightzwqkv.weight)r  rv   rF  )r<   r  prefixargswqwkwvr   r   r   r[    s   zAttention.load_hookNr   r   r   r|   r   c                 C   s  |j \}}}| j| j }| j| j }	| |j||	|	gdd\}
}}|
||| j| j}
|||| j| j}|||| j| j}| jrM| |
}
| 	|}t
|
|}
t
||}tdd |
||f\}
}}| jd urs| j|||\}}|j| j| j dd}|j| j| j dd}| jr|d u rttj tj|
||| jr| jnddd}W d    n1 sw   Y  n!tj|
|||| jr| jndd	}n| j|
|||| jr| jndd	}|dd
 |||}| |S )Nr!   r   c                 S   s   |  ddS )Nr   r8   )	transpose)r   r   r   r   <lambda>,  s    z#Attention.forward.<locals>.<lambda>r   r*   T)	dropout_p	is_causal)	attn_maskrd  r8   )r{   r   r$   r"   rV  r  rM  r/   rX  rY  apply_rotary_embmapr   r   repeat_interleaver   r   r
   FLASH_ATTENTIONr   scaled_dot_product_attentionr   r+   eq_scaled_dot_product_attentionrb  
contiguousrW  )r<   r   r   r   r|   bszseqlenr   q_sizekv_sizeqr   r$  yr   r   r   r     sZ    






zAttention.forwardr*   c                 C   s   | d| d}}dt| d }tjdd|||j|jd}	|d ur;|jtjkr7|	|	 t
d n|	|7 }	||dd | }
|
|	7 }
tj|
dd}
tj|
|dd}
|
| S )	Nr   r   r!   )rr   r   z-infr   T)train)r   r   r   rv   rw   rr   r   r_   masked_fill_r   r^   rb  softmaxr+   )r<   queryr,  valuerf  rd  LSscale_factor	attn_biasattn_weightr   r   r   rl  T  s   z)Attention.eq_scaled_dot_product_attentionr-  rj   )Nr*   )rY   rZ   r[   r   r_   rt   r[  r   r   r   rv   rl  rn   r   r   rl   r   rO    s(    
ErO  c                       s8   e Zd Zdeddf fddZdedefddZ  ZS )	rP  r   r   Nc                    sP   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _d S )NFr   )	rk   rt   r   r   r   r    w1w3w2)r<   r   rl   r   r   rt   r  s   
zFeedForward.__init__r   c                 C   s    |  t| || | S rj   )r  r   silur~  r  r<   r   r   r   r   r   x  s    zFeedForward.forward)rY   rZ   r[   r   rt   r   r   rn   r   r   rl   r   rP  q  s    rP  c                       sB   e Zd Zddedef fddZdd Zded	efd
dZ  Z	S )r   r'   r   r   c                    s&   t    || _tt|| _d S rj   )rk   rt   r   r   	Parameterrv   r   r   )r<   r   r   rl   r   r   rt   }  s   
zRMSNorm.__init__c                 C   s$   |t t j|| ddd| j  S )Nr!   T)r   keepdim)rv   rsqrtr   r   r  r   r   r   _norm  s   $zRMSNorm._normr   r   c                 C   s   |  | |}|| j S rj   )r  r^   type_asr   )r<   r   r   r   r   r   r     s   
zRMSNorm.forward)r'   )
rY   rZ   r[   r;   r^   rt   r  r   r   rn   r   r   rl   r   r   |  s    r   r%   r   n_elemr   c                 C   s|   d|t d|dd|d   |   }t j| |jd}t ||}t t ||}t j|j|j	gdd}|j
t jdS )	a  
    Precomputes frequency tensors for complex exponentials (cis)

    Args:
        seq_len: Length of the sequence for which positional embeddings are needed.
        n_elem: Number of elements in the frequency tensor.
        base: Base value for the frequency scaling (default: 10000).

    Returns:
        A tensor containing the precomputed frequencies in real and imaginary parts (bfloat16).
    g      ?r   r8   Nr   r!   r   rq   )rv   r   r^   r   outerpolar	ones_liker   realimagtor   )r   r  r   freqstr   cacher   r   r   r     s   $r   r   r   c                 C   s   |   jg | jd d ddR  }|d|dd|dd}t|d |d  |d |d   |d |d  |d |d   gd}|d}|| S )Nr!   r8   r   r9   ).r   ).r   )	r^   reshaper{   rM  r   rv   r   flattenr  )r   r   xshapedx_out2r   r   r   rg    s   &

rg  )r%   )3r<  rI   r   collectionsr   r   pathlibr   typingr   rv   torch.nnr   einopsr   logurur   r   r	   r   torch.nn.attentionr
   r   torch.utils.checkpointr   transformersr   %fish_speech.models.text2semantic.lorar   r   fish_speech.tokenizerr   r   r;   r   r   rK   rL   Modulero   r   r   r   r  r  r   rO  rP  r   r   rg  r   r   r   r   <module>   sV    D'  [. ) 