o
    wi                     @   s  d Z ddlmZmZmZ ddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ e  rddl0m1Z1 e"2e3Z4eG dd deZ5G dd dej6Z7G dd de)Z8G dd de%Z9G dd de&Z:G d d! d!e$ej6Z;G d"d# d#ej6Z<G d$d% d%eZ=G d&d' d'e5Z>G d(d) d)eZ?G d*d+ d+e5Z@ed,d-G d.d/ d/e5ZAed0d-G d1d2 d2e5e/ZBg d3ZCdS )4zPyTorch Dia model.    )CallableOptionalUnionN)nn   )DynamicCacheEncoderDecoderCache)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )LlamaAttentionLlamaRMSNormLlamaRotaryEmbeddingeager_attention_forward)Phi3MLP   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   @   s@   e Zd ZeZdZdZdZdZdZ	dZ
dZdZddgZdd ZdS )	DiaPreTrainedModelmodelT	input_idsDiaEncoderLayerDiaDecoderLayerc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S d S )N        )meanstdg      ?)configinitializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx
DiaRMSNormfill_)selfmoduler-    r=   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/dia/modular_dia.py_init_weightsF   s   


z DiaPreTrainedModel._init_weightsN)__name__
__module____qualname__r!   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_static_cachemain_input_name_no_split_modulesr?   r=   r=   r=   r>   r&   9   s    r&   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r.   c                    s^   t    t|j|j |j| _|j| _|j| _tj	|jtj
d|j }| jd|dd d S )N)dtypeoffsetsF)
persistent)super__init__r   r7   
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)r;   r.   rO   	__class__r=   r>   rR   b   s   
z!DiaMultiChannelEmbedding.__init__audio_codesreturnc                 C   sH   || j |j d}| ||jd |jd d| j}|jddS )Nr    r   r   )dim)	rO   todevicesqueezerV   viewshaperU   sum)r;   r]   tokensembedsr=   r=   r>   forwardj   s   $z DiaMultiChannelEmbedding.forward)
r@   rA   rB   __doc__r"   rR   rW   Tensorri   __classcell__r=   r=   r[   r>   rM   T   s    rM   c                   @      e Zd ZdS )DiaMLPNr@   rA   rB   r=   r=   r=   r>   rn   p       rn   c                   @   rm   )r9   Nro   r=   r=   r=   r>   r9   t   rp   r9   c                   @   rm   )DiaRotaryEmbeddingNro   r=   r=   r=   r>   rq   x   rp   rq   c                   @   s0   e Zd ZdZd	deeef dedefddZ	dS )
DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperFr.   	layer_idx	is_causalc                 C   s   t j  || _|| _|j| _| jj| _| jjp| j| _| j| j | _	t
|d|j| j | _d| _d| _|| _t j| j| j| j dd| _t j| j| j| j dd| _t j| j| j| j dd| _t j| j| j | jdd| _d S )Nhead_dimr    r+   Fr5   )r   ModulerR   r.   rt   rU   num_attention_heads	num_headsnum_key_value_headsnum_key_value_groupsgetattrrv   scalingattention_dropoutru   r1   q_projk_projv_projo_proj)r;   r.   rt   ru   r=   r=   r>   rR      s   

 zDiaSelfAttention.__init__N)F)
r@   rA   rB   rj   r   r#   r"   intboolrR   r=   r=   r=   r>   rr   |   s    $rr   c                       st   e Zd ZdZdedef fddZ		ddejdejd	e	ej d
e	e
 dee deeje	ej f fddZ  ZS )DiaCrossAttentionrs   r.   rt   c                    s   t    || _|| _|j| _|j| _| jj| _| jj| _	| j| j	 | _
|j| _d| _d| _d| _tj| j| j| j dd| _tj| j| j	| j dd| _tj| j| j	| j dd| _tj| j| j | jdd| _d S )Nr    r+   Frw   )rQ   rR   r.   rt   rU   cross_hidden_sizecross_num_attention_headsrz   cross_num_key_value_headsr{   r|   cross_head_dimrv   r~   r   ru   r   r1   r   r   r   r   r;   r.   rt   r[   r=   r>   rR      s    


 zDiaCrossAttention.__init__Nhidden_statescross_attention_statesattention_maskpast_key_valueskwargsr^   c                 K   s^  |j d d }g |d| jR }g |j d d d| jR }| ||dd}	|d ur7|j| jnd}
|d urN|
rN|jj	| j }|jj
| j }n-| ||dd}| ||dd}|d ur{|j||| j\}}d|j| j< t}| jjdkrt| jj }|| |	|||fd| ji|\}}|g |dR  }| |}||fS )Nr_   r    r   FTeagerr~   )re   rv   r   rd   	transpose
is_updatedgetrt   cross_attention_cache	key_cachevalue_cacher   r   updater   r.   _attn_implementationr   r~   reshape
contiguousr   )r;   r   r   r   r   r   input_shapehidden_shapecross_shapequery_statesr   
key_statesvalue_statesattention_interfaceattn_outputattn_weightsr=   r=   r>   ri      sD   


zDiaCrossAttention.forwardNN)r@   rA   rB   rj   r"   r   rR   rW   rk   r   r   r   r   tupleri   rl   r=   r=   r[   r>   r      s$    r   c                       sv   e Zd Zdedef fddZ		ddejdee	ejejf  deej d	e
e d
e	ejeej f f
ddZ  ZS )r)   r.   rt   c                    sL   t    t|j|jd| _t||dd| _t|j|jd| _t	|| _
d S )NepsFru   )rQ   rR   r9   rU   norm_epspre_sa_normrr   self_attentionpost_sa_normrn   mlpr   r[   r=   r>   rR      s
   
zDiaEncoderLayer.__init__Nr   position_embeddingsr   r   r^   c           
      K   sZ   |}|  |}| j|f||d|\}}|| }|}| |}| |}	||	 }||fS )Nr   r   )r   r   r   r   )
r;   r   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outr=   r=   r>   ri      s    



zDiaEncoderLayer.forwardr   )r@   rA   rB   r#   r   rR   rW   rk   r   r   r   r   ri   rl   r=   r=   r[   r>   r)      s    
r)   c                       s   e Zd Zdef fddZee			ddejde	ej de	e
 d	e	e
 d
ee deeef fddZdeejdf dejfddZ  ZS )
DiaEncoderr.   c                    sd   t     | _t j j| _t fddt	 j
D | _t j jd| _t | _d S )Nc                       g | ]}t  |qS r=   )r)   .0rt   r.   r=   r>   
<listcomp>      z'DiaEncoder.__init__.<locals>.<listcomp>r   )rQ   rR   r.   r   r7   rS   rU   	embedding
ModuleListrangenum_hidden_layerslayersr9   r   normrq   rotary_embeddingsr;   r.   r[   r   r>   rR      s   zDiaEncoder.__init__NFr(   r   output_attentionsoutput_hidden_statesr   r^   c                 K   s   |  |}tj|jd |jdd d d f }| ||}| ||}|r&dnd }	|r,dnd }
| jD ]!}|r:|	|f }	||f||d|}|d }|rR|
|d f }
q1| |}|r_|	|f7 }	t	||	|
dS )Nr_   rb   r=   r   r   r    last_hidden_stater   
attentions)
r   rW   rX   re   rb   r   _update_full_maskr   r   r   )r;   r(   r   r   r   r   r   position_idsr   encoder_statesall_attentionsencoder_layerlayer_outputsr=   r=   r>   ri   	  s<   

"



zDiaEncoder.forwardinputs_embedsc                 C   s   |d ur>| j jdkrd|v r|}|S d }|S | j jdkr$t||j}|S | j jdkr8t|tjr6t|dd}|S t||j}|S )Nflash_attention_2r   sdpaflex_attentionFr   	r.   r   r   rN   r0   rW   rk   r%   r
   )r;   r   r   r=   r=   r>   r   <  s   zDiaEncoder._update_full_mask)NFF)r@   rA   rB   r#   rR   r   r   rW   rk   r   r   r   r   r   r   r   ri   r   rl   r=   r=   r[   r>   r      s2    
1r   c                       s   e Zd Zdedef fddZ						ddejdee	ejejf  deej d	eej d
eej dee
 deej de	ejeej eej f fddZ  ZS )r*   r.   rt   c                    sr   t    |j| _t||dd| _t||| _t|j|j	d| _
t|j|j	d| _t|j|j	d| _t|| _d S )NTr   r   )rQ   rR   rU   	embed_dimrr   r   r   cross_attentionr9   r   r   pre_ca_normpre_mlp_normrn   r   r   r[   r=   r>   rR   T  s   
zDiaDecoderLayer.__init__Nr   r   r   encoder_hidden_statesencoder_attention_maskr   cache_positionr^   c                 K   s   |}	t |	tr
|	j}	|}
| |}| j||||	fd|i|\}}|
| }|}
| |}| j||f||d|\}}|
| }|}
| |}| |}|
| }|||fS )Nr   )r   r   )	r0   r   self_attention_cacher   r   r   r   r   r   )r;   r   r   r   r   r   r   r   r   self_attn_cacher   r   r   r   cross_statescross_attn_weightsr   r=   r=   r>   ri   ^  sB   








zDiaDecoderLayer.forward)NNNNNN)r@   rA   rB   r"   r   rR   rW   rk   r   r   r   
LongTensorri   rl   r=   r=   r[   r>   r*   S  s2    
r*   c                       s   e Zd ZdZdef fddZee								ddej	de
ej d	e
ej	 d
e
ej de
ej de
e de
e de
e de
ej deeef fddZd
eej	df deej	df dejdej	fddZ  ZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r.   c                    sf   t     j| _ j| _t | _t | _t	 fddt
 jD | _t j jd| _d S )Nc                    r   r=   )r*   r   r   r=   r>   r     r   z'DiaDecoder.__init__.<locals>.<listcomp>r   )rQ   rR   rT   rS   rM   
embeddingsrq   r   r   r   r   r   r   r9   rU   r   r   r   r[   r   r>   rR     s   

zDiaDecoder.__init__NFr(   r   r   r   r   r   r   r   r   r^   c
                 K   s  |  dd \}}|dur| nd}|	du r#tj||| |jd}	|du r/|	dddf }| |}| ||}|du rNt sN|| }tj|||jd}t	| j
|||	||d}| |||jdd |}|rjdnd}|rpdnd}|rz|durzdnd}| jD ]0}|r||f7 }|||||f|||	d|
}|d }|r||d	 f }|dur||d f }q| |}|r||f7 }t|||||d
S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        Nr_   r   r   )r.   input_embedsr   r   r   r   r   r=   )r   r   r   r    )r   r   r   r   cross_attentions)sizeget_seq_lengthrW   rX   rb   r   r   r   onesr	   r.   _update_cross_attn_maskre   r   r   r   )r;   r(   r   r   r   r   r   r   r   r   r   
batch_size
seq_lengthpast_key_values_lengthr   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerr   r=   r=   r>   ri     sz   





zDiaDecoder.forwardr   r   c                 C   s   |d urM|d urM| j jdkrd|v r|}|S d }|S | j jdkr,t||j|d d}|S | j jdkrCt|tjrAt||d dd}|S t||j|d d}|S )	Nr   r   r   r_   )tgt_lenr   F)query_lengthru   r   )r;   r   r   r   r   r=   r=   r>   r     s2   z"DiaDecoder._update_cross_attn_mask)NNNNNFFN)r@   rA   rB   rj   r"   rR   r   r   rW   rk   r   r   FloatTensorr   r   r   r   r   ri   Sizer   rl   r=   r=   r[   r>   r     sV    	

]r   z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                       s   e Zd Zdef fddZdd Zdd Zee											dd	e	e
j d
e	e
j de	e
j de	e
j de	e
j de	eeef  de	e de	e de	e de	e de	e
j deeef fddZ  ZS )DiaModelr.   c                    s6   t  | || _t|j| _t|j| _| 	  d S N)
rQ   rR   r.   r   encoder_configencoderr   decoder_configdecoder	post_initr   r[   r=   r>   rR   %  s
   zDiaModel.__init__c                 C      | j S r   )r   r;   r=   r=   r>   get_encoder,     zDiaModel.get_encoderc                 C   r   r   )r   r   r=   r=   r>   get_decoder/  r  zDiaModel.get_decoderNr(   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher   r   r   r^   c                 K   s  |du r|du rt d|	dur|	n| jj}	|
dur|
n| jj}
|dur&|n| jj}| jr9| jr9|r9td d}|rF|du rFt	t
 t
 }|du rX| jd|||	|
d|}n"t|tszt|d t|dkrk|d ndt|dkrv|d ndd	}|d jd d
| jjj}}}|du rtj|d|f| jj| jd}|jdkr||||dd}| jd||||d |||	|
||d
|}t|j|j|j|j|j|d |j|jdS )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r(   r   r   r   r   r    r   r   r_   )r   
fill_valuerb   )
r(   r   r   r   r   r   r   r   r  r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentionsr=   ) 
ValueErrorr.   r   r   r  is_gradient_checkpointingtrainingloggerwarning_oncer   r   r   r0   r   lenre   r   rT   rW   fullbos_token_idrb   ndimr   r   r   r   r   r   r   r   r   )r;   r(   r   r  r  r  r  r   r  r   r   r   r   bszseq_lenchannelsdecoder_outputsr=   r=   r>   ri   2  s|   '
 
zDiaModel.forward)NNNNNNNNNNN)r@   rA   rB   r!   rR   r   r  r   r   r   rW   r   r   r   r   r   r   r   ri   rl   r=   r=   r[   r>   r     sT    	

r   zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       s   e Zd ZdZdef fddZdd Zdd Zee																									dd
e
ej de
ej de
ej de
ej de
ej de
eeef  de
e de
e de
e de
e de
ej de
ej deeef fddZ  ZS )DiaForConditionalGenerationr'   r.   c                    s`   t  | || _t|| _|jj| _|jj| _tj	|jj
| j| j dd| _d| _|   d S )NFrw   ForMaskedLM)rQ   rR   r.   r   r'   r   rT   rS   r   r1   rU   logits_dense	loss_typer   r   r[   r=   r>   rR     s   


z$DiaForConditionalGeneration.__init__c                 C   
   | j  S r   )r'   r   r   r=   r=   r>   r        
z'DiaForConditionalGeneration.get_encoderc                 C   r  r   )r'   r  r   r=   r=   r>   r    r  z'DiaForConditionalGeneration.get_decoderNr(   r   r  r  r  r  r   r  r   r   labelsr   r^   c                 K   s   | j d	|||||||||	|
|d|}|d }|jd }| ||d| j| jfdd || j d| j}d}|durM| jd	||| jd|}t	|||j
|j|j|j|j|j|jd	S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r(   r   r  r  r  r  r   r  r   r   r   r   r_   r    r   N)logitsr   rS   )	lossr!  r   r	  r
  r   r  r   r  r=   )r'   re   r  rd   rT   rS   r   r   loss_functionr   r   r	  r
  r   r  r   r  )r;   r(   r   r  r  r  r  r   r  r   r   r   r   r   outputsr   r   audio_logitsr"  r=   r=   r>   ri     sJ   ,
z#DiaForConditionalGeneration.forward)NNNNNNNNNNNN)r@   rA   rB   rD   r!   rR   r   r  r   r   r   rW   r   r   r   r   r   r   r   ri   rl   r=   r=   r[   r>   r    s\    	

r  )r   r&   r  )Drj   typingr   r   r   rW   r   cache_utilsr   r   masking_utilsr	   modeling_attn_mask_utilsr
   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   llama.modeling_llamar   r   r   r   phi3.modeling_phi3r   configuration_diar!   r"   r#   generation_diar$   integrations.flex_attentionr%   
get_loggerr@   r  r&   rx   rM   rn   r9   rq   rr   r   r)   r   r*   r   r   r  __all__r=   r=   r=   r>   <module>   sT   
J!V; ~o