o
    wiʧ                     @   s  d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ e' rddl0m1Z1 e)2e3Z4e%G dd de!Z5G dd dej6Z7G dd dej6Z8edG dd dej6Z9G dd dej6Z:d d! Z;dGd"d#Z<d$ej=d%e>d&ej=fd'd(Z?	)dHd*ej6d+ej=d,ej=d-ej=d.eej= d/e@d0e@fd1d2ZAG d3d4 d4ej6ZBG d5d6 d6ej6ZCG d7d8 d8eZDG d9d: d:e5ZEG d;d< d<eZFG d=d> d>e5ZGe%d?d@G dAdB dBe5ZHe%dCd@G dDdE dEe5e/ZIg dFZJdS )I    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hub)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   @   s@   e Zd ZeZdZdZdZdZdZ	dZ
dZdZddgZdd ZdS )	DiaPreTrainedModelmodelT	input_idsDiaEncoderLayerDiaDecoderLayerc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S d S )N        )meanstdg      ?)configinitializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx
DiaRMSNormfill_)selfmoduler,    r<   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/dia/modeling_dia.py_init_weightsD   s   


z DiaPreTrainedModel._init_weightsN)__name__
__module____qualname__r    config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_static_cachemain_input_name_no_split_modulesr>   r<   r<   r<   r=   r%   7   s    r%   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r-   c                    s^   t    t|j|j |j| _|j| _|j| _tj	|jtj
d|j }| jd|dd d S )NdtypeoffsetsF
persistent)super__init__r   r6   
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)r:   r-   rO   	__class__r<   r=   rS   `   s   
z!DiaMultiChannelEmbedding.__init__audio_codesreturnc                 C   sH   || j |j d}| ||jd |jd d| j}|jddS )Nr   r      dim)	rO   todevicesqueezerW   viewshaperV   sum)r:   r^   tokensembedsr<   r<   r=   forwardh   s   $z DiaMultiChannelEmbedding.forward)
r?   r@   rA   __doc__r!   rS   rX   Tensorrl   __classcell__r<   r<   r\   r=   rL   R   s    rL   c                       s2   e Zd Z fddZdejdejfddZ  ZS )DiaMLPc                    sP   t    || _tj|jd|j dd| _tj|j|jdd| _t	|j
 | _d S )Nra   Fr4   )rR   rS   r-   r   r0   rV   intermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnr:   r-   r\   r<   r=   rS   o   s
   
zDiaMLP.__init__hidden_statesr_   c                 C   s4   |  |}|jddd\}}|| | }| |S )Nra   r`   rb   )rs   chunkrv   rt   )r:   rx   	up_statesgater<   r<   r=   rl   w   s   

zDiaMLP.forward)r?   r@   rA   rS   rX   FloatTensorrl   ro   r<   r<   r\   r=   rp   n   s    rp   RMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	r8   ư>c                    s&   t    tt|| _|| _dS )z9
        DiaRMSNorm is equivalent to T5LayerNorm
        N)rR   rS   r   	ParameterrX   onesr1   variance_epsilon)r:   rV   epsr\   r<   r=   rS      s   

zDiaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nra   r`   T)keepdim)	rN   rd   rX   float32powr+   rsqrtr   r1   )r:   rx   input_dtypevariancer<   r<   r=   rl      s
   zDiaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler1   rh   r   r:   r<   r<   r=   
extra_repr   s   zDiaRMSNorm.extra_repr)r~   )r?   r@   rA   rS   rl   r   ro   r<   r<   r\   r=   r8      s    r8   c                       s8   e Zd Zddef fddZe edd Z  Z	S )DiaRotaryEmbeddingNr-   c                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultinv_freqFrP   )rR   rS   hasattrr   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr-   r   rope_init_fnattention_scalingr[   r   original_inv_freq)r:   r-   re   r   r\   r<   r=   rS      s   
zDiaRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r`   r   mpscpuF)device_typeenabledra   rb   rM   )r   floatexpandrh   rd   re   r/   r   strrX   autocast	transposecatcosr   sinrN   )
r:   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r<   r<   r=   rl      s   0&zDiaRotaryEmbedding.forwardN)
r?   r@   rA   r    rS   rX   no_gradr   rl   ro   r<   r<   r\   r=   r      s
    r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr`   ra   rb   )rh   rX   r   )r   x1x2r<   r<   r=   rotate_half   s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embedr<   r<   r=   apply_rotary_pos_emb   s
   

r   rx   n_repr_   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rh   r   reshape)rx   r   batchnum_key_value_headsslenhead_dimr<   r<   r=   	repeat_kv   s
   0r   r*   r;   querykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nra   r   r`   )rc   rN   )ptrainingr   )r   num_key_value_groupsrX   matmulr   rh   r   
functionalsoftmaxr   rd   rN   r   r   
contiguous)r;   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr<   r<   r=   eager_attention_forward   s   
&r   c                       s   e Zd ZdZddeeef dedef fddZ			dd	e
jd
ee
je
jf dee
j dee dee
j dee dee
jee
j eee
j  f fddZ  ZS )DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperFr-   	layer_idx	is_causalc                    s   t    || _|| _|j| _| jj| _| jjp| j| _| j| j | _t	|d|j| j | _
d| _d| _|| _tj| j| j| j
 dd| _tj| j| j| j
 dd| _tj| j| j| j
 dd| _tj| j| j
 | jdd| _d S )Nr   r   r*   Frq   )rR   rS   r-   r   rV   num_attention_heads	num_headsr   r   getattrr   r   attention_dropoutr   r   r0   q_projk_projv_projo_proj)r:   r-   r   r   r\   r<   r=   rS     s   

 zDiaSelfAttention.__init__Nrx   position_embeddingsr   past_key_valuecache_positionr   r_   c                 K   s$  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| jd|\}}|jg |dR   }| |}||fS )Nr`   r   ra   )r   r   r   eagerr*   )r   r   )rh   r   r   rg   r   r   r   r   updater   r   r-   _attn_implementationr   r   r   r   r   r   r   )r:   rx   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r<   r<   r=   rl     s8   	

zDiaSelfAttention.forward)FNN)r?   r@   rA   rm   r   r"   r!   intboolrS   rX   rn   r   r   r   
LongTensorr   r   rl   ro   r<   r<   r\   r=   r      s(    $r   c                       st   e Zd ZdZdedef fddZ		ddejdejd	e	ej d
e	e
 dee deeje	ej f fddZ  ZS )DiaCrossAttentionr   r-   r   c                    s   t    || _|| _|j| _|j| _| jj| _| jj| _	| j| j	 | _
|j| _d| _d| _d| _tj| j| j| j dd| _tj| j| j	| j dd| _tj| j| j	| j dd| _tj| j| j | jdd| _d S )Nr   r*   Frq   )rR   rS   r-   r   rV   cross_hidden_sizecross_num_attention_headsr   cross_num_key_value_headsr   r   cross_head_dimr   r   r   r   r   r0   r   r   r   r   r:   r-   r   r\   r<   r=   rS   C  s    


 zDiaCrossAttention.__init__Nrx   cross_attention_statesr   past_key_valuesr   r_   c                 K   s^  |j d d }g |d| jR }g |j d d d| jR }| ||dd}	|d ur7|j| jnd}
|d urN|
rN|jj	| j }|jj
| j }n-| ||dd}| ||dd}|d ur{|j||| j\}}d|j| j< t}| jjdkrt| jj }|| |	|||fd| ji|\}}|g |dR  }| |}||fS )Nr`   r   ra   FTr   r   )rh   r   r   rg   r   
is_updatedr   r   cross_attention_cache	key_cachevalue_cacher   r   r   r   r-   r   r   r   r   r   r   )r:   rx   r   r   r   r   r   r   cross_shaper   r   r   r   r   r   r   r<   r<   r=   rl   V  sD   


zDiaCrossAttention.forwardr   )r?   r@   rA   rm   r!   r   rS   rX   rn   r   r
   r   r   r   rl   ro   r<   r<   r\   r=   r   @  s$    r   c                       sv   e Zd Zdedef fddZ		ddejdee	ejejf  deej d	e
e d
e	ejeej f f
ddZ  ZS )r(   r-   r   c                    sL   t    t|j|jd| _t||dd| _t|j|jd| _t	|| _
d S )Nr   Fr   )rR   rS   r8   rV   norm_epspre_sa_normr   self_attentionpost_sa_normrp   mlpr   r\   r<   r=   rS     s
   
zDiaEncoderLayer.__init__Nrx   r   r   r   r_   c           
      K   sZ   |}|  |}| j|f||d|\}}|| }|}| |}| |}	||	 }||fS )Nr   r   )r   r   r  r  )
r:   rx   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outr<   r<   r=   rl     s    



zDiaEncoderLayer.forwardr   )r?   r@   rA   r"   r   rS   rX   rn   r   r   r   r   rl   ro   r<   r<   r\   r=   r(     s    
r(   c                       s   e Zd Zdef fddZee			ddejde	ej de	e
 d	e	e
 d
ee deeef fddZdeejdf dejfddZ  ZS )
DiaEncoderr-   c                    sd   t     | _t j j| _t fddt	 j
D | _t j jd| _t | _d S )Nc                       g | ]}t  |qS r<   )r(   .0r   r-   r<   r=   
<listcomp>      z'DiaEncoder.__init__.<locals>.<listcomp>r   )rR   rS   r-   r   r6   rT   rV   	embedding
ModuleListrangenum_hidden_layerslayersr8   r   normr   rotary_embeddingsrw   r\   r  r=   rS     s   zDiaEncoder.__init__NFr'   r   output_attentionsoutput_hidden_statesr   r_   c                 K   s   |  |}tj|jd |jdd d d f }| ||}| ||}|r&dnd }	|r,dnd }
| jD ]!}|r:|	|f }	||f||d|}|d }|rR|
|d f }
q1| |}|r_|	|f7 }	t	||	|
dS )Nr`   re   r<   r  r   r   last_hidden_staterx   
attentions)
r  rX   rY   rh   re   r  _update_full_maskr  r  r   )r:   r'   r   r  r  r   rx   r   r   encoder_statesall_attentionsencoder_layerlayer_outputsr<   r<   r=   rl     s<   

"



zDiaEncoder.forwardinputs_embedsc                 C   s   |d ur>| j jdkrd|v r|}|S d }|S | j jdkr$t||j}|S | j jdkr8t|tjr6t|dd}|S t||j}|S )Nflash_attention_2r   sdpaflex_attentionFr   	r-   r   r   rN   r/   rX   rn   r$   r   )r:   r   r"  r<   r<   r=   r    s   zDiaEncoder._update_full_mask)NFF)r?   r@   rA   r"   rS   r   r   rX   rn   r   r   r   r   r   r   r   rl   r  ro   r<   r<   r\   r=   r	    s2    
1r	  c                       s   e Zd Zdedef fddZ						ddejdee	ejejf  deej d	eej d
eej dee
 deej de	ejeej eej f fddZ  ZS )r)   r-   r   c                    sr   t    |j| _t||dd| _t||| _t|j|j	d| _
t|j|j	d| _t|j|j	d| _t|| _d S )NTr   r   )rR   rS   rV   	embed_dimr   r   r   cross_attentionr8   r   r   pre_ca_normpre_mlp_normrp   r  r   r\   r<   r=   rS     s   
zDiaDecoderLayer.__init__Nrx   r   r   encoder_hidden_statesencoder_attention_maskr   r   r_   c                 K   s   |}	t |	tr
|	j}	|}
| |}| j||||	fd|i|\}}|
| }|}
| |}| j||f||d|\}}|
| }|}
| |}| |}|
| }|||fS )Nr   )r   r   )	r/   r
   self_attention_cacher   r   r)  r(  r*  r  )r:   rx   r   r   r+  r,  r   r   r   self_attn_cacher  r  r  r  cross_statescross_attn_weightsr  r<   r<   r=   rl     sB   








zDiaDecoderLayer.forward)NNNNNN)r?   r@   rA   r!   r   rS   rX   rn   r   r   r
   r   rl   ro   r<   r<   r\   r=   r)     s2    
r)   c                       s   e Zd ZdZdef fddZee								ddej	de
ej d	e
ej	 d
e
ej de
ej de
e de
e de
e de
ej deeef fddZd
eej	df deej	df dejdej	fddZ  ZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r-   c                    sf   t     j| _ j| _t | _t | _t	 fddt
 jD | _t j jd| _d S )Nc                    r
  r<   )r)   r  r  r<   r=   r  F  r  z'DiaDecoder.__init__.<locals>.<listcomp>r   )rR   rS   rU   rT   rL   
embeddingsr   r  r   r  r  r  r  r8   rV   r   r  rw   r\   r  r=   rS   ?  s   

zDiaDecoder.__init__NFr'   r   r   r+  r,  r   r  r  r   r_   c
                 K   s  |  dd \}}|dur| nd}|	du r#tj||| |jd}	|du r/|	dddf }| |}| ||}|du rNt sN|| }tj|||jd}t	| j
|||	||d}| |||jdd |}|rjdnd}|rpdnd}|rz|durzdnd}| jD ]0}|r||f7 }|||||f|||	d|
}|d }|r||d	 f }|dur||d f }q| |}|r||f7 }t|||||d
S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        Nr`   r   r  )r-   input_embedsr   r   r   r   ra   r<   )r,  r   r   r   )r  r   rx   r  cross_attentions)sizeget_seq_lengthrX   rY   re   r2  r  r   r   r   r-   _update_cross_attn_maskrh   r  r  r   )r:   r'   r   r   r+  r,  r   r  r  r   r   
batch_size
seq_lengthpast_key_values_lengthrx   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerr!  r<   r<   r=   rl   J  sz   





zDiaDecoder.forwardr   r"  c                 C   s   |d urM|d urM| j jdkrd|v r|}|S d }|S | j jdkr,t||j|d d}|S | j jdkrCt|tjrAt||d dd}|S t||j|d d}|S )	Nr#  r   r$  r`   )tgt_lenr%  F)query_lengthr   r&  )r:   r+  r,  r   r"  r<   r<   r=   r7    s2   z"DiaDecoder._update_cross_attn_mask)NNNNNFFN)r?   r@   rA   rm   r!   rS   r   r   rX   rn   r   r   r|   r
   r   r   r   r   rl   Sizer7  ro   r<   r<   r\   r=   r1  <  sV    	

]r1  z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                       s   e Zd Zdef fddZdd Zdd Zee											dd	e	e
j d
e	e
j de	e
j de	e
j de	e
j de	eeef  de	e de	e de	e de	e de	e
j deeef fddZ  ZS )DiaModelr-   c                    s6   t  | || _t|j| _t|j| _| 	  d S r   )
rR   rS   r-   r	  encoder_configencoderr1  decoder_configdecoder	post_initrw   r\   r<   r=   rS     s
   zDiaModel.__init__c                 C      | j S r   )rF  r   r<   r<   r=   get_encoder     zDiaModel.get_encoderc                 C   rJ  r   )rH  r   r<   r<   r=   get_decoder  rL  zDiaModel.get_decoderNr'   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher  r  r   r_   c                 K   s  |du r|du rt d|	dur|	n| jj}	|
dur|
n| jj}
|dur&|n| jj}| jr9| jr9|r9td d}|rF|du rFt	t
 t
 }|du rX| jd|||	|
d|}n"t|tszt|d t|dkrk|d ndt|dkrv|d ndd	}|d jd d
| jjj}}}|du rtj|d|f| jj| jd}|jdkr||||dd}| jd||||d |||	|
||d
|}t|j|j|j|j|j|d |j|jdS )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r'   r   r  r  r   r   ra   r  r`   )r5  
fill_valuere   )
r'   r   r   r+  r,  r   r  r  rR  r   )r  r   decoder_hidden_statesdecoder_attentionsr4  encoder_last_hidden_stater+  encoder_attentionsr<   ) 
ValueErrorr-   r  r  rR  is_gradient_checkpointingr   loggerwarning_oncer
   r	   rF  r/   r   lenrh   rG  rU   rX   fullbos_token_idre   ndimr   r   rH  r   r  r   rx   r  r4  )r:   r'   r   rN  rO  rP  rQ  r   rR  r  r  r   r   bszseq_lenchannelsdecoder_outputsr<   r<   r=   rl     s|   '
 
zDiaModel.forward)NNNNNNNNNNN)r?   r@   rA   r    rS   rK  rM  r   r   r   rX   r   r   r   r   r
   r   r   rl   ro   r<   r<   r\   r=   rD    sT    	

rD  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       s   e Zd ZdZdef fddZdd Zdd Zee																									dd
e
ej de
ej de
ej de
ej de
ej de
eeef  de
e de
e de
e de
e de
ej de
ej deeef fddZ  ZS )DiaForConditionalGenerationr&   r-   c                    s`   t  | || _t|| _|jj| _|jj| _tj	|jj
| j| j dd| _d| _|   d S )NFrq   ForMaskedLM)rR   rS   r-   rD  r&   rG  rU   rT   r   r0   rV   logits_dense	loss_typerI  rw   r\   r<   r=   rS   X  s   


z$DiaForConditionalGeneration.__init__c                 C   
   | j  S r   )r&   rK  r   r<   r<   r=   rK  g     
z'DiaForConditionalGeneration.get_encoderc                 C   rh  r   )r&   rM  r   r<   r<   r=   rM  j  ri  z'DiaForConditionalGeneration.get_decoderNr'   r   rN  rO  rP  rQ  r   rR  r  r  labelsr   r_   c                 K   s   | j d	|||||||||	|
|d|}|d }|jd }| ||d| j| jfdd || j d| j}d}|durM| jd	||| jd|}t	|||j
|j|j|j|j|j|jd	S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r'   r   rN  rO  rP  rQ  r   rR  r  r  r   r   r`   r   ra   N)logitsrj  rT   )	lossrk  r   rT  rU  r4  rV  r+  rW  r<   )r&   rh   rf  rg   rU   rT   r   r   loss_functionr   r   rT  rU  r4  rV  r+  rW  )r:   r'   r   rN  rO  rP  rQ  r   rR  r  r  rj  r   r   outputsr  r8  audio_logitsrl  r<   r<   r=   rl   m  sJ   ,
z#DiaForConditionalGeneration.forward)NNNNNNNNNNNN)r?   r@   rA   rC   r    rS   rK  rM  r   r   r   rX   r   r   r   r   r
   r   r   rl   ro   r<   r<   r\   r=   rd  P  s\    	

rd  )rD  r%   rd  )Nr   )r*   )Ktypingr   r   r   rX   r   activationsr   cache_utilsr   r	   r
   integrationsr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   configuration_diar    r!   r"   generation_diar#   integrations.flex_attentionr$   
get_loggerr?   rZ  r%   ModulerL   rp   r8   r   r   r   rn   r   r   r   r   r   r   r(   r	  r)   r1  rD  rd  __all__r<   r<   r<   r=   <module>   s|   
"

AJ!V; ~o