o
    eiK                     @   s  d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 e,5e6Z7e)G dd de$Z8G dd dej9Z:G dd dej9Z;edG dd dej9Z<G dd  d ej9Z=d!d" Z>ed#dJd$d%Z?d&ej@d'eAd(ej@fd)d*ZB	+dKd,ej9d-ej@d.ej@d/ej@d0ej@dB d1eCd2eCd3e&e( fd4d5ZDee?G d6d7 d7ej9ZEG d8d9 d9ej9ZFG d:d; d;eZGG d<d= d=e8ZHG d>d? d?eZIG d@dA dAe8ZJe)dBdCG dDdE dEe8ZKe)dFdCG dGdH dHe8e4ZLg dIZMdS )L    )Callable)OptionalN)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)maybe_autocast   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixinc                       sJ   e Zd ZU eed< dZdZdZdZdZ	dZ
dZddgZ fddZ  ZS )	DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerc                    sH   t  | t|tr"tj| jjtjd| jj	 }t
|j| d S d S )Ndtype)super_init_weights
isinstanceDiaMultiChannelEmbeddingtorcharanger'   num_channelslong
vocab_sizeinitcopy_offsets)selfmoduler9   	__class__ b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dia/modeling_dia.pyr/   @   s
   
z DiaPreTrainedModel._init_weights)__name__
__module____qualname__r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modulesr/   __classcell__r>   r>   r<   r?   r&   4   s   
 r&   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	r1   a  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r'   c                    s^   t    t|j|j |j| _|j| _|j| _tj	|jtj
d|j }| jd|dd d S )Nr,   r9   F
persistent)r.   __init__r   	Embeddingr6   r4   hidden_sizeembedr2   r3   r5   register_buffer)r:   r'   r9   r<   r>   r?   rO   U   s   
z!DiaMultiChannelEmbedding.__init__audio_codesreturnc                 C   sH   || j |j d}| ||jd |jd d| j}|jddS )Nr!   r      dim)	r9   todevicesqueezerR   viewshaperQ   sum)r:   rT   tokensembedsr>   r>   r?   forward]   s   $z DiaMultiChannelEmbedding.forward)
r@   rA   rB   __doc__r#   rO   r2   Tensorrb   rL   r>   r>   r<   r?   r1   G   s    r1   c                       s2   e Zd Z fddZdejdejfddZ  ZS )DiaMLPc                    sP   t    || _tj|jd|j dd| _tj|j|jdd| _t	|j
 | _d S )NrW   Fbias)r.   rO   r'   r   LinearrQ   intermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnr:   r'   r<   r>   r?   rO   d   s
   
zDiaMLP.__init__hidden_statesrU   c                 C   s4   |  |}|jddd\}}|| | }| |S )NrW   rV   rX   )rj   chunkrm   rk   )r:   ro   	up_statesgater>   r>   r?   rb   l   s   

zDiaMLP.forward)r@   rA   rB   rO   r2   FloatTensorrb   rL   r>   r>   r<   r?   re   c   s    re   RMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )
DiaRMSNormư>epsrU   Nc                    s&   t    tt|| _|| _dS )z9
        DiaRMSNorm is equivalent to T5LayerNorm
        N)r.   rO   r   	Parameterr2   onesweightvariance_epsilon)r:   rQ   rw   r<   r>   r?   rO   w   s   

zDiaRMSNorm.__init__ro   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )NrW   rV   T)keepdim)	r-   rZ   r2   float32powmeanrsqrtr{   rz   )r:   ro   input_dtypevariancer>   r>   r?   rb      s
   zDiaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerz   r^   r{   )r:   r>   r>   r?   
extra_repr   s   zDiaRMSNorm.extra_repr)rv   )
r@   rA   rB   floatrO   r2   rd   rb   r   rL   r>   r>   r<   r?   ru   u   s    ru   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )DiaRotaryEmbeddinginv_freqNr'   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   FrM   original_inv_freq)r.   rO   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr'   rope_parametersr   compute_default_rope_parametersr   attention_scalingrS   clone)r:   r'   r[   rope_init_fnr   r<   r>   r?   rO      s   


zDiaRotaryEmbedding.__init__r[   ztorch.deviceseq_lenrU   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   rW   r,   )r[   r-   )	r   getattrrQ   num_attention_headsr2   r3   int64rZ   r   )r'   r[   r   baserY   attention_factorr   r>   r>   r?   r      s   
&z2DiaRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   rV   r!   mpscpuF)device_typeenabledrW   rX   r,   )r   r   expandr^   rZ   r[   r0   typestrr    	transposer2   catcosr   sinr-   )
r:   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r>   r>   r?   rb      s   0&zDiaRotaryEmbedding.forwardN)NNN)r@   rA   rB   r2   rd   rC   r"   rO   staticmethodr   intr   r   r   no_gradr   rb   rL   r>   r>   r<   r?   r      s&   
 

r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrV   rW   rX   )r^   r2   r   )r   x1x2r>   r>   r?   rotate_half   s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embedr>   r>   r?   apply_rotary_pos_emb   s
   

r   ro   n_reprU   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r^   r   reshape)ro   r   batchnum_key_value_headsslenr   r>   r>   r?   	repeat_kv   s
   0r           r;   querykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )NrW   r   rV   )rY   r-   )ptrainingr!   )r   num_key_value_groupsr2   matmulr   r   
functionalsoftmaxr}   rZ   r-   r   r   
contiguous)r;   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr>   r>   r?   eager_attention_forward   s   
r   c                       s   e Zd ZdZddeeB dedef fddZ				dd	e	j
d
ee	j
e	j
f dB de	j
dB dedB de	jdB dee dee	j
e	j
f fddZ  ZS )DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperFr'   	layer_idx	is_causalc                    s   t    || _|| _|j| _| jj| _| jjp| j| _| j| j | _t	|d|j| j | _
d| _d| _|| _tj| j| j| j
 dd| _tj| j| j| j
 dd| _tj| j| j| j
 dd| _tj| j| j
 | jdd| _d S )Nr   r!   r   Frf   )r.   rO   r'   r   rQ   r   	num_headsr   r   r   r   r   attention_dropoutr   r   rh   q_projk_projv_projo_proj)r:   r'   r   r   r<   r>   r?   rO     s   

 zDiaSelfAttention.__init__Nro   position_embeddingsr   past_key_valuescache_positionr   rU   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jd|\}}|jg |dR   }| |}||fS )NrV   r!   rW   )r   r   r   r   )r   r   )r^   r   r   r]   r   r   r   r   updater   r   get_interfacer'   _attn_implementationr   r   r   r   r   r   r   )r:   ro   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r>   r>   r?   rb   '  s8   	

zDiaSelfAttention.forward)F)NNNN)r@   rA   rB   rc   r$   r#   r   boolrO   r2   rd   r   r   
LongTensorr   r   rb   rL   r>   r>   r<   r?   r     s,     r   c                       st   e Zd ZdZdedef fddZ		ddejdejd	ejdB d
e	dB de
e deejejdB f fddZ  ZS )DiaCrossAttentionr   r'   r   c                    s   t    || _|| _|j| _|j| _| jj| _| jj| _	| j| j	 | _
|j| _d| _d| _d| _tj| j| j| j dd| _tj| j| j	| j dd| _tj| j| j	| j dd| _tj| j| j | jdd| _d S )Nr!   r   Frf   )r.   rO   r'   r   rQ   cross_hidden_sizecross_num_attention_headsr   cross_num_key_value_headsr   r   cross_head_dimr   r   r   r   r   rh   r   r   r   r   r:   r'   r   r<   r>   r?   rO   V  s    


 zDiaCrossAttention.__init__Nro   cross_attention_statesr   r   r   rU   c                 K   sV  |j d d }g |d| jR }g |j d d d| jR }| ||dd}	|d ur7|j| jnd}
|d urP|
rP|jj	| j j
}|jj	| j j}n-| ||dd}| ||dd}|d ur}|j||| j\}}d|j| j< t| jjt}|| |	|||fd| ji|\}}|g |dR  }| |}||fS )NrV   r!   rW   FTr   )r^   r   r   r]   r   
is_updatedgetr   cross_attention_cachelayerskeysvaluesr   r   r   r   r   r'   r   r   r   r   r   r   )r:   ro   r   r   r   r   r   r   cross_shaper   r   r   r   r   r   r   r>   r>   r?   rb   i  sD   


zDiaCrossAttention.forwardNN)r@   rA   rB   rc   r#   r   rO   r2   rd   r
   r   r   r   rb   rL   r>   r>   r<   r?   r   S  s$    r   c                       sv   e Zd Zdedef fddZ		ddejdeejejf dB dejdB d	e	e
 d
eejejdB f f
ddZ  ZS )r*   r'   r   c                    sL   t    t|j|jd| _t||dd| _t|j|jd| _t	|| _
d S )Nrw   Fr   )r.   rO   ru   rQ   norm_epspre_sa_normr   self_attentionpost_sa_normre   mlpr   r<   r>   r?   rO     s
   
zDiaEncoderLayer.__init__Nro   r   r   r   rU   c           
      K   sZ   |}|  |}| j|f||d|\}}|| }|}| |}| |}	||	 }||fS )N)r   r   )r   r   r   r  )
r:   ro   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outr>   r>   r?   rb     s    



zDiaEncoderLayer.forwardr   )r@   rA   rB   r$   r   rO   r2   rd   r   r   r   rb   rL   r>   r>   r<   r?   r*     s    
r*   c                       sl   e Zd Zdef fddZee			ddejdejdB de	dB d	e	dB d
e
e deeB fddZ  ZS )
DiaEncoderr'   c                    sn   t     | _t j j| _t fddt	 j
D | _t j jd| _t d| _|   d S )Nc                       g | ]}t  |qS r>   )r*   .0r   r'   r>   r?   
<listcomp>      z'DiaEncoder.__init__.<locals>.<listcomp>r   r  )r.   rO   r'   r   rP   r6   rQ   	embedding
ModuleListrangenum_hidden_layersr   ru   r   normr   
rotary_emb	post_initrn   r<   r  r?   rO     s   zDiaEncoder.__init__NFr)   r   output_attentionsoutput_hidden_statesr   rU   c                 K   s   |  |}tj|jd |jdd d d f }t| j||d}| j||d}|r)dnd }	|r/dnd }
| jD ]"}|r=|	|f }	||f|||d|}|d }|rV|
|d f }
q4| 	|}|rc|	|f7 }	t
||	|
d	S )
NrV   r[   )r'   inputs_embedsr   r   r>   )r   r   r   r   r!   last_hidden_statero   
attentions)r  r2   r3   r^   r[   r   r'   r  r   r  r   )r:   r)   r   r  r  r   ro   r   r   encoder_statesall_attentionsencoder_layerlayer_outputsr>   r>   r?   rb     s@   

"



zDiaEncoder.forward)NFF)r@   rA   rB   r$   rO   r   r   r2   rd   r   r   r   r   r   rb   rL   r>   r>   r<   r?   r    s(    r  c                       s   e Zd Zdedef fddZ						ddejdeejejf dB dejdB d	ejdB d
ejdB de	dB dej
dB deejejdB ejdB f fddZ  ZS )r+   r'   r   c                    sr   t    |j| _t||dd| _t||| _t|j|j	d| _
t|j|j	d| _t|j|j	d| _t|| _d S )NTr   r   )r.   rO   rQ   	embed_dimr   r   r   cross_attentionru   r   r   pre_ca_normpre_mlp_normre   r  r   r<   r>   r?   rO     s   
zDiaDecoderLayer.__init__Nro   r   r   encoder_hidden_statesencoder_attention_maskr   r   rU   c                 K   s   |}	t |	tr
|	j}	|}
| |}| j||||	fd|i|\}}|
| }|}
| |}| j||f||d|\}}|
| }|}
| |}| |}|
| }|||fS )Nr   )r   r   )	r0   r
   self_attention_cacher   r   r#  r"  r$  r  )r:   ro   r   r   r%  r&  r   r   r   self_attn_cacher  r  r  r  cross_statescross_attn_weightsr  r>   r>   r?   rb     sB   








zDiaDecoderLayer.forward)NNNNNN)r@   rA   rB   r#   r   rO   r2   rd   r   r
   r   rb   rL   r>   r>   r<   r?   r+     s2    
r+   c                       s   e Zd ZdZdef fddZee								ddej	dej
dB d	ej	dB d
ejdB dej
dB dedB dedB dedB dej
dB deeB fddZ  ZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r'   c                    sp   t     j| _ j| _t | _t fddt j	D | _
t j jd| _t d| _|   d S )Nc                    r  r>   )r+   r	  r  r>   r?   r  E  r  z'DiaDecoder.__init__.<locals>.<listcomp>r   r  )r.   rO   r4   r6   r1   
embeddingsr   r  r  r  r   ru   rQ   r   r  r   r  r  rn   r<   r  r?   rO   ?  s   
zDiaDecoder.__init__NFr)   r   r   r%  r&  r   r  r  r   rU   c
                 K   s  |  dd \}}|dur| nd}|	du r#tj||| |jd}	|du r/|	dddf }| |}|du rHt sH|| }tj|||jd}t| j	|||	|d}t
| j	|||d}| j||d}|rfdnd}|rldnd}|rv|durvdnd}| jD ]1}|r||f7 }|||||f|||	|d	|
}|d }|r||d
 f }|dur||d f }q{| |}|r||f7 }t|||||dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrV   r   r  )r'   r  r   r   r   )r'   r  r   r%  r  r>   )r&  r   r   r   r!   rW   )r  r   ro   r  cross_attentions)sizeget_seq_lengthr2   r3   r[   r,  r   ry   r   r'   r   r  r   r  r   )r:   r)   r   r   r%  r&  r   r  r  r   r   
batch_size
seq_lengthpast_key_values_lengthro   mask_seq_lengthr   all_hidden_statesall_self_attnsall_cross_attentionslayerr   r>   r>   r?   rb   L  sz   




zDiaDecoder.forward)NNNNNFFN)r@   rA   rB   rc   r#   rO   r   r   r2   rd   r   rs   r
   r   r   r   rb   rL   r>   r>   r<   r?   r+  <  sD    	
r+  z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                       s   e Zd Zdef fddZee											ddejdB dejdB dejdB dejdB d	ejdB d
e	e
B dB dedB dedB dedB dedB dejdB de
eB fddZ  ZS )DiaModelr'   c                    s6   t  | || _t|j| _t|j| _| 	  d S r   )
r.   rO   r'   r  encoder_configencoderr+  decoder_configdecoderr  rn   r<   r>   r?   rO     s
   zDiaModel.__init__Nr)   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher  r  r   rU   c                 K   s  |du r|du rt d|	dur|	n| jj}	|
dur|
n| jj}
|dur&|n| jj}| jr9| jr9|r9td d}|rL|du rLt	t
| jdt
| jd}|du r^| jd|||	|
d|}n"t|tst|d t|dkrq|d ndt|d	kr||d	 ndd
}|d jd d| jjj}}}|du rtj|d|f| jjj| jd}|jd	kr||||dd	}| jd||||d |||	|
||d
|}t|j|j|j|j|j|d |j|jdS )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr  )r)   r   r  r  r   r!   rW   r  rV   )r.  
fill_valuer[   )
r)   r   r   r%  r&  r   r  r  rB  r   )r  r   decoder_hidden_statesdecoder_attentionsr-  encoder_last_hidden_stater%  encoder_attentionsr>   ) 
ValueErrorr'   r  r  rB  is_gradient_checkpointingr   loggerwarning_oncer
   r	   r;  r0   r   lenr^   r<  r4   r2   fullbos_token_idr[   ndimr   r   r=  r   r  r   ro   r  r-  )r:   r)   r   r>  r?  r@  rA  r   rB  r  r  r   r   bszr   channelsdecoder_outputsr>   r>   r?   rb     s|   '
 
zDiaModel.forward)NNNNNNNNNNN)r@   rA   rB   r"   rO   r   r   r2   r   r   r   r
   r   r   rb   rL   r>   r>   r<   r?   r9    sP    
	
r9  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       s   e Zd ZdZdZdef fddZee												dde	j
dB de	j
dB d	e	j
dB d
e	j
dB de	j
dB deeB dB dedB dedB dedB dedB de	j
dB de	j
dB deeB fddZ  ZS )DiaForConditionalGenerationr(   )audior'   c                    s`   t  | || _t|| _|jj| _|jj| _tj	|jj
| j| j dd| _d| _|   d S )NFrf   ForMaskedLM)r.   rO   r'   r9  r(   r<  r4   r6   r   rh   rQ   logits_dense	loss_typer  rn   r<   r>   r?   rO   3  s   


z$DiaForConditionalGeneration.__init__Nr)   r   r>  r?  r@  rA  r   rB  r  r  labelsr   rU   c                 K   s   | j d	|||||||||	|
|d|}|d }|jd }| ||d| j| jfdd || j d| j}d}|durM| jd	||| jd|}t	|||j
|j|j|j|j|j|jd	S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r)   r   r>  r?  r@  rA  r   rB  r  r  r   r   rV   r!   rW   N)logitsrX  r6   )	lossrY  r   rD  rE  r-  rF  r%  rG  r>   )r(   r^   rV  r]   r4   r6   r   r   loss_functionr   r   rD  rE  r-  rF  r%  rG  )r:   r)   r   r>  r?  r@  rA  r   rB  r  r  rX  r   r   outputsr  r0  audio_logitsrZ  r>   r>   r?   rb   B  sJ   ,
z#DiaForConditionalGeneration.forward)NNNNNNNNNNNN)r@   rA   rB   rD   output_modalitiesr"   rO   r   r   r2   r   r   r   r
   r   r   rb   rL   r>   r>   r<   r?   rS  *  sZ    
	
rS  )r9  r&   rS  )r!   )r   )Ncollections.abcr   typingr   r2   r    r   r7   activationsr   cache_utilsr   r	   r
   integrationsr   r   r   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr    configuration_diar"   r#   r$   generation_diar%   
get_loggerr@   rJ  r&   Moduler1   re   ru   r   r   r   rd   r   r   r   r   r   r   r*   r  r+   r+  r9  rS  __all__r>   r>   r>   r?   <module>   s   
A
AJ!C;qxj