o
    eiu                     @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- e .e/Z0eG dd deZ1G dd dej2Z3G dd de'Z4G dd de#Z5G dd de$Z6G dd  d e"Z7G d!d" d"ej2Z8G d#d$ d$eZ9G d%d& d&e1Z:G d'd( d(eZ;G d)d* d*e1Z<ed+d,G d-d. d.e1Z=ed/d,G d0d1 d1e1e-Z>g d2Z?dS )3zPyTorch Dia model.    )CallableN)nn   )initialization)DynamicCacheEncoderDecoderCache)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging   )LlamaAttentionLlamaRMSNormLlamaRotaryEmbeddingeager_attention_forward)Phi3MLP   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixinc                       sJ   e Zd ZU eed< dZdZdZdZdZ	dZ
dZddgZ fddZ  ZS )	DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerc                    sH   t  | t|tr"tj| jjtjd| jj	 }t
|j| d S d S )Ndtype)super_init_weights
isinstanceDiaMultiChannelEmbeddingtorcharanger#   num_channelslong
vocab_sizeinitcopy_offsets)selfmoduler5   	__class__ a/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dia/modular_dia.pyr+   =   s
   
z DiaPreTrainedModel._init_weights)__name__
__module____qualname__r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modulesr+   __classcell__r:   r:   r8   r;   r"   1   s   
 r"   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	r-   a  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r#   c                    s^   t    t|j|j |j| _|j| _|j| _tj	|jtj
d|j }| jd|dd d S )Nr(   r5   F)
persistent)r*   __init__r   	Embeddingr2   r0   hidden_sizeembedr.   r/   r1   register_buffer)r6   r#   r5   r8   r:   r;   rJ   R   s   
z!DiaMultiChannelEmbedding.__init__audio_codesreturnc                 C   sH   || j |j d}| ||jd |jd d| j}|jddS )Nr   r   r   )dim)	r5   todevicesqueezerM   viewshaperL   sum)r6   rO   tokensembedsr:   r:   r;   forwardZ   s   $z DiaMultiChannelEmbedding.forward)
r<   r=   r>   __doc__r   rJ   r.   Tensorr[   rH   r:   r:   r8   r;   r-   D   s    r-   c                   @      e Zd ZdS )DiaMLPNr<   r=   r>   r:   r:   r:   r;   r_   `       r_   c                   @   r^   )
DiaRMSNormNr`   r:   r:   r:   r;   rb   d   ra   rb   c                   @   r^   )DiaRotaryEmbeddingNr`   r:   r:   r:   r;   rc   h   ra   rc   c                   @   s,   e Zd ZdZd	deeB dedefddZdS )
DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperFr#   	layer_idx	is_causalc                 C   s   t j|  || _|| _|j| _| jj| _| jjp| j| _| j| j | _	t
|d|j| j | _d| _d| _|| _t j| j| j| j dd| _t j| j| j| j dd| _t j| j| j| j dd| _t j| j| j | jdd| _d S )Nhead_dimr           Fbias)r   ModulerJ   r#   rf   rL   num_attention_heads	num_headsnum_key_value_headsnum_key_value_groupsgetattrrh   scalingattention_dropoutrg   Linearq_projk_projv_projo_proj)r6   r#   rf   rg   r:   r:   r;   rJ   o   s   
 zDiaSelfAttention.__init__N)F)	r<   r=   r>   r\   r    r   intboolrJ   r:   r:   r:   r;   rd   l   s     rd   c                       st   e Zd ZdZdedef fddZ		ddejdejd	ejdB d
e	dB de
e deejejdB f fddZ  ZS )DiaCrossAttentionre   r#   rf   c                    s   t    || _|| _|j| _|j| _| jj| _| jj| _	| j| j	 | _
|j| _d| _d| _d| _tj| j| j| j dd| _tj| j| j	| j dd| _tj| j| j	| j dd| _tj| j| j | jdd| _d S )Nr   ri   Frj   )r*   rJ   r#   rf   rL   cross_hidden_sizecross_num_attention_headsrn   cross_num_key_value_headsro   rp   cross_head_dimrh   rr   rs   rg   r   rt   ru   rv   rw   rx   r6   r#   rf   r8   r:   r;   rJ      s    


 zDiaCrossAttention.__init__Nhidden_statescross_attention_statesattention_maskpast_key_valueskwargsrP   c                 K   sV  |j d d }g |d| jR }g |j d d d| jR }| ||dd}	|d ur7|j| jnd}
|d urP|
rP|jj	| j j
}|jj	| j j}n-| ||dd}| ||dd}|d ur}|j||| j\}}d|j| j< t| jjt}|| |	|||fd| ji|\}}|g |dR  }| |}||fS )NrQ   r   r   FTrr   )rW   rh   ru   rV   	transpose
is_updatedgetrf   cross_attention_cachelayerskeysvaluesrv   rw   updater   get_interfacer#   _attn_implementationr   rr   reshape
contiguousrx   )r6   r   r   r   r   r   input_shapehidden_shapecross_shapequery_statesr   
key_statesvalue_statesattention_interfaceattn_outputattn_weightsr:   r:   r;   r[      sD   


zDiaCrossAttention.forwardNN)r<   r=   r>   r\   r   ry   rJ   r.   r]   r   r   r
   tupler[   rH   r:   r:   r8   r;   r{      s$    r{   c                       sv   e Zd Zdedef fddZ		ddejdeejejf dB dejdB d	e	e
 d
eejejdB f f
ddZ  ZS )r&   r#   rf   c                    sL   t    t|j|jd| _t||dd| _t|j|jd| _t	|| _
d S )NepsFrg   )r*   rJ   rb   rL   norm_epspre_sa_normrd   self_attentionpost_sa_normr_   mlpr   r8   r:   r;   rJ      s
   
zDiaEncoderLayer.__init__Nr   position_embeddingsr   r   rP   c           
      K   sZ   |}|  |}| j|f||d|\}}|| }|}| |}| |}	||	 }||fS )N)r   r   )r   r   r   r   )
r6   r   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outr:   r:   r;   r[      s    



zDiaEncoderLayer.forwardr   )r<   r=   r>   r    ry   rJ   r.   r]   r   r   r
   r[   rH   r:   r:   r8   r;   r&      s    
r&   c                       sl   e Zd Zdef fddZee			ddejdejdB de	dB d	e	dB d
e
e deeB fddZ  ZS )
DiaEncoderr#   c                    sn   t     | _t j j| _t fddt	 j
D | _t j jd| _t d| _|   d S )Nc                       g | ]}t  |qS r:   )r&   .0rf   r#   r:   r;   
<listcomp>       z'DiaEncoder.__init__.<locals>.<listcomp>r   r   )r*   rJ   r#   r   rK   r2   rL   	embedding
ModuleListrangenum_hidden_layersr   rb   r   normrc   
rotary_emb	post_initr6   r#   r8   r   r;   rJ      s   zDiaEncoder.__init__NFr%   r   output_attentionsoutput_hidden_statesr   rP   c                 K   s   |  |}tj|jd |jdd d d f }t| j||d}| j||d}|r)dnd }	|r/dnd }
| jD ]"}|r=|	|f }	||f|||d|}|d }|rV|
|d f }
q4| 	|}|rc|	|f7 }	t
||	|
d	S )
NrQ   rT   )r#   inputs_embedsr   position_idsr:   )r   r   r   r   r   last_hidden_stater   
attentions)r   r.   r/   rW   rT   r   r#   r   r   r   r   )r6   r%   r   r   r   r   r   r   r   encoder_statesall_attentionsencoder_layerlayer_outputsr:   r:   r;   r[      s@   

"



zDiaEncoder.forward)NFF)r<   r=   r>   r    rJ   r   r   r.   r]   rz   r   r
   r   r   r[   rH   r:   r:   r8   r;   r      s(    r   c                       s   e Zd Zdedef fddZ						ddejdeejejf dB dejdB d	ejdB d
ejdB de	dB dej
dB deejejdB ejdB f fddZ  ZS )r'   r#   rf   c                    sr   t    |j| _t||dd| _t||| _t|j|j	d| _
t|j|j	d| _t|j|j	d| _t|| _d S )NTr   r   )r*   rJ   rL   	embed_dimrd   r   r{   cross_attentionrb   r   r   pre_ca_normpre_mlp_normr_   r   r   r8   r:   r;   rJ   1  s   
zDiaDecoderLayer.__init__Nr   r   r   encoder_hidden_statesencoder_attention_maskr   cache_positionrP   c                 K   s   |}	t |	tr
|	j}	|}
| |}| j||||	fd|i|\}}|
| }|}
| |}| j||f||d|\}}|
| }|}
| |}| |}|
| }|||fS )Nr   )r   r   )	r,   r   self_attention_cacher   r   r   r   r   r   )r6   r   r   r   r   r   r   r   r   self_attn_cacher   r   r   r   cross_statescross_attn_weightsr   r:   r:   r;   r[   ;  sB   








zDiaDecoderLayer.forward)NNNNNN)r<   r=   r>   r   ry   rJ   r.   r]   r   r   
LongTensorr[   rH   r:   r:   r8   r;   r'   0  s2    
r'   c                       s   e Zd ZdZdef fddZee								ddej	dej
dB d	ej	dB d
ejdB dej
dB dedB dedB dedB dej
dB deeB fddZ  ZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r#   c                    sp   t     j| _ j| _t | _t fddt j	D | _
t j jd| _t d| _|   d S )Nc                    r   r:   )r'   r   r   r:   r;   r   t  r   z'DiaDecoder.__init__.<locals>.<listcomp>r   r   )r*   rJ   r0   r2   r-   
embeddingsr   r   r   r   r   rb   rL   r   r   rc   r   r   r   r8   r   r;   rJ   n  s   
zDiaDecoder.__init__NFr%   r   r   r   r   r   r   r   r   rP   c
                 K   s  |  dd \}}|dur| nd}|	du r#tj||| |jd}	|du r/|	dddf }| |}|du rHt sH|| }tj|||jd}t| j	|||	|d}t
| j	|||d}| j||d}|rfdnd}|rldnd}|rv|durvdnd}| jD ]1}|r||f7 }|||||f|||	|d	|
}|d }|r||d
 f }|dur||d f }q{| |}|r||f7 }t|||||dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrQ   r   r   )r#   r   r   r   r   )r#   r   r   r   r   r:   )r   r   r   r   r   r   )r   r   r   r   cross_attentions)sizeget_seq_lengthr.   r/   rT   r   r   onesr	   r#   r   r   r   r   r   )r6   r%   r   r   r   r   r   r   r   r   r   
batch_size
seq_lengthpast_key_values_lengthr   mask_seq_lengthr   all_hidden_statesall_self_attnsall_cross_attentionslayerr   r:   r:   r;   r[   {  sz   




zDiaDecoder.forward)NNNNNFFN)r<   r=   r>   r\   r   rJ   r   r   r.   r]   r   FloatTensorr   rz   r   r   r[   rH   r:   r:   r8   r;   r   k  sD    	
r   z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                       s   e Zd Zdef fddZee											ddejdB dejdB dejdB dejdB d	ejdB d
e	e
B dB dedB dedB dedB dedB dejdB de
eB fddZ  ZS )DiaModelr#   c                    s6   t  | || _t|j| _t|j| _| 	  d S )N)
r*   rJ   r#   r   encoder_configencoderr   decoder_configdecoderr   r   r8   r:   r;   rJ     s
   zDiaModel.__init__Nr%   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher   r   r   rP   c                 K   s  |du r|du rt d|	dur|	n| jj}	|
dur|
n| jj}
|dur&|n| jj}| jr9| jr9|r9td d}|rL|du rLt	t
| jdt
| jd}|du r^| jd|||	|
d|}n"t|tst|d t|dkrq|d ndt|d	kr||d	 ndd
}|d jd d| jjj}}}|du rtj|d|f| jjj| jd}|jd	kr||||dd	}| jd||||d |||	|
||d
|}t|j|j|j|j|j|d |j|jdS )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   )r%   r   r   r   r   r   r   r   rQ   )r   
fill_valuerT   )
r%   r   r   r   r   r   r   r   r   r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentionsr:   ) 
ValueErrorr#   r   r   r   is_gradient_checkpointingtrainingloggerwarning_oncer   r   r   r,   r   lenrW   r   r0   r.   fullbos_token_idrT   ndimr   r   r   r   r   r   r   r   r   )r6   r%   r   r   r   r   r   r   r   r   r   r   r   bszseq_lenchannelsdecoder_outputsr:   r:   r;   r[     s|   '
 
zDiaModel.forward)NNNNNNNNNNN)r<   r=   r>   r   rJ   r   r   r.   r   r   r   r   rz   r   r[   rH   r:   r:   r8   r;   r     sP    
	
r   zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       s   e Zd ZdZdZdef fddZee												dde	j
dB de	j
dB d	e	j
dB d
e	j
dB de	j
dB deeB dB dedB dedB dedB dedB de	j
dB de	j
dB deeB fddZ  ZS )DiaForConditionalGenerationr$   )audior#   c                    s`   t  | || _t|| _|jj| _|jj| _tj	|jj
| j| j dd| _d| _|   d S )NFrj   ForMaskedLM)r*   rJ   r#   r   r$   r   r0   r2   r   rt   rL   logits_dense	loss_typer   r   r8   r:   r;   rJ   b  s   


z$DiaForConditionalGeneration.__init__Nr%   r   r   r   r   r   r   r   r   r   labelsr   rP   c                 K   s   | j d	|||||||||	|
|d|}|d }|jd }| ||d| j| jfdd || j d| j}d}|durM| jd	||| jd|}t	|||j
|j|j|j|j|j|jd	S )
a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r%   r   r   r   r   r   r   r   r   r   r   r   rQ   r   r   N)logitsr  r2   )	lossr  r   r   r   r   r   r   r   r:   )r$   rW   r  rV   r0   r2   r   r   loss_functionr   r   r   r   r   r   r   r   )r6   r%   r   r   r   r   r   r   r   r   r   r  r   r   outputsr   r   audio_logitsr  r:   r:   r;   r[   q  sJ   ,
z#DiaForConditionalGeneration.forward)NNNNNNNNNNNN)r<   r=   r>   r@   output_modalitiesr   rJ   r   r   r.   r   r   r   r   rz   r   r[   rH   r:   r:   r8   r;   r   Y  sZ    
	
r   )r   r"   r   )@r\   collections.abcr   r.   r    r   r3   cache_utilsr   r   masking_utilsr   r	   modeling_flash_attention_utilsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   llama.modeling_llamar   r   r   r   phi3.modeling_phi3r   configuration_diar   r   r    generation_diar!   
get_loggerr<   r   r"   rl   r-   r_   rb   rc   rd   r{   r&   r   r'   r   r   r   __all__r:   r:   r:   r;   <module>   sN   
J!C;qxj