o
    ei                     @   s0  d dl mZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 e1e2Z3eeddG dd deZ4G dd de)Z5G dd de*Z6G dd de'Z7G dd  d e$Z8G d!d" d"e%Z9ed#deG d$d% d%eZ:eG d&d' d'e(e:Z;G d(d) d)ej<Z=ed*dG d+d, d,e&eZ>G d-d. d.ej<Z?eG d/d0 d0e(Z@ed1dG d2d3 d3e:e0ZAg d4ZBdS )5    )	dataclassN   )initialization)CacheDynamicCache)GenerationMixin)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)ModelOutputauto_docstringcan_return_tuplelogging)merge_with_config_defaults)is_torchdynamo_compiling)capture_outputs   )	AutoModel)LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRMSNormLlamaRotaryEmbeddingTransformersKwargs   )	CsmConfigCsmDepthDecoderConfig)CsmGenerationMixinz:
    Base class for the model autoregressive outputs.
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeejdf dB ed< dZeejdf dB ed< dZejdB ed	< dZejdB ed
< dZe
dB ed< dZeejdf dB ed< dZeejdf dB ed< dZejdB ed< dS )CsmOutputWithPasta	  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the depth decoder model.
    depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
    depth_decoder_past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
    depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the backbone model.
    Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss)__name__
__module____qualname____doc__r$   torchFloatTensor__annotations__r%   r&   r   r'   tupler(   r)   r*   r+   r,   r-   r.    r7   r7   a/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/csm/modular_csm.pyr#   1   s   
 r#   c                   @      e Zd ZdS )
CsmRMSNormNr/   r0   r1   r7   r7   r7   r8   r:   b       r:   c                   @   r9   )CsmRotaryEmbeddingNr;   r7   r7   r7   r8   r=   f   r<   r=   c                   @   r9   )CsmMLPNr;   r7   r7   r7   r8   r>   j   r<   r>   c                   @   r9   )CsmAttentionNr;   r7   r7   r7   r8   r?   n   r<   r?   c                   @   r9   )CsmDecoderLayerNr;   r7   r7   r7   r8   r@   r   r<   r@   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    c                       s`   e Zd ZU eed< dZdZdZdgZdgZ	dZ
dZdZdZeedZe  fdd	Z  ZS )
CsmPreTrainedModelconfigmodel)audiotextTr@   r&   )r'   r(   c                    sz   t  | t|tr$|j}t|d D ]}tj|jd| j	j
d qd S t|tr;t|jt| j	j| j	j  d S d S )Nr   g        )meanstd)super_init_weights
isinstanceCsmCodebooksHeadnum_codebooksrangeinitnormal_weightrB   initializer_rangeCsmBackboneModelEmbeddingscopy_audio_tokens_offsetsr3   arange
vocab_size)selfmodulerL   i	__class__r7   r8   rI      s   

$z CsmPreTrainedModel._init_weights)r/   r0   r1   r   r5   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr@   r?   _can_record_outputsr3   no_gradrI   __classcell__r7   r7   rZ   r8   rA   v   s    
 rA   c                       s   e Zd ZU eed<  fddZeee								dde	j
dB de	jdB de	jdB de	j
dB d	edB d
e	jdB dedB de	j
dB dee deeB fddZ  ZS )CsmDepthDecoderModelrB   c                    s>   t  | t|j|j |j| _tj|j|j	dd| _
d S NF)bias)rH   __init__nn	EmbeddingrL   rV   backbone_hidden_sizeembed_tokensLinearhidden_sizeinputs_embeds_projectorrW   rB   rZ   r7   r8   rk      s   zCsmDepthDecoderModel.__init__N	input_idsbackbone_last_hidden_stateattention_maskposition_idsr&   inputs_embeds	use_cachecache_positionkwargsreturnc	              
   K   s  |durt  std d}|du |duA rtd|r&|du r&t| jd}|du rV|dur2| nd}
|dur=|jd n|jd }|durI|jn|j}t	j
|
|
| |d}|du rt	j|d dd}|| j }| || }|d dk}|dur||dddf< n
t  s|rtd	 | |}t| j|||||d
}|}|d}| j||d}| jd| jj D ]}||f||||||d|	}q| |}t||r|dS ddS )aJ  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        NzCustom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines position_ids from `cache_position` and as it requires them to be identical across the batch, the provided position_ids will be ignored.z;You must specify exactly one of input_ids or inputs_embeds.)rB   r   r   device)minzvWhen the first codebook token is provided, `backbone_last_hidden_state` should also be provided for correct inference.)rB   rx   rv   rz   r&   rw   )rw   )rv   rw   r&   ry   rz   position_embeddings)last_hidden_stater&   )r   loggerwarning_once
ValueErrorr   rB   get_seq_lengthshaper~   r3   rU   clamprV   ro   warningrr   r   	unsqueeze
rotary_emblayersnum_hidden_layersnormr	   )rW   rt   ru   rv   rw   r&   rx   ry   rz   r{   past_seen_tokensinputs_seq_lengthr~   codebook_idxsoffsetinput_ids_are_first_codebookcausal_maskr'   r   decoder_layerr7   r7   r8   forward   sr   


	

zCsmDepthDecoderModel.forward)NNNNNNNN)r/   r0   r1   r    r5   rk   r   r   r   r3   
LongTensorr4   Tensorr   boolr   r   r6   r	   r   rg   r7   r7   rZ   r8   rh      sF   
 	
rh   c                       s&   e Zd Z fddZdddZ  ZS )rK   c                    s0   t    || _tt| jd ||| _d S )Nr   )rH   rk   rL   rl   	Parameterr3   emptyrP   )rW   rq   rL   rV   rZ   r7   r8   rk      s   
 zCsmCodebooksHead.__init__Nc                    sf   |d u rj d }| jt|  n	|d }| j|   fddt j d D tjddS )Nr   c              	      s2   g | ]}t jd d |d d f  | jqS N)rl   
functionallinearT).0codebook_idxcodebook_weightr'   r7   r8   
<listcomp>	  s    $z,CsmCodebooksHead.forward.<locals>.<listcomp>r   dim)r   rP   r3   rU   rM   stack)rW   r'   rz   
seq_lengthr   r7   r   r8   r     s   

zCsmCodebooksHead.forwardr   r/   r0   r1   rk   r   rg   r7   r7   rZ   r8   rK      s    rK   a$  
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    c                       s  e Zd ZdZdZdZ fddZ				ddejde	dB dejdB dej
dB dejdB f
 fd	d
Zee										ddejdB dej
dB dejdB dejdB de	dB dej
dB dejdB dedB dejdB deejB dee deeB fddZ  ZS )CsmDepthDecoderForCausalLMNc                    s2   t  | | `t|j|j|j| _t|| _	d S r   )
rH   rk   lm_headrK   rq   rL   rV   codebooks_headrh   rC   rs   rZ   r7   r8   rk     s   z#CsmDepthDecoderForCausalLM.__init__rt   r&   rv   rx   rz   c           	         sH   t  j|||||fi |}|d d dk}|s|d |d |S )Nrz   r   ru   rw   )rH   prepare_inputs_for_generationpop)	rW   rt   r&   rv   rx   rz   r{   model_inputsis_first_generation_steprZ   r7   r8   r   $  s   	


z8CsmDepthDecoderForCausalLM.prepare_inputs_for_generationr   ru   rw   labelsry   logits_to_keepr{   r|   c                 K   s   | j d||||||||	d|}|d }t|
tr+|
dkr$tdd}n	t|
 d}n|
}| |dd|ddf |	durA|	| nd}| }d}|durg|dddf  }| jd|d| jj|d|}t	|||j
|j|jdS )	a  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )rt   ru   rv   rw   r&   rx   ry   rz   r   r   N.)r%   r   rV   shift_labels)r$   r%   r&   r'   r(   r7   )rC   rJ   intslicer   
contiguousloss_functionrB   rV   r
   r&   r'   r(   )rW   rt   ru   rv   rw   r&   rx   r   ry   rz   r   r{   outputsr'   slice_indicesr%   r$   r   r7   r7   r8   r   :  sJ   	
&z"CsmDepthDecoderForCausalLM.forwardNNNN)
NNNNNNNNNr   )r/   r0   r1   _tied_weights_keys_tp_plan_pp_planrk   r3   r   r   r4   r   r   r   r   r   r   r   r   r6   r
   r   rg   r7   r7   rZ   r8   r     sr    		
r   c                       s$   e Zd Z fddZdd Z  ZS )rR   c                    sD   t    t|j|j |j| _| jdt	
|j|j dd d S )NrT   F)
persistent)rH   rk   rl   rm   rL   rV   rq   embed_audio_tokensregister_bufferr3   rU   rs   rZ   r7   r8   rk     s
   

z#CsmBackboneModelEmbeddings.__init__c                 C   s    |  || j }|jdd}|S )Nr   r   )r   rT   sum)rW   rt   rx   r7   r7   r8   r     s   z"CsmBackboneModelEmbeddings.forwardr   r7   r7   rZ   r8   rR     s    rR   c                       s4   e Zd Z fddZeee fddZ  ZS )CsmBackboneModelc                    s   t  | t|| _d S r   )rH   rk   rR   ro   rs   rZ   r7   r8   rk     s   zCsmBackboneModel.__init__c                    s   t  jdi |S )a&  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nr7   )rH   r   )rW   super_kwargsrZ   r7   r8   r     s   zCsmBackboneModel.forward)	r/   r0   r1   rk   r   r   r   r   rg   r7   r7   rZ   r8   r     s    r   z
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    c                       s~  e Zd ZddiZ fddZdd Zdd Ze fd	d
Z fddZ					d"de
jdB de
jdB de
jdB de
jdB de
jdB f
ddZ				d"de
jdedB de
jdB de
jdB de
jdB f
 fddZee											d#de
jdB de
jdB de
jdB de
jdB de
jdB dedB de
jdB de
jdB dedB de
jdB dee
jB dee deeB fd d!Z  ZS )$CsmForConditionalGenerationz5backbone_model.embed_tokens.embed_audio_tokens.weightz'depth_decoder.model.embed_tokens.weightc                    sp   t  | |j| _tj|j|jdd| _t|j|j| _	t
|| _t|j| _t|j| _|   d S ri   )rH   rk   rV   rl   rp   rq   r   rm   text_vocab_sizeembed_text_tokensr   _from_configbackbone_modelr   depth_decoder_configdepth_decoderr   from_configcodec_configcodec_model	post_initrs   rZ   r7   r8   rk     s   z$CsmForConditionalGeneration.__init__c                 C   s   | j jS r   r   ro   )rW   r7   r7   r8   get_input_embeddings  s   z0CsmForConditionalGeneration.get_input_embeddingsc                 C   s   || j _d S r   r   )rW   valuer7   r7   r8   set_input_embeddings  s   z0CsmForConditionalGeneration.set_input_embeddingsc                    s   | ddrt j|i |\}}n	t j|i |}d t  fddt|j D }t|jjddi| |D ]
}t	|j |  q?d|v rR||fS |S )Noutput_loading_infoFdepth_decoder_c                    s(   i | ]\}}|  r|d  |qS r   )
startswith)r   attrr   prefix
prefix_lenr7   r8   
<dictcomp>  s    z?CsmForConditionalGeneration.from_pretrained.<locals>.<dictcomp>_from_model_config)
getrH   from_pretrainedlenvarsgeneration_configitemsr   updatedelattr)clsargsr{   rC   loading_infodepth_decoder_attrsr   rZ   r   r8   r     s   z+CsmForConditionalGeneration.from_pretrainedc                    sV   d}| j j }|dd  | D ]\}}t| j|| | qt j|i | d S )Nr   transformers_version)r   r   to_diff_dictr   r   setattrrH   save_pretrained)rW   r   r{   r   r   r   r   rZ   r7   r8   r     s   z+CsmForConditionalGeneration.save_pretrainedNrt   input_valuesinput_values_cutoffsr   r|   c                    sF  |  |}|durtj|d}||dk  }||dk }tj| |jd	t
|d}||dk }t j g }t||D ]?\}	}
|
|
dk }
t|
jd d D ]+}|
| }|
|d  }|	d||f }| j|d}|jdd}||d  qUqBtdd	 |D  t fd
d|D }| j|}W d   n1 sw   Y  | jj}||k}| j|}|| ||< tjdd| jjf|jtjd| jj }| j|d}|| jj k}|!|" d||< |dur|d!dd| jj}|| ||< |||< |dkj#dd}d||d |d ddf< |}||dS )a  
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retrieve codebook token.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and position the target codebook tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        Nr   r   r   r}   r   .c                 s   s    | ]}|j d  V  qdS )r   N)r   r   elr7   r7   r8   	<genexpr>  s    zQCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<genexpr>c                    s,   g | ]}t j|d d d  |jd   fqS )r   )rl   r   padr   r   max_audio_framesr7   r8   r     s   , zRCsmForConditionalGeneration._merge_input_ids_with_input_values.<locals>.<listcomp>)r~   dtypeiTas_tuple)rx   r   )$r   rl   r   r   diffr3   rU   maxr~   expandr   r   rf   ziprM   r   r   encodeaudio_codes	transposeappendr   get_audio_codes_maskrB   audio_token_idr   ro   onesrL   longcodebook_eos_token_idsqueezeaudio_eos_token_idrepeatr   nonzero)rW   rt   r   r   r   rx   audio_lengthsinput_values_maskaudio_tokens_listbatch_input_valuesbatch_input_values_cutoffsrY   	start_idxend_idxaudio_batchcodec_outputscodebook_idsbatched_audio_token_idsaudio_codes_maskr  audio_token_maskaudio_embedsaudio_eos_frame_idsaudio_eos_embedsaudio_eos_token_masklabels_expanded depth_decoder_ignore_frames_idxsr7   r   r8   "_merge_input_ids_with_input_values  s\   




z>CsmForConditionalGeneration._merge_input_ids_with_input_valuesr&   rv   rx   rz   c           	         s   t  jd	|||||d|}|d ur>|jdkr>|dd u r>| j||d|d|dd}||d |d d d |S )
N)rt   r&   rv   rx   rz   r   rx   r   r   r   )rt   r   r   r   )rx   r   rt   r7   )rH   r   ndimr   r  r   )	rW   rt   r&   rv   rx   rz   r{   r   merged_inputsrZ   r7   r8   r   7  s(   	 	z9CsmForConditionalGeneration.prepare_inputs_for_generationr   rw   ry   r   r{   c                 K   s  |dur|j dkr| ||||}|d }|d }d}| jd||||||	|
d|}|d }t|tr:t| dn|}| |dd|ddf }d}d}d}d}|dur|dddddf }| jd||| jj	d|}|ddddddf d	kj
d
d }|| dd| jjd f }tjj|ddd}|jdd}||d |d d ddf }|| }| jd|||	d|d|}|j}|| }t|||||j|j|j|dur|jnd|dur|jnd|dur|jnd|dur|jdS ddS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
            If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
            where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
            the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
            Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frames (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support another value than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        ```Nr   rx   r   )rt   rv   rw   r&   rx   ry   rz   r   )r%   r   rV   r   r   r   r   .r   )r   Tr   )rt   ru   ry   return_dictr   )r$   r.   r)   r%   r&   r'   r(   r*   r+   r,   r-   r7   )r  r  r   rJ   r   r   r   r   rB   rV   allrL   rl   r   r   r	  r   r$   r#   r&   r'   r(   r%   )rW   rt   r   rv   r   rw   r&   rx   r   ry   rz   r   r{   r  backbone_outputsbackbone_hidden_statesr   backbone_logitsr$   r.   r)   depth_decoder_outputsbackbone_labels
train_maskdepth_decoder_input_ids
train_idxsbackbone_last_hidden_statesdepth_decoder_labelsr7   r7   r8   r   V  s   S
(	z#CsmForConditionalGeneration.forwardr   )NNNNNNNNNNr   )r/   r0   r1   r   rk   r   r   classmethodr   r   r3   r   r  r   r   r4   r   r   r   r   r   r   r   r6   r#   r   rg   r7   r7   rZ   r8   r     s    

U	
r   )rA   r   rh   r   r   )Cdataclassesr   r3   torch.nnrl    r   rN   cache_utilsr   r   
generationr   masking_utilsr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   autor   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_csmr   r    generation_csmr!   
get_loggerr/   r   r#   r:   r=   r>   r?   r@   rA   rh   ModulerK   r   rR   r   r   __all__r7   r7   r7   r8   <module>   sf   (

+`f  M