o
    eiS                     @   s   d dl mZ d dlmZmZ d dlZd dlmZ ddl	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ e e!Z"G dd deZ#dS )    )Callable)AnyOptionalN   )(DiaClassifierFreeGuidanceLogitsProcessor"DiaEOSChannelFilterLogitsProcessor!DiaEOSDelayPatternLogitsProcessorLogitsProcessorListTemperatureLogitsWarper)StoppingCriteriaList)BaseStreamer)GenerateOutputGenerationConfigGenerationMixinGenerationMode)is_deepspeed_zero3_enabled)is_fsdp_managed_module)PreTrainedModel)loggingc                       s  e Zd ZdZ								d,dededB dejdB deeej	ge
e f dB dedB dedB deeef dB d	ej	dB d
ej	dB def fddZdedB dedeeef f fddZ			d-dej	dB dej	dB deeej	f dB deej	edB eeej	f f f fddZ	d.dededeeej	f dej	dejdB deejeeej	f f fddZ		d/ fdd	Zedej	dedej	dB dej	fdd Z											d0dej	dB dedB dedB d!edB deeej	ge
e f dB d"edB d#ed$ d%ed& d	ej	dB d
ej	dB d'edB fd(d)Ze 											d0dej	dB dedB dedB d!edB deeej	ge
e f dB d"edB d#ed$ d%ed& d	ej	dB d
ej	dB d'edB deejB fd*d+Z  ZS )1DiaGenerationMixinNgeneration_configinput_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnlogits_processordevicemodel_kwargsnegative_prompt_idsnegative_prompt_attention_maskreturnc
                    s   |j }
|j}d |_ d |_t }|d ur|dkr|t| |tt| jj| jj	j
d t j|||d |||||	d	}|
d urR|
dkrRt|
|jd}|d| |t| jj| jj	j
|j|d |
|_ ||_|S )N      ?)num_channelseos_token_id	r   r   r   r   r   r   r   r   r      )guidance_scaleguidance_top_kr   )delay_patternr"   max_generation_lenr   )r%   temperaturer	   appendr
   r   lenconfigr'   decoder_configr"   super_get_logits_processorr   top_kinsertr   
max_length)selfr   r   r   r   r   r   r   r   r   original_guidance_scaleoriginal_temperaturecustom_processorsmerged_processorscfg_processor	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dia/generation_dia.pyr/   ,   sR   

z(DiaGenerationMixin._get_logits_processorkwargsc                    sz   t  j|fi |\}}|jd ur#|jdk r#td|j d d|_| jt| jj7  _|j	d uo7|j	dk| _
||fS )Nr    zAtemperature < 1.0 is not supported for Dia; clamping to 1.0 (got )r$   )r.   _prepare_generation_configr)   loggerwarning_oncer2   maxr,   r'   r%   	_uses_cfg)r3   r   r=   r   r9   r;   r<   r?   o   s   z-DiaGenerationMixin._prepare_generation_configinputsbos_token_idc                    sh   t  j|||d\}}}| jr/t|}tj||gdd}|dd d ur/|d dd|d< |||fS )N)rD   rE   r   r   dimattention_mask   r$   )r.   _prepare_model_inputsrC   torch
zeros_likecatgetrepeat)r3   rD   rE   r   
input_nameunconditioned_inputsr9   r;   r<   rJ      s   

z(DiaGenerationMixin._prepare_model_inputs
batch_sizemodel_input_namedecoder_start_token_idc                 C   sT  d }}|durd|v r| d}|durd|v r| d}|du s&|du rctd|du d|du d | jjj}| jrB|d n|}	|du rUtj|	d|f|tj	|d	}tj
|	|jd ftj	|d
}|	 }
|jd |dddddf | jjjkjdd  }|
ddd|f dd	 }|ddd|f 	 }||d< |
|d< ||fS )zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNdecoder_input_idsdecoder_attention_maskz[In order to generate with Dia, we need the processed audio input: Got `decoder_input_ids`: z" and got `decoder_attention_mask`=z]. This can be achieved via the [`DiaProcessor`] but now defaulting to non-delayed generation.rI   r$   )dtyper   )sizerW   r   r   rF   decoder_delay_mask)popr@   rA   r,   r-   r!   rC   rK   fulllongonesshapepad_token_idsumrB   	transpose)r3   rR   rS   r   rT   r   rU   rV   r!   real_batch_size
delay_maskvalid_input_sizer;   r;   r<   )_prepare_decoder_input_ids_for_generation   s@   



* z<DiaGenerationMixin._prepare_decoder_input_ids_for_generationc           	         s"  | j r|d jd d n|d jd }||| jjjddd}t j|fd|i|}| 	|| jjj
||d< |ddr_|d	 d dkr_|d d d dd d f d d d d d f |d< |d  |d< | j rd
D ]"}||d d urtdgdg|| jd   }|| j| ||< ql|S )Nr   rI   rY   r$   encoder_outputsrU   	use_cacheFcache_position)rU   rV   decoder_position_ids)rC   r_   reshaper,   r-   r!   rb   r.   prepare_inputs_for_generationapply_delay_maskr`   rN   
contiguoustuplendimrO   )	r3   	input_idsrg   rZ   r=   rR   model_inputskeyrepeat_patternr9   r;   r<   rl      s    &0z0DiaGenerationMixin.prepare_inputs_for_generationrq   pad_idrd   c                 C   s   |d u r| S t | jd |jd }|d d d |d d f }| d d d |d d f }t||k||| d d d |d d f< | S )Nr$   )minr_   rK   where)rq   ru   rd   mask_len
valid_maskvalid_inputr;   r;   r<   rm      s   (z#DiaGenerationMixin.apply_delay_maskstopping_criteriasynced_gpusassistant_modelr   streamerr   custom_generatec                 K   s  |  |||||}| j|fi |\}}||}|tjtjfvr%td| |  | 	||| |d u rDt
 s>t| oCt dk}|d urJ|nt }|d urS|nt }|dd d u}| ||j|\}}}|jd }|j}| j|||d d|vr| ||||}| j||||j|jd\}}|jr| ||d}|d ur||  |jd	 }|d
d u o|jd u}|dd u o|jd u}| j||||||d}|   rd|vrd|d< | !||| |jd }|jd |kr|dkr| j"j#s||jd 7 }| $||||| | j%||||||j||	|
d	}| j&|||dd}|j'|d< |(d	|jd	 }|j)dkr>td| j*|f|||d||S )NzGot incompatible mode for generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.r$   rH   r   )r   rg   )rR   rS   r   rT   r   	tokenizerrY   r2   
min_length)r   has_default_max_lengthhas_default_min_lengthrS   inputs_tensorinput_ids_lengthlogits_to_keepinputs_embedsr#   )r   r{   r   rh   z2`num_return_sequences>1` is incompatible with Dia.)r   r{   r   )+_extract_generation_mode_kwargsr?   get_generation_moder   SAMPLEGREEDY_SEARCH
ValueError_validate_model_kwargscopy_validate_generation_moder   r   distget_world_sizer	   r   rN   rJ   rE   r_   r   _prepare_special_tokens._prepare_encoder_decoder_kwargs_for_generationrf   _decoder_start_token_tensortoken_healingheal_tokensputcpur2   r   _prepare_generated_length_supports_logits_to_keep_validate_generated_lengthr,   is_encoder_decoder_prepare_cache_for_generationr/   _get_stopping_criteriarh   rk   num_return_sequences_sample)r3   rD   r   r   r{   r   r|   r}   r~   r   r   r   r=   generation_mode_kwargsr   generation_modekwargs_has_attention_maskr   rS   rR   r   rq   r   r   r   max_cache_lengthprepared_logits_processorprepared_stopping_criteriar;   r;   r<   _main_generate_loop   s   








z&DiaGenerationMixin._main_generate_loopc                 K   s   | d}|d ur| }| jd|||||||||	|
|d|}t|tj }|r.|j}n|}| jjj	}|j
d | }|||ddd}| || jjj|}|rX||_|S |}|S )NrU   )rD   r   r   r{   r   r|   r}   r~   r   r   r   r   rY   r$   rI   r;   )rN   cloner   
isinstancerK   Tensor	sequencesr,   r-   r!   r_   rk   rb   rm   r`   )r3   rD   r   r   r{   r   r|   r}   r~   r   r   r   r=   rd   outputreturn_dict_in_generateoutput_sequencesr!   bszr;   r;   r<   generate  s>   

zDiaGenerationMixin.generate)NNNNNNNN)NNN)N)NN)NNNNNNNNNNN) __name__
__module____qualname__rC   r   intrK   
LongTensorr   r   listr	   strdictr   r/   ro   r?   rJ   r   rf   rl   staticmethodrm   r   boolr   r   no_gradr   r   __classcell__r;   r;   r9   r<   r   (   s   	
C

6%&	

 	
r   )$collections.abcr   typingr   r   rK   torch.distributeddistributedr   generation.logits_processr   r   r   r	   r
   generation.stopping_criteriar   generation.streamersr   generation.utilsr   r   r   r   integrations.deepspeedr   integrations.fsdpr   modeling_utilsr   utilsr   
get_loggerr   r@   r   r;   r;   r;   r<   <module>   s   
