o
    Ni                     @   s  d dl mZmZmZmZmZmZmZmZm	Z	 d dl
Zd dlZd dlZd dlZd dlmZmZmZ d dlZd dlmZmZmZmZ d dlmZ d dlmZ d dlZdd ZG dd	 d	eZG d
d deZ dd Z!e"dkrd dl m	Z	 d dl m#Z#m$Z$ e j%e	_%e j&e	_&e$j'dej(dZ)e#'dZ*e)+dZ)e), Z)dZ-e*e-dddj.Z.e.+dZ.e/ E e)j%e.dddddddd d	Z0e1e*j2e0dd e)j%e.dddddddd dd 
Z3d!Z4e3D ]Z5e*j2e5ddZ6e4e67 Z4qe1e4 W d   dS 1 sw   Y  dS dS )"    )	GenerationConfigGenerationMixinLogitsProcessorListStoppingCriteriaListDisjunctiveConstraintBeamSearchScorerPhrasalConstraintConstrainedBeamSearchScorerPreTrainedModelN)GenerateOutputSampleOutputlogger)CallableListOptionalUnion)nnc                 C   sP   | dkrd S t |  t j rt j|  tj|  t|  dt jj	_
d S )NT)torchmanual_seedcudais_availablemanual_seed_allnprandomseedbackendscudnndeterministic)r    r   V/home/ubuntu/.local/lib/python3.10/site-packages/transformers_stream_generator/main.py
setup_seed   s   


r!   c                       s   e Zd Z fddZ  ZS )StreamGenerationConfigc                    s$   t  jdi | |dd| _d S )N	do_streamFr   )super__init__popr#   )selfkwargs	__class__r   r    r%   $   s   zStreamGenerationConfig.__init__)__name__
__module____qualname__r%   __classcell__r   r   r)   r    r"   #   s    r"   c                   @   s"  e Zd Ze 							ddeej dee dee dee	 dee
eejgee f  d	ee d
eeejf fddZe 											ddejdee dee	 dee dee dee deeeee f  dee dee dee dee d	ee d
eeejf fddZdS )NewGenerationMixinNFr   inputsgeneration_configlogits_processorstopping_criteriaprefix_allowed_tokens_fnsynced_gpusreturnc           %         s
  t | |    du r&| jjr#t| j}	|	| jkr#td |	| _| j t	
   jdCi |}
|dur9|nt }|durB|nt } jdu rt jdurt|
dddu r\td  j}t|trh|d }td| d | _| | j|
\}}}
|jd } j|
d<  j|
d	<  j|
d
< dtt| jj  v }d|
v}|
dddu r|r|r| !| j j|
d< | jj"sو jdurt#$|dddf  jkdkrtd | jj"rd|
vr| %||
|}
| jj"r| j&| j' j|
|j(d}n|}|jd }|ddu o j)du}|r# j*du r#td j) dt+ n|r3 j*dur3 j*|  _)n|s@ j*dur@t,d j-durZ j- j)krZt,d j- d j) d| j)kry| jj"rgdnd}td| d| d j) d t. ds|dd _/ j0dup j1duo j/du } j2duo j2dko j3du o j4duo j4dko j/du } j5dko҈ j6dko҈ j3du o| o| o؈ j/du } j5dko j6dko j3du o j/du o| o| o j/du } j5dko j6dko j/du o| o| } j5dko/ j6dko/ j3du o/| o/| o5 j/du } j5dkoN j6dkoN j3du oN| oN| oT j/du } j5dkog j6dkog| og| om j/du } j6 j5kryt,d |r j3du rt,d!| j(j7|j(j7krtd"|j(j7 d#| j(j7 d$| j(j7 d%t+ | j8 ||||d&}| j9 |d'}|r݈ j:dkrt,d( j: d)| j;|f|| j j j< j=|d*|
S |r j:dkrt,d( j: d+| j>|f j2 j4|| j j j< j=|d,	|
S |r7| ? }| j@dC| j:| jj"d-|
\}}
| jA|f||| j j j< j=|d.|
S |rf| ? }| j@dC| j:| jj"d-|
\}}
| jB|f||| j j j< j=|d.|
S |r j: j5krtt,d/|j)du r~t,d0tC| j5|j( jD jE j:d1}| j@dC| j5| jj"d-|
\}}
| jF||f|| j j j< j=|d*|
S |r| ? }|j)du rt,d0tC| j:  j5|j( jD jEd2}| j@dC| j5 j: | jj"d-|
\}}
| jG||f||| j j j< j=|d.|
S |rt j: j5krt,d/ j5 j6 dkrt,d3|j)du r&t,d0|d4du o2 jHd5k}|s:t,d6tC| j5|j)|j( jD jE j: j6d7}| j@dC| j5| jj"d-|
\}}
| jI||f|| j j j< j=|d*|
S |rr j: j5krt,d/|j)du rt,d0 j5dkrt,d8 j3rt,d9 j6dur j6dkrt,d:g }  j0dur j0}  j1dur; fd;d<}!t j1trtJ j1dkr|!   j1D ]`}"t|"d trt|"trtJ|"dkr|!  tKd=d> |"D r|!  tKd?d> |"D r|!  tL|"}#n!t|"tr tJ|"dkr#|!  tKd@d> |"D r0|!  tM|"}#| N|# qtO| | j5|j( jD jE j:dA}$| j@dC| j5| jj"d-|
\}}
| jP|f|$|| j j j< j=|dB|
S dS )Da  

        Generates sequences of token ids for models with a language modeling head.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](./generation_strategies).

        </Tip>

        Parameters:
            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complement the default stopping criteria built from arguments and a
                generation config. If a stopping criteria is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://arxiv.org/abs/2010.00904).
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            kwargs:
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.

                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GreedySearchDecoderOnlyOutput`],
                    - [`~generation.SampleDecoderOnlyOutput`],
                    - [`~generation.BeamSearchDecoderOnlyOutput`],
                    - [`~generation.BeamSampleDecoderOnlyOutput`]

                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GreedySearchEncoderDecoderOutput`],
                    - [`~generation.SampleEncoderDecoderOutput`],
                    - [`~generation.BeamSearchEncoderDecoderOutput`],
                    - [`~generation.BeamSampleEncoderDecoderOutput`]
        Na%  You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)attention_maskzThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.r   z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.output_attentionsoutput_hidden_states	use_cacheencoder_outputsr   zA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.)decoder_start_token_idbos_token_idmodel_kwargsdevice
max_lengthzUNeither `max_length` nor `max_new_tokens` has been set, `max_length` will default to z (`generation_config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.a.  Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a limit to the generated output length. Remove one of those arguments. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)z3Unfeasible length constraints: the minimum length (z%) is larger than the maximum length ()decoder_input_ids	input_idszInput length of z is z, but `max_length` is set to zX. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.r#   F   Tz;`num_beam_groups` has to be smaller or equal to `num_beams`zbDiverse beam search cannot be used in sampling mode. Make sure that `do_sample` is set to `False`.z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.)r1   input_ids_seq_lengthencoder_input_idsr4   r2   )r1   r3   z)num_return_sequences has to be 1, but is z when doing greedy search.)r2   r3   pad_token_ideos_token_idoutput_scoresreturn_dict_in_generater5   z when doing contrastive search.)	top_kpenalty_alphar2   r3   rG   rH   rI   rJ   r5   )rC   expand_sizeis_encoder_decoder)r2   logits_warperr3   rG   rH   rI   rJ   r5   zA`num_return_sequences` has to be smaller or equal to `num_beams`.z5`max_length` needs to be a stopping_criteria for now.)
batch_size	num_beamsr?   length_penaltydo_early_stoppingnum_beam_hyps_to_keep)rP   rQ   r?   rR   rS   zK`num_beams` should be divisible by `num_beam_groups` for group beam search.	typical_p      ?z?Decoder argument `typical_p` is not supported with beam groups.)rP   rQ   r@   r?   rR   rS   rT   num_beam_groupszB`num_beams` needs to be greater than 1 for constrained generation.z9`do_sample` needs to be false for constrained generation.z?`num_beam_groups` not supported yet for constrained generation.c                      s   t d j d)Nzn`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]`of positive integers, but is .)
ValueErrorforce_words_idsr   r1   r   r    	typeerror  s
   z.NewGenerationMixin.generate.<locals>.typeerrorc                 s   s    | ]	}t |t V  qd S N)
isinstancelist.0	token_idsr   r   r    	<genexpr>  s    
z.NewGenerationMixin.generate.<locals>.<genexpr>c                 s   s"    | ]}t d d |D V  qdS )c                 s   $    | ]}t |t p|d k V  qdS r   Nr^   intra   token_idr   r   r    rc     
    
z8NewGenerationMixin.generate.<locals>.<genexpr>.<genexpr>N)anyr`   r   r   r    rc     s    
c                 s   rd   re   rf   rh   r   r   r    rc     rj   )constraintsrP   rQ   r?   rR   rS   rT   )constrained_beam_scorerr2   r3   rG   rH   rI   rJ   r5   r   )Qr!   _validate_model_classr1   _from_model_configr"   from_model_configconfigwarningswarncopydeepcopyupdater   r   rG   rH   getr   warningr^   r_   _prepare_model_inputsr=   shaper8   r9   r:   setinspect	signatureforward
parameterskeys&_prepare_attention_mask_for_generationrN   r   sum._prepare_encoder_decoder_kwargs_for_generation)_prepare_decoder_input_ids_for_generationr<   r?   r@   max_new_tokensUserWarningrY   
min_lengthhasattrr#   rl   rZ   rK   	do_samplerL   rQ   rW   type_get_logits_processor_get_stopping_criterianum_return_sequencesgreedy_searchrI   rJ   contrastive_search_get_logits_warper_expand_inputs_for_generationsamplesample_streamr   rR   early_stoppingbeam_searchbeam_samplerU   group_beam_searchlenrk   r   r   appendr	   constrained_beam_search)%r'   r0   r1   r2   r3   r4   r5   r   r(   new_generation_configr>   rH   inputs_tensormodel_input_namerP   accepts_attention_maskrequires_attention_maskrC   rE   has_default_max_lengthinput_ids_stringis_constraint_gen_modeis_contrastive_search_gen_modeis_greedy_gen_modeis_sample_gen_modeis_sample_gen_stream_modeis_beam_gen_modeis_beam_sample_gen_modeis_group_beam_gen_moderO   beam_scorerhas_default_typical_pfinal_constraintsr\   word_ids
constraintrm   r   r[   r    generate*   s  T










	
	

		
		





	


	
	










zNewGenerationMixin.generaterC   rO   r@   rG   rH   r8   r9   rI   rJ   c                 +   s.   |dur|nt  }|dur|nt }|dur"tdt t||}|dur(|nt  }|dur1|n| jj}|dur;|n| jj}t	|t
rG|g}|
durM|
n| jj}
|durW|n| jj}|	dura|	n| jj}	|durk|n| jj}|ru|
rudnd}|r}|r}dnd}|r|rdnd}|r|	rdnd}||jd d}d}	 |rt|rdnd	|j}tj|tjjd
 | dkrdS | j|fi |}| di |d||	d}|r|rq|jdddddf }|||}|||}|r'|
r||f7 }|r|| jjr|jfn|j f7 }| jjr||j!f7 }|	r'|| jjr"|j"fn|j#f7 }t$j%j&|dd}tj'|dd(d |durQ|du rGt)d | |d|     V  tj*| dddf gdd}| j+||| jjd}|dur|,t- fdd|D . }|/ dks|||r|sdS d}q)a  
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.sample`] directly. Use generate() instead.
        For an overview of generation strategies and code examples, check the [following
        guide](./generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            logits_warper (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`int`, *optional*):
                The id of the *end-of-sequence* token.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.SampleDecoderOnlyOutput`], [`~generation.SampleEncoderDecoderOutput`] or `torch.LongTensor`:
            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.SampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.SampleEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForCausalLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     TopKLogitsWarper,
        ...     TemperatureLogitsWarper,
        ...     StoppingCriteriaList,
        ...     MaxLengthCriteria,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")

        >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
        >>> model.config.pad_token_id = model.config.eos_token_id
        >>> model.generation_config.pad_token_id = model.config.eos_token_id

        >>> input_prompt = "Today is a beautiful day, and"
        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
        ...     ]
        ... )
        >>> # instantiate logits processors
        >>> logits_warper = LogitsProcessorList(
        ...     [
        ...         TopKLogitsWarper(50),
        ...         TemperatureLogitsWarper(0.7),
        ...     ]
        ... )

        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])

        >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
        >>> outputs = model.sample(
        ...     input_ids,
        ...     logits_processor=logits_processor,
        ...     logits_warper=logits_warper,
        ...     stopping_criteria=stopping_criteria,
        ... )

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Today is a beautiful day, and a wonderful day.\n\nI was lucky enough to meet the']
        ```Nz`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.r   r   rD   FTg        rV   )op)return_dictr8   r9   r   )dim)num_sampleszGIf `eos_token_id` is defined, make sure that `pad_token_id` is defined.)rN   c                 3   s    | ]} |kV  qd S r]   r   )ra   inext_tokensr   r    rc     s    z3NewGenerationMixin.sample_stream.<locals>.<genexpr>)0r   r   rr   rs   r   validate_stopping_criteriar1   rG   rH   r^   rg   rI   r8   r9   rJ   newrz   fill_r   tensortor?   dist
all_reduceReduceOpSUMitemprepare_inputs_for_generationlogitsrq   rN   decoder_attentions
attentionscross_attentionsdecoder_hidden_stateshidden_statesr   
functionalsoftmaxmultinomialsqueezerY   cat#_update_model_kwargs_for_generationmulr   longmax)r'   rC   r2   r3   rO   r@   rG   rH   r8   r9   rI   rJ   r5   r>   scoresr   r   r   unfinished_sequencesthis_peer_finishedthis_peer_finished_flagmodel_inputsoutputsnext_token_logitsnext_token_scoresprobsr   r   r    r     s   ~










z NewGenerationMixin.sample_stream)NNNNNFr   )NNNNNNNNNNF)r+   r,   r-   r   no_gradr   Tensorr"   r   r   r   rg   r   boolr   r   
LongTensorr   r   r   r   r   r   r    r/   )   s    	     ,	
r/   c                   C   s   t jt_t jt_d S r]   )r/   r   r
   r   r   r   r   r    init_stream_support  s   r   __main__)r
   )AutoTokenizerAutoModelForCausalLMzbigscience/bloom-560m)torch_dtypezcuda:0zhello? 
ptF)return_tensorsadd_special_tokens   T   g333333?gffffff?g333333?)r   r   rK   top_ptemperaturerepetition_penaltyr   r   )skip_special_tokens)	r   r   rK   r   r   r   r   r   r#    )7transformersr   r   r   r   r   r   r   r	   r
   numpyr   r   rr   r|   transformers.generation.utilsr   r   r   r   typingr   r   r   r   r   torch.distributeddistributedr   rt   r!   r"   r/   r   r+   r   r   r   r   from_pretrainedfloat16model	tokenizerr   evalprompt_textrC   r   resultprintdecode	generatorstream_resultxchunkr   r   r   r    <module>   s   ,        J





"