o
    5tip                     @   s@  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	  m
Z d dlZd dlmZ d dlmZ d dlmZ d dlZd dlZd dlmZ d dlmZ d d	lmZ d d
lmZ zdZd dlmZ d dlmZ d dlm Z! W n e"y|   e#ZdZY nw erd dl$m%Z% e&e'Z(G dd deZ)edG dd deZ*dS )    N)defaultdict)TYPE_CHECKINGOptional)version)tqdm)GenerationConfig)utils)
TemplateLM)register_model)stop_sequences_criteriaT)NeuronModelForCausalLM)TokenSelector)__version__F)StoppingCriteriaListc                   @   sH   e Zd ZdZ			ddejdejdB ded ded d	ejf
d
dZdS )CustomNeuronModelForCausalLMz=NeuronModelForCausalLM with `stopping_criteria` in `generate`N	input_idsattention_maskstopping_criteriar   generation_configr   returnc                 K   sF  t |du r	| jn|}|jdi |}| | t||| | j}|j	| |j
\}}	|	| jkr>td|	 d| j d|}
|}|| jkrStd| d| j d|| jk r| jstd | j| |	g}tj|| jjtjd}t||g}
|durtj|tjd	}t||g}| j|
||fd
|i|}|d|ddf S )a  
        A streamlined generate method overriding the transformers.GenerationMixin.generate() method.

        This method uses the same logits processors/warpers and stopping criteria as the transformers library
        `generate()` method but restricts the generation to greedy search and sampling.

        It does not support transformers `generate()` advanced options.

        Please refer to https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate
        for details on generation configuration.

        Parameters:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices.
            generation_config (`~transformers.generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~transformers.generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.

        Returns:
            `torch.Tensor`: A  `torch.FloatTensor`.
        NzThe input sequence length (z,) exceeds the model static sequence length ()zThe specified batch_size (z') exceeds the model static batch size (zWInputs will be padded to match the model static batch size. This will increase latency.)
fill_valuedtype)r   r    )copydeepcopyr   update_validate_model_kwargsr   create
max_lengthr   appendshape
ValueError
batch_sizecontinuous_batchingloggerwarningtorchfullconfigeos_token_idint64catzerosgenerate_tokens)selfr   r   r   r   kwargsmodel_kwargsselectorr#   sequence_lengthpadded_input_idspadded_attention_maskpadding_shapepadding
output_idsr   r   Q/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/models/neuron_optimum.pygenerate(   sX   $




z%CustomNeuronModelForCausalLM.generate)NNN)	__name__
__module____qualname____doc__r'   Tensorr   
LongTensorr:   r   r   r   r9   r   %   s     r   neuronxc                       s  e Zd ZdZ													dAd	edB d
edB dedB dedB dedB dedB dedB deejB dB dedB dedB dedB dedB dedB ddf fddZ	e
dd Ze
dd Ze
dd Ze
dd  Ze
defd!d"Ze
d#d$ Ze
d%d& Ze
d'd( Ze
d)d* ZdBd+efd,d-Z	.		dCd/ee d0ed1edefd2d3Zd4d5 Zd6d7 ZdBd8d9ZdDd:efd;d<Z	dEd:efd=d>ZdDd:efd?d@Z  ZS )F	NEURON_HFz
    Enables usage with on AWS Neuron
    using the HuggingFace Transformers + Transformers neuronx library.
    Tested with neuron 2.17.0
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0mainNFauto   T
pretrainedrevision	tp_degree	subfolder	tokenizer
truncationr   r   r#   low_cpu_mem_usagetrust_remote_codeuse_fast_tokenizeradd_bos_tokenr   c                    s
  t stdddtttdkrtdt  t   t	|t
s&J t	|	tt
fs/J t|	| _t|	}	tjj|||d| _t
|}||d urOd| nd }tjj|d u r[|n||||d	| _t| jd
d }|d u r|d urt	|tsJ d| dt| dtjj|}|tjkrd| _n|tjkrd| _n|tjkrd| _ntdtd d t j||||
d|	|| j|d	| _!| j!j"j#}td| dd  ntd d| d t j||||
d| _!tdd  || _$| jj%| _%| jj&| j_'|| _(d| _)i | _*d S )Nz8Tried to load neuron model, but neuron is not installed z:please install neuron via pip install transformers-neuron z6also make sure you are running on an AWS inf2 instancez0.0.24z`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2 You are using optimum-neuron=)rH   rN   / )rH   rN   use_fastneuronz7tp_degree must be set to an integer, but is tp_degree=`z` with type=`z`.Set it to a number lower than the number of neuron cores on your instance. For inf2.xlarge and inf2.8xlarge, set it to `2`. For inf2.24xlarge, set it <= `12`. For inf2.48xlarge, set it <= `24`.f16bf16f32z,Only float16/bfloat16/float32 are supported.z====================z 
 exporting model to neuronT)rH   rN   rM   exportr#   	num_coresauto_cast_typer3   z+SUCCESS: neuron model exported with config z. 
 z$ 
 loading neuron model with config z...)rH   rN   rM   z SUCCESS: neuron model loaded. 
 rF   )+NEURON_AVAILABLEImportErrorr   parseoptimum_neuron_versionr%   r&   super__init__
isinstancestrintbatch_size_per_gputransformers
AutoConfigfrom_pretrained_configAutoTokenizerrK   getattrtypelm_evalmodelsutils_hf	get_dtyper'   float16	amp_dtypebfloat16float32NotImplementedErrorprintr   modelr)   rT   rL   
vocab_sizer*   pad_token_idrP   batch_schedulebatch_sizes)r/   rG   rH   rI   rJ   rK   rL   r   r   r#   rM   rN   rO   rP   neuron_configtorch_dtype	__class__r   r9   r`      s   







zNEURON_HF.__init__c                 C      | j S N)rh   r/   r   r   r9   r)      s   zNEURON_HF.configc                 C      | j jS r   )rK   r*   r   r   r   r9   eot_token_id   s   zNEURON_HF.eot_token_idc                 C   s   | j jp| j jS r   )rK   bos_token_idr*   r   r   r   r9   prefix_token_id  s   zNEURON_HF.prefix_token_idc                 C   r   r   )rv   r   r   r   r   r9   r     s   zNEURON_HF.max_lengthc                 C      dS )N   r   r   r   r   r9   max_gen_toks
     zNEURON_HF.max_gen_toksc                 C   r   r   )rd   r   r   r   r9   r#     s   zNEURON_HF.batch_sizec                 C   r   )z<Device are neuron cores, but the created tensors are on CPU.cpur   r   r   r   r9   device  s   zNEURON_HF.devicec                 C   r   Nr   r   r   r   r   r9   rank  r   zNEURON_HF.rankc                 C   r   NrF   r   r   r   r   r9   
world_size  r   zNEURON_HF.world_sizestringc                 C   s4   |du r| j }| jj||d}|r|| d }|S )zEncode strings to tokensNF)add_special_tokens)rP   rK   encode)r/   r   left_truncate_lenr   encodingr   r   r9   
tok_encode  s   zNEURON_HF.tok_encodeleftstringspadding_sider   c                 C   s   | j j}|| j _| j}| j ||dd|d}|r5|d d d | d f |d< |d d d | d f |d< || j _|d |d fS )NFlongestpt)rL   r7   return_tensorsr   r   r   )rK   r   rP   )r/   r   r   r   rL   old_padding_sider   r   r   r   r9   tok_batch_encode,  s"   zNEURON_HF.tok_batch_encodec                 C   s   | j |S r   )rK   decode)r/   tokensr   r   r9   
tok_decodeI  s   zNEURON_HF.tok_decodec              	   K   s   t  7 d| vrd|d< t| j|| j| jjgg d|jd }| j	j
d|||| jdd|W  d    S 1 s>w   Y  d S )N	do_sampleFrF   r   T)r   r   r   rx   	use_cacher   )r'   inference_modekeysr   rK   r   r)   r*   r!   rv   r:   r   )r/   contextr   stopgeneration_kwargsr   r   r   r9   _model_generateL  s&   
$zNEURON_HF._model_generatec                 C   s$   |r|sJ d||| | }|S )NzGMust pass input len and cont. len to select scored logits for causal LMr   )r/   logitscontleninplenr   r   r9   _select_cont_toksd  s
   
zNEURON_HF._select_cont_toksdisable_tqdmc              
   C   sB  g }d }t dd |D |p| jdkdD ]\}tttjtj| || j| j	dd}dd |D }d}| j
dkrgtjt|| jd}| j|    }	t|	|	| j  }|dkrg|||d g 7 }| j|d	|d
}
| j
dkr|dkrdd |
d |  D }
ndd |
D }
t|
}
||
 | jd|f|
 q|S )Nc                 S      g | ]}|j qS r   args.0reqr   r   r9   
<listcomp>t      z3NEURON_HF.loglikelihood_rolling.<locals>.<listcomp>r   disablerF   )
token_listprefix_tokenmax_seq_lencontext_lenc                 S   s   g | ]}d | qS )r   r   r   xr   r   r9   r         )r   T)r   override_bsc                 S      g | ]}|d  qS r   r   r   r   r   r9   r     r   c                 S   r   r   r   r   r   r   r9   r     r   loglikelihood_rolling)r   r   listmapr   make_disjoint_windowget_rolling_token_windowsr   r   r   r   r'   tensorlenr   acceleratorgatherr   detachnumpytolistmax_loglikelihood_tokenssumr    
cache_hookadd_partial)r/   requestsr   loglikelihoodsadaptive_batch_sizer   rolling_token_windowspad_amntmytensorgathered
string_nllr   r   r9   r   n  sJ   

zNEURON_HF.loglikelihood_rollingc           %   
   C   sn  g }dd }t ||}t| }tjj j| | jd d}t||p'| j	dkdD ]}	g }
g }g }g }g }d }d }|	D ]T\}}}t|dksJJ t|dksRJ t|| j
ks[J tj|| | j
d  d  d d tj| jd}|j\}|d urt||n|}|
| || || q=t|
| jk r|
t|
d g| jt|
   }
d	d
 |
D }tjjj||
dd}tjjj||dd}| jjjjr| j||}tj| jjdi |jdd}n_| j|d d d df |d d d df }| jjdi |jg}td|D ]+}| j|d d d |d f |d d d |d f }|| jjdi |j q	tjtj|dddd}t |	|||ddD ]f\\}}}}}}t|} ||jd |  }!| j!|| |!d}|"d}|j#dd}"tj|tj| jd"d}|"|k$ }#t%|d|"d&d}t'|( t)|#f}$||$ |d ur| j*+d||$ qJq*|,|S )Nc                 S   s"   | d | d  }t | t|fS )NrF      )r   tupler   toksr   r   r9   _collate  s   z1NEURON_HF._loglikelihood_tokens.<locals>._collate)nfnr   r   rF   )r   r   c                 S   s   g | ]}t |qS r   )r'   	ones_like)r   inpr   r   r9   r     s    z3NEURON_HF._loglikelihood_tokens.<locals>.<listcomp>right)r   )dimFstrict)r   r   r   loglikelihoodr   )-r   	Reordererr   get_reorderedrl   rm   chunksr#   r   r   r   r'   r   longr   r!   r   r    
zeros_likern   pad_and_concatrv   r{   output_all_logitsprepare_inputs_for_prefillFlog_softmaxforwardr   rangeprepare_inputs_for_decodeconcatzipr   	unsqueezeargmaxallr   squeezefloatr   boolr   r   get_original)%r/   r   r   r   resr   re_ordn_reordered_requestsr   chunkinpscont_toks_listinplenscontsencoder_attnspadding_len_inppadding_len_cont_context_enccontinuation_encr   r   masksbatched_inpsbatched_masksinputsmulti_logitsoutputsi	cache_keyr   	cont_toksr   ctx_lengreedy_tokens	max_equalanswerr   r   r9   r     s   



$,





%zNEURON_HF._loglikelihood_tokensc                    s  t t}i } fdd}tjj|dd }|  D ]\}}tdd |D |||< qt	t
||p8 jdkd}	| D ]	\}}
tjjj|
  jd	}t	| jdkd
D ]}t|ddi\}}|d }d }t|trt|}d| v r|d}t|tr|g}nt|tstd| ntd|   j}|s|g}n|| d| v r|d}n j}|d g} j| } j|| jd\}}| j }| j }d|vr|j!d | |d<  j"d|||d|}|# }t||ddD ]=\}}||j!d d  } |}|D ]}t
|dkr%|$|d }q|| |  j%&d||f| |	'd qqY|
(|| ||< q?|	)  |(|S )Nc                    s      | d }t| | d fS r   )r   r   r   r   r   r9   r   <  s   z*NEURON_HF.generate_until.<locals>._collatec                 S   s   t | jd S r   )rb   r   )r   r   r   r9   <lambda>I  s    z*NEURON_HF.generate_until.<locals>.<lambda>c                 S   r   r   r   r   r   r   r9   r   L  r   z,NEURON_HF.generate_until.<locals>.<listcomp>r   )totalr   )r   r   r   FuntilzAExpected `kwargs['until']` to be of type Union[str,list] but got z/Expected `kwargs` to be of type `dict` but got r   )r   rL   r   rF   )r   r   r   r   generate_untilr   )*r   r   rl   rm   r   Grouperget_groupeditemsr   r   r   r   r   r   r#   r   ra   dictr   r   r   poprb   r"   r   r   r    r   r   r   rL   tor   r!   r   r   splitr   r   r   r   close)r/   r   r   r   re_ordsr   grouperkeyreqspbarr  r   r  contextsall_gen_kwargs
gen_kwargsr  r0   eosr   primary_untilmax_ctx_lenr  
attn_maskscontr  r  r   stermr   r   r9   r  8  s   












zNEURON_HF.generate_until)rC   rD   NNNFNrE   rF   TFTF)NN)r   NF)F)FN)r;   r<   r=   r>   rb   rc   r   r'   r   r`   propertyr)   r   r   r   r   r#   r   r   r   r   r   r   r   r   r   r   r   r  __classcell__r   r   r}   r9   rB   }   s    	
r










6
 rB   )+r   loggingcollectionsr   typingr   r   r'   torch.nn.functionalnn
functionalr   re   	packagingr   r   r   lm_eval.models.utilsrl   lm_eval.models.utils_hfr   lm_eval.api.modelr	   lm_eval.api.registryr
   r   r[   optimum.neuronr   optimum.neuron.generationr   optimum.neuron.versionr   r^   r\   objecttransformers.generationr   	getLoggerr;   r%   r   rB   r   r   r   r9   <module>   s>    
X