o
    %ݫiW>                     @   sx   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ ddlmZ eeZG dd	 d	eZdS )
zThis lobe enables the integration of huggingface pretrained LLAMA2-chat model.

Transformer from HuggingFace needs to be installed:
https://huggingface.co/transformers/installation.html

Authors
 * Pooneh Mousavi 2023
 * Ha Nguyen 2023
    N)
Linear4bit)
LoraConfigget_peft_modelprepare_model_for_kbit_training)BitsAndBytesConfig)HFTransformersInterface)
get_loggerc                       s   e Zd ZdZ											
		d-dededededededededededededededdf fddZde	j
de	j
fdd Zd!gfd"d#Zd$d% Z	&d.de	j
de	j
fd'd(Zd)d* Zd+d, Z  ZS )/LLAMA2u  This lobe enables the integration of HuggingFace pretrained LLAMA2 model.
     Source paper LLAMA2:
       https://arxiv.org/abs/2307.09288
    Transformer from HuggingFace needs to be installed:
        https://huggingface.co/transformers/installation.html

    The model can be finetuned. It will download automatically the model from
    HuggingFace or use a local path.

    Notes:
    - To use this model, you need to install the extra dependencies in recipes/MultiWOZ/response_generation/llama2/extra_requirements.txt
    - transformers and peft libraries should follow the versions mentioned in the extra_requirements.
    - Llama 2 is licensed under the LLAMA 2 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.

    Arguments
    ---------
    source : str
        HuggingFace hub name: e.g "meta-llama/Llama-2-7b-chat-hf"
    save_path : str
        Path (dir) of the downloaded model.
    freeze : bool (default: False)
        If True, the model is frozen. If False, the model will be trained
        alongside with the rest of the pipeline.
    max_new_tokens: int (default: 200)
    use_4bit: bool (default: True)
    bnb_4bit_compute_dtype: str (default: "float16")
        This sets the computational type which might be different than the input time. For example, inputs might be fp32, but computation can be set to bf16 for speedups.
    bnb_4bit_quant_type: str (default:"nf4")
        This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types which are specified by fp4 or nf4.
    use_nested_quant: bool (default: False)
        You have set this to False, which means you're not using nested quantization. This seems reasonable, as nested quantization can be computationally expensive.
    min_length: int (default: 1)
        The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + min_new_tokens. Its effect is overridden by min_new_tokens, if also set
    top_k: int (default: 45)
        The number of highest probability vocabulary tokens to keep for top-k-filtering.
    top_p: float (default: 0.9)
        If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    num_beams: int (default: 8)
         Number of beams for beam search. 1 means no beam search.
    early_stopping: bool (default: True)
        Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values:
        - True, where the generation stops as soon as there are num_beams complete candidates
        - False, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates
        - "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
    with_peft: bool (default:False)
        If set to True, the peft model (model + adaptors) are loaded. If set to False, the original model is loaded.

    Example
    -------
    >>> model_hub = "meta-llama/Llama-2-7b-chat-hf"
    >>> save_path = "savedir"
    >>> model = LLAMA2(model_hub, save_path)
    >>> tokens = torch.tensor([[1, 1]])
    >>> attention_mask = torch.tensor([[1, 1]])
    >>> outputs = model(tokens, attention_mask)
    F   Tfloat16nf4   -   ?   source	save_pathfreezemax_new_tokensuse_4bitbnb_4bit_compute_dtypebnb_4bit_quant_typeuse_nested_quant
min_lengthtop_ktop_p	num_beamsearly_stopping	with_peftreturnNc                    s"  || _ || _|	| _|
| _|| _|| _|| _|| _|| _d| _	t
t|}d | _|rRt||||d| _|tjkrR|rRtj \}}|dkrRtd td td t j|||d| jd | j|d dd d	| j_d
| j_|r| j	st| j| _tdddddd}t| j|| _| | j d S )NF)load_in_4bitr   r   bnb_4bit_use_double_quantr   zP================================================================================z>Your GPU supports bfloat16: accelerate training with bf16=TrueT)r   r   r   with_casual_lmquantization_config)r   	pad_tokenuse_fastz<PAD>right   皙?@   none	CAUSAL_LM
lora_alphalora_dropoutrbias	task_type)r   r   r   r   r   r   r   r   r   is_sbgetattrtorch
bnb_configr   r   cudaget_device_capabilityloggerinfosuper__init__load_tokenizer	tokenizerr$   padding_sider   modelr   r   print_trainable_parameters)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   compute_dtypemajor_config	__class__ l/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/huggingface_transformers/llama2.pyr;   S   s`   



zLLAMA2.__init__	input_idsattention_maskc                 C   sD   t | j  | jj||d}W d   |S 1 sw   Y  |S )a  Takes an input a history of conversation and returns its corresponding reply.

        Arguments
        ---------
        input_ids : torch.Tensor
            A batch of input-id to transform to features.
        attention_mask : torch.Tensor
            A batch of attention_mask.

        Returns
        -------
        output : torch.Tensor
            Reply to conversation.
        )rK   N)r4   set_grad_enabledr   r?   forward)rA   rJ   rK   outputrH   rH   rI   rM      s   
zLLAMA2.forward
base_modelc              
   C   s2  d| _ tj|dd}tdd |D d}||d| jjj }|dkr,| jj	dd	 | j
rdd
lm} || jdg| jd| _ddlm} | j }| D ]}	tj||	 jtjdd||	< qO|| j|| d| dditjdd\}
}}ddlm} d| j_d| j_|j| j_d| j_d| j_d| j_i }d|d< d|d< d|d< d|d< d|d< d|d< d|d< d|d< d|d< d|d < || jj_dd!lm } ddiddd"d#}|| jfi | t!| j| _t"d$d%d&d'd(d)}t#| j|| _i }|$ D ]\}	}|D ]}| |	v r|	%d*| | }|||< qq|S )+aF  A custom loading ensures SpeechBrain compatibility for Pretrain and model
        de/serialization. Here, the scope is to remove '.wav2vec2' before loading.

        Arguments
        ---------
        path : str
            Checkpoint path, file name relative to the repo root.
        replaceables : List[str]
            State dict sub-keys that if found, shall be dropped (incl. the 'model.' parent key), elevating key structures.

        Returns
        -------
        modified_state_dict : see torch.load
            SpeechBrain-valid deserialized pretrained model.
        Tcpu)map_locationc                 s   s    | ]	}d |v r|V  qdS )zembed_tokens.weightNrH   ).0keyrH   rH   rI   	<genexpr>   s    z,LLAMA2._modify_state_dict.<locals>.<genexpr>Nr   i}  )new_num_tokens)replace_with_bnb_linearlm_head)modules_to_not_convertr#   ) _load_state_dict_into_meta_model)dtypedevice )r?   
state_dictloaded_state_dict_keysstart_prefixexpected_keys
device_maprZ   is_quantized)QuantizationMethodFr   r   r   r   r!    llm_int8_enable_fp32_cpu_offloadllm_int8_has_fp16_weightllm_int8_skip_modulesg      @llm_int8_thresholdr    load_in_8bitbitsandbytesquant_method)dispatch_modelpast_key_values)ra   offload_diroffload_index	skip_keysr'   r(   r)   r*   r+   r,   zmodel.)&r2   r4   loadnextgetsizer?   rE   
vocab_sizeresize_token_embeddingsr   transformers.integrationsrV   r5   transformers.modeling_utilsrY   r]   keysrandshaper   &transformers.utils.quantization_configrc   _is_quantized_training_enabledis_8bit_serializableBITS_AND_BYTESquantization_methodrb   is_loaded_in_4bitis_loaded_in_8bitr#   
acceleraterk   r   r   r   itemsreplace)rA   pathreplaceablesorig_state_dictdesired_keyrU   rV   rY   r]   rS   new_error_msgsrn   state_dict_indexrc   r#   rk   device_map_kwargslora_configmodified_state_dictparamstagsave_keyrH   rH   rI   _modify_state_dict   s   


zLLAMA2._modify_state_dictc              	   C   sR   |  D ]"\}}t|tjr!|dkr!t||t|j|j|jd q| 	| qdS )zModify the loaded module linear layers with Linear4bit to be compatible

        Arguments
        ---------
        module : nn.module
            llama2 model.
        rW   )r0   N)
named_children
isinstancennLinearsetattrr   in_featuresout_featuresr0   replace_linear)rA   modulenamechildrH   rH   rI   r   6  s   zLLAMA2.replace_lineargreedyc                 C   s   t  9 |dkr#| jj||d| j| j| j| jd| jddd| j	d}n| jj|| j|d}W d   |S W d   |S 1 s@w   Y  |S )a  Takes an input a history of conversation and returns its corresponding reply.

        Arguments
        ---------
        input_ids : torch.Tensor
            A batch of input-id which are dialogue context tokens
        attention_mask : torch.Tensor
            A batch of attention_mask.
        decoder_type : str
            It shows strategy for autoregressive decoding either beam search or greedy.

        Returns
        -------
        hyp : torch.Tensor
            Reply to conversation input.
        beamTg      ?r   )rJ   rK   	do_sampler   r   r   r   temperaturer   num_return_sequencesrepetition_penaltylength_penaltyr   )rJ   r   rK   N)
r4   no_gradr?   generater   r   r   r   r   r   )rA   rJ   rK   decoder_typehyprH   rH   rI   r   K  s:   


zLLAMA2.generatec                 C   s    | j r|j| j| j| j d}|S )a  override config to include quantization config.

        Arguments
        ---------
        config : HuggingFace config object
            The original config.

        Returns
        -------
        config : HuggingFace config object
            Overridden config.
        )	cache_dirr#   )r5   from_pretrainedr   r   )rA   rE   rH   rH   rI   override_config}  s   zLLAMA2.override_configc              	   C   s`   d}d}|  D ]\}}|| 7 }|jr|| 7 }qtd| d| dd| |   dS )zI
        Prints the number of trainable parameters in the model.
        r   ztrainable params: z || all params: z || trainable%: d   N)named_parametersnumelrequires_gradr8   r9   )rA   r?   trainable_params	all_paramrD   paramrH   rH   rI   r@     s   z!LLAMA2.print_trainable_parameters)Fr
   Tr   r   Fr   r   r   r   TF)r   )__name__
__module____qualname____doc__strboolintfloatr;   r4   TensorrM   r   r   r   r   r@   __classcell__rH   rH   rF   rI   r	      sr    =	
M 
2r	   )r   r4   torch.nnr   bitsandbytes.nnr   peftr   r   r   transformersr   =speechbrain.lobes.models.huggingface_transformers.huggingfacer   speechbrain.utils.loggerr   r   r8   r	   rH   rH   rH   rI   <module>   s    
