o
    3wi;O                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dl mZ d dlmZm	Z	 d dl
mZ d dlZd dlZd dlmZ d dlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ee*Z+e" rd dl,m-  m.Z/ dej0j1de2fddZ3dej0j1de2fddZ4dej0j1de2fddZ5dej0j1de2fddZ6dej0j1dej0j1fddZ7dej0j1fdd Z8d!ej0j1de2fd"d#Z9	%dKd&e2d'e2d(e2fd)d*Z:d+d, Z;d-e<fd.d/Z=dLd0e2d1e2fd2d3Z>e d4rej?nej-Z@e@jAjBejCeejDgZEe d5reEFejGjH dMd6d7ZId8d9 ZJd:d; ZKdMd<eLde2fd=d>ZMdeLfd?d@ZNdAdB ZOdCdD ZPdEeQfdFdGZRdNd!ej0j1dHe2deSej0j1 fdIdJZTdS )O    N)encode)OrderedDict)partialreduce)
MethodType)Version)	save_file   )write_basic_config)
get_logger)PartialState   )FSDP_PYTORCH_VERSION)DistributedType)is_deepspeed_availableis_numpy_availableis_torch_distributed_availableis_torch_xla_availableis_weights_only_available)id_tensor_storage)convert_model)is_torch_versionmodulereturnc                 C   s   t tdsdS t| tjjjS )zD
    Check whether the module was compiled with torch.compile()
    _dynamoF)hasattrtorch
isinstancer   
eval_frameOptimizedModuler    r!   S/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/accelerate/utils/other.pyis_compiled_module5   s   
r#   c                 C   s<   t tdsdS | jr|  D ]}t|tjjjr dS qdS )z\
    Check whether the module has submodules that were compiled with `torch.compile()`.
    r   FT)r   r   _modulesmodulesr   r   r   r   r   	submoduler!   r!   r"   has_compiled_regions?   s   
r(   c                    s$   t  tjjot fdd D S )z
    Check whether the module is a repeated block, i.e. `torch.nn.ModuleList` with all children of the same class. This
    is useful to determine whether we should apply regional compilation to the module.
    c                 3   s     | ]}t | d  jV  qdS )r   N)r   	__class__).0mr    r!   r"   	<genexpr>T   s    z%is_repeated_blocks.<locals>.<genexpr>)r   r   nn
ModuleListallr    r!   r    r"   is_repeated_blocksN   s   $r0   c                 C   s&   | j r|  D ]	}t|r dS qdS )z
    Check whether the module has repeated blocks, i.e. `torch.nn.ModuleList` with all children of the same class, at
    any level of the module hierarchy. This is useful to determine whether we should apply regional compilation to the
    module.
    TF)r$   r%   r0   r&   r!   r!   r"   has_repeated_blocksW   s   r1   c                    sF   dt jjdt jjf fdd  | fi |}d|jvr!| |jd< |S )a_  
    Performs regional compilation where we target repeated blocks of the same class and compile them sequentially to
    hit the compiler's cache. For example, in `GPT2LMHeadModel`, the repeated block/class is `GPT2Block`, and can be
    accessed as `model.transformer.h[0]`. The rest of the model (e.g. model.lm_head) is compiled separately.

    This allows us to speed up the compilation overhead / cold start of models like LLMs and Transformers in general.
    See https://pytorch.org/tutorials/recipes/regional_compilation.html for more details.

    Args:
        module (`torch.nn.Module`):
            The model to compile.
        **compile_kwargs:
            Additional keyword arguments to pass to `torch.compile()`.

    Returns:
        `torch.nn.Module`: A new instance of the model with some compiled regions.

    Example:
    ```python
    >>> from accelerate.utils import compile_regions
    >>> from transformers import AutoModelForCausalLM

    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
    >>> compiled_model = compile_regions(model, mode="reduce-overhead")
    >>> compiled_model.transformer.h[0]
    OptimizedModule(
        (_orig_mod): GPT2Block(
                (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (attn): GPT2Attention(
                (c_attn): Conv1D(nf=2304, nx=768)
                (c_proj): Conv1D(nf=768, nx=768)
                (attn_dropout): Dropout(p=0.1, inplace=False)
                (resid_dropout): Dropout(p=0.1, inplace=False)
            )
            (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): GPT2MLP(
                (c_fc): Conv1D(nf=3072, nx=768)
                (c_proj): Conv1D(nf=768, nx=3072)
                (act): NewGELUActivation()
                (dropout): Dropout(p=0.1, inplace=False)
            )
        )
    )
    ```
    r   r   c                    s   t | rtj }| D ]}|tj|fi | q|S t| rH| j| j}|j	
| j	 i |_|  D ]\}}|| |fi | q5|S tj| fi |}|S N)r0   r   r-   r.   appendcompiler1   r)   __new____dict__updater$   named_children
add_module)r   compile_kwargs
new_moduler'   name_compile_regionsr!   r"   r>      s   

z)compile_regions.<locals>._compile_regions	_orig_mod)r   r-   Moduler6   )r   r:   r;   r!   r=   r"   compile_regionse   s
   /

rA   c                 K   sd   t | r| D ]
}|jdi | qdS t| r(|  D ]
}t|fi | qdS | jdi | dS )a  
    Performs regional compilation the same way as `compile_regions`, but specifically for `DeepSpeedEngine.module`.
    Since the model is wrapped in a `DeepSpeedEngine` and has many added hooks, offloaded parameters, etc that
    `torch.compile(...)` interferes with, version of trgional compilation uses the inplace `module.compile()` method
    instead.

    Args:
        module (`torch.nn.Module`):
            The model to compile.
        **compile_kwargs:
            Additional keyword arguments to pass to `module.compile()`.
    Nr!   )r0   r4   r1   childrencompile_regions_deepspeed)r   r:   r'   childr!   r!   r"   rC      s   rC   modelc                    s>   t ddrddlm  nddlm  t fdd|  D S )z
    Check if the model has DTensor parameters.

    Args:
        model (`torch.nn.Module`):
            The model to check.

    Returns:
        `bool`: Whether the model has DTensor parameters.
    >=z2.5.0r   DTensorc                 3   s    | ]}t | V  qd S r2   )r   )r*   prG   r!   r"   r,      s    z$model_has_dtensor.<locals>.<genexpr>)r   torch.distributed.tensorrH   torch.distributed._tensorany
parameters)rE   r!   rG   r"   model_has_dtensor   s   
rN   TFkeep_fp32_wrapperkeep_torch_compile	recursivec                    sL  t jjjt jjf}t| }t| }|r| }| j} n	|r"| }| jd } t	 r0ddl
m} ||f7 }tdtrCt rCddlm}	 ||	f7 }t| |rP| j} t| |sH|r\ fdd  | } |s| j}
| jdd	}|d	urt|
d
r~|
j}
|
|krynt|
d
sqt|
| | _t| ddrt| dd |r|r| |_|} | S |r| |jd< |} | S )a  
    Extract a model from its distributed containers.

    Args:
        model (`torch.nn.Module`):
            The model to extract.
        keep_fp32_wrapper (`bool`, *optional*):
            Whether to remove mixed precision hooks from the model.
        keep_torch_compile (`bool`, *optional*):
            Whether to unwrap compiled model.
        recursive (`bool`, *optional*, defaults to `False`):
            Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
            recursively, not just the top-level distributed containers.

    Returns:
        `torch.nn.Module`: The extracted model.
    r?   r   )DeepSpeedEnginerF   )FullyShardedDataParallelc                    s@   t | dr | j}n| }| D ]\}}t|| | q|S )Nr   )r   r   r8   setattr)r   unwrapped_moduler<   rD   _recursive_unwrapr!   r"   rW     s   
z6extract_model_from_parallel.<locals>._recursive_unwrap_original_forwardN__wrapped__ _converted_to_transformer_engineF)to_transformer_engine)r   r-   parallelDistributedDataParallelDataParallelr#   r(   r?   r6   r   	deepspeedrR   r   r   r   2torch.distributed.fsdp.fully_sharded_data_parallelrS   r   r   forwardpopr   rY   r   getattrr   )rE   rO   rP   rQ   optionsis_compiledhas_compiledcompiled_modelrR   FSDPra   original_forwardr!   rV   r"   extract_model_from_parallel   sT   







rj   c                   C   s   t    dS )a  
    Introduces a blocking point in the script, making sure all processes have reached this point before continuing.

    <Tip warning={true}>

    Make sure all processes will reach this instruction otherwise one of your processes will hang forever.

    </Tip>
    N)r   wait_for_everyoner!   r!   r!   r"   rk   .  s   
rk   
state_dictc                    s   t t}  D ]\}}t|ts|t| | q	dd | D }t }|	 D ] } fdd|D }|
|dd  |dd D ]} |= qFq,t|dkr\td| d	 d
d   D   S )z
    Cleans the state dictionary from a model and removes tensor aliasing if present.

    Args:
        state_dict (`dict`):
            The state dictionary from a model
    c                 S   s"   i | ]\}}t |d kr||qS )r   )len)r*   ptrnamesr!   r!   r"   
<dictcomp>J  s   " z4clean_state_dict_for_safetensors.<locals>.<dictcomp>c                    s   g | ]}| v r|qS r!   r!   )r*   r<   rl   r!   r"   
<listcomp>R  s    z4clean_state_dict_for_safetensors.<locals>.<listcomp>r   Nr   zRemoved shared tensor zk while saving. This should be OK, but check by verifying that you don't receive any warning while reloadingc                 S   s*   i | ]\}}|t |tjr| n|qS r!   )r   r   Tensor
contiguous)r*   kvr!   r!   r"   rp   Z  s   * )collectionsdefaultdictlistitemsr   strr   r3   setvaluesr7   rm   loggerwarning)rl   ptrsr<   tensorshared_ptrs
warn_namesro   found_namesr!   rq   r"    clean_state_dict_for_safetensors;  s&   


r   save_on_each_nodesafe_serializationc                 C   s   t  jtjkrt| } |r ttddid}t| t	rt
| } ntj}t  jr0|s0|| | dS t  jr=|r?|| | dS dS dS )a  
    Save the data to disk. Use in place of `torch.save()`.

    Args:
        obj:
            The data to save
        f:
            The file (or file-like object) to use to save the data
        save_on_each_node (`bool`, *optional*, defaults to `False`):
            Whether to only save on the global main process
        safe_serialization (`bool`, *optional*, defaults to `False`):
            Whether to save `obj` using `safetensors` or the traditional PyTorch way (that uses `pickle`).
    formatpt)metadataN)r   distributed_typer   XLAxm_maybe_convert_to_cpur   safe_save_filer   r   r   r   saveis_main_processis_local_main_process)objfr   r   	save_funcr!   r!   r"   r   ^  s   

r   z2.0.0z1.25.0c              	   K   s   z;t  rtj }d|vrd|d< tjt n|dd tj| fd|i|}W t  r:tj  |r:tj| |S t  rMtj  |rNtj| w w w )a  
    Compatible drop-in replacement of `torch.load()` which allows for `weights_only` to be used if `torch` version is
    2.4.0 or higher. Otherwise will ignore the kwarg.

    Will also add (and then remove) an exception for numpy arrays

    Args:
        f:
            The file (or file-like object) to use to load the data
        map_location:
            a function, `torch.device`, string or a dict specifying how to remap storage locations
        **kwargs:
            Additional keyword arguments to pass to `torch.load()`.
    weights_onlyTNmap_location)	r   r   serializationget_safe_globalsadd_safe_globalsTORCH_SAFE_GLOBALSrb   loadclear_safe_globals)r   r   kwargsold_safe_globals
loaded_objr!   r!   r"   r     s&   


r   c                 C   sH   t | dst | dst| d| } t | dr| jS t | dr | jS t| S )z(
    Gets a pretty name from `obj`.
    __qualname____name__r)   )r   rc   r   r   r{   )r   r!   r!   r"   get_pretty_name  s   

r   c                 C   s@   |   D ]\}}t|tr||i }t|| q|||< q|S )z
    Recursively merges two dictionaries.

    Args:
        source (`dict`): The dictionary to merge into `destination`.
        destination (`dict`): The dictionary to merge `source` into.
    )rz   r   dict
setdefaultmerge_dicts)sourcedestinationkeyvaluenoder!   r!   r"   r     s   

r   portc                 C   sR   | du rd} t  t jt j}|d| fdkW  d   S 1 s"w   Y  dS )z
    Checks if a port is in use on `localhost`. Useful for checking if multiple `accelerate launch` commands have been
    run and need to see if the port is already in use.
    Ni<s  	localhostr   )socketAF_INETSOCK_STREAM
connect_ex)r   sr!   r!   r"   is_port_in_use  s
   $r   c                  C   sJ   t  t jt j} | d |  d W  d   S 1 sw   Y  dS )z
    Gets a free port on `localhost`. Useful for automatic port selection when port 0 is specified in distributed
    training scenarios.

    Returns:
        int: An available port number
    ) r   r   N)r   r   r   bindgetsockname)r   r!   r!   r"   get_free_port  s   

$r   c                 C   sB   dD ]}| dk rt | d d|   S | d } qt | d dS )z7Converts `size` from bytes to the largest possible unit)bytesKBMBGBTBg      @r	    z PB)round)sizexr!   r!   r"   convert_bytes  s
   
r   c                  C   sj   t  } | j}|dkrdS td| j^}}}d}t|t|k r3d| d| d}tj|dd	 dS dS )
zFWarns if the kernel version is below the recommended minimum on Linux.LinuxNz(\d+\.\d+\.\d+)z5.5.0zDetected kernel version z,, which is below the recommended minimum of zo; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.T)main_process_only)	platformunamesystemresplitreleaser   r~   r   )infor   _versionmin_versionmsgr!   r!   r"   check_os_kernel  s   r   attrc                 C   s   dd }t || g|d S )z
    Recursive `getattr`.

    Args:
        obj:
            A class instance holding the attribute.
        attr (`str`):
            The attribute that is to be retrieved, e.g. 'attribute1.attribute2'.
    c                 S   s
   t | |S r2   )rc   )r   r   r!   r!   r"   _getattr  s   
z#recursive_getattr.<locals>._getattr.)r   r   )r   r   r   r!   r!   r"   recursive_getattr  s   r   return_fqnsc           
      C   s   |s| nd| f}|g}g }|rS|  }|r|\}}| D ]$\}}t|tjjrA|r<|r2|d | n|}	||	|f q|| q|rL|||f n|| |s|ddd S )aA  Traverse the model in bottom-up order and return the children modules in that order.

    Args:
        model (`torch.nn.Module`): the model to get the children of

    Returns:
        `list[torch.nn.Module]`: a list of children modules of `model` in bottom-up order. The last element is the
        `model` itself.
    r   r   N)rb   r8   r   r   r-   r@   r3   )
rE   r   topstackordered_modulescurrent_modulecurrent_module_namer<   r   
child_namer!   r!   r"   get_module_children_bottom_up  s&   


r   )TTF)FFr2   )F)Urw   r   r   r   codecsr   r   	functoolsr   r   typesr   numpynpr   packaging.versionr   safetensors.torchr   r   commands.config.defaultr
   loggingr   stater   	constantsr   dataclassesr   importsr   r   r   r   r   modelingr   transformer_enginer   versionsr   r   r~   torch_xla.core.xla_modelcore	xla_modelr   r-   r@   boolr#   r(   r0   r1   rA   rC   rN   rj   rk   r   r   r   _corenp_core
multiarray_reconstructndarraydtyper   r3   dtypesUInt32DTyper   r   r   intr   r   r   r   r{   r   ry   r   r!   r!   r!   r"   <module>   sv   
	H
U#$
	
 
(