o
    Gi                     @   sn  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dl mZ d dlm	Z	 d dl
mZmZ d dlZd dlZeedddurTd dlmZmZ d dlmZ d dlmZ d	d
lmZ d	dlmZ d	dlmZ d	dlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& e& rd dl'Z'e'j(j)* rd dl)Z)e" rd dl+m,Z, e# rd dl-m.Z. e% rd dl/m0Z0 e$ rd dl1Z1de2fddZ3dd Z4de5fddZ6	dVdededej7dej7dej7dej7d ej7d!e8d"e9ej7ej7f fd#d$Z:ded"e;e5ej7f fd%d&Z<ej=fd'ej>j?e@ej>j? B fd(d)ZAd*e;e5ej7f d+e5d,ej>j?fd-d.ZBd/e;e5ej>j?f d"e;e5ef fd0d1ZC				2	dWd3e5d4e2d5e8d6e8d7e8d8ejDe5B d9ejEdB fd:d;ZFdXd3e5fd<d=ZGd>d? ZHed@dAdBej>j?eB d8e5ejDB dCeIfdDdEZJdFdG ZKdHdI ZLd"e;fdJdKZMd"e;fdLdMZN	@	@	@		dYd'ej>j?d8e5ejDB dCeIdNeIdOeIdPe;e5ef dB dQeOePej>j?  dB d"efdRdSZQG dTdU dUZRdS )Z    N)contextmanager)partial)AnyIterabledistributed)
CPUOffloadShardingStrategy)FullyShardedDataParallel)transformer_auto_wrap_policy   )UNet2DConditionModel)DiffusionPipeline)SchedulerMixin)convert_state_dict_to_diffusersconvert_state_dict_to_peft	deprecateis_accelerate_availableis_peft_availableis_torch_npu_availableis_torchvision_availableis_transformers_available)
get_logger)set_peft_model_state_dict)
transformsseedc                 C   sF   t |  tj |  t|  t rtj|  dS tj|  dS )z
    Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.

    Args:
        seed (`int`): The seed to set.

    Returns:
        `None`
    N)	randomr   nptorchmanual_seedr   npumanual_seed_allcuda)r    r"   L/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/training_utils.pyset_seed7   s   


r$   c                 C   s   | j }|d }d| d }|j|jd|  }t|jt|jk r0|d }t|jt|jk s"||j}|j|jd|  }t|jt|jk rY|d }t|jt|jk sK||j}|| d }|S )a  
    Computes SNR as per
    https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
    for the given timesteps using the provided noise scheduler.

    Args:
        noise_scheduler (`NoiseScheduler`):
            An object containing the noise schedule parameters, specifically `alphas_cumprod`, which is used to compute
            the SNR values.
        timesteps (`torch.Tensor`):
            A tensor of timesteps for which the SNR is computed.

    Returns:
        `torch.Tensor`: A tensor containing the computed SNR values for each timestep.
          ?      ?device).N   )alphas_cumprodtor(   floatlenshapeexpand)noise_scheduler	timestepsr*   sqrt_alphas_cumprodsqrt_one_minus_alphas_cumprodalphasigmasnrr"   r"   r#   compute_snrK   s   r7   interpolation_typec                 C   s   t  std| dkrtjj}|S | dkrtjj}|S | dkr%tjj}|S | dkr/tjj}|S | dkr9tjj}|S | dkrCtjj	}|S | dkrMtjj
}|S td	|  d
)a  
    Maps a string describing an interpolation function to the corresponding torchvision `InterpolationMode` enum. The
    full list of supported enums is documented at
    https://pytorch.org/vision/0.9/transforms.html#torchvision.transforms.functional.InterpolationMode.

    Args:
        interpolation_type (`str`):
            A string describing an interpolation method. Currently, `bilinear`, `bicubic`, `box`, `nearest`,
            `nearest_exact`, `hamming`, and `lanczos` are supported, corresponding to the supported interpolation modes
            in torchvision.

    Returns:
        `torchvision.transforms.InterpolationMode`: an `InterpolationMode` enum used by torchvision's `resize`
        transform.
    zhPlease make sure to install `torchvision` to be able to use the `resolve_interpolation_mode()` function.bilinearbicubicboxnearestnearest_exacthamminglanczoszThe given interpolation mode z is not supported. Currently supported interpolation modes are `bilinear`, `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`.)r   ImportErrorr   InterpolationModeBILINEARBICUBICBOXNEARESTNEAREST_EXACTHAMMINGLANCZOS
ValueError)r8   interpolation_moder"   r"   r#   resolve_interpolation_modep   s8   	
rK   r&   unetr0   r1   noisenoisy_latentstargetencoder_hidden_statesdream_detail_preservationreturnc                 C   s   |j |j|dddf }d| d }	|	| }
d}t  | |||j}W d   n1 s/w   Y  d\}}|jjdkr[|}||  }|	|
 |
|	| }|
|}||fS |jjdkretdtd|jj )	a  
    Implements "DREAM (Diffusion Rectification and Estimation-Adaptive Models)" from
    https://huggingface.co/papers/2312.00210. DREAM helps align training with sampling to help training be more
    efficient and accurate at the cost of an extra forward step without gradients.

    Args:
        `unet`: The state unet to use to make a prediction.
        `noise_scheduler`: The noise scheduler used to add noise for the given timestep.
        `timesteps`: The timesteps for the noise_scheduler to user.
        `noise`: A tensor of noise in the shape of noisy_latents.
        `noisy_latents`: Previously noise latents from the training loop.
        `target`: The ground-truth tensor to predict after eps is removed.
        `encoder_hidden_states`: Text embeddings from the text model.
        `dream_detail_preservation`: A float value that indicates detail preservation level.
          See reference.

    Returns:
        `tuple[torch.Tensor, torch.Tensor]`: Adjusted noisy_latents and target.
    Nr&   r%   )NNepsilonv_predictionz/DREAM has not been implemented for v-predictionzUnknown prediction type )r*   r+   r(   r   no_gradsampleconfigprediction_typedetachmul_addNotImplementedErrorrI   )rL   r0   r1   rM   rN   rO   rP   rQ   r*   r3   dream_lambdapred_noisy_latents_targetpredicted_noisedelta_noiser"   r"   r#    compute_dream_and_update_latents   s$   


rc   c                 C   sb   i }|   D ](\}}t|dr.t|d}|dur.| }| D ]\}}||| d| < q q|S )zL
    Returns:
        A state dict containing just the LoRA parameters.
    set_lora_layer
lora_layerNz.lora.)named_moduleshasattrgetattr
state_dictitems)rL   lora_state_dictnamemodulere   current_lora_layer_sdlora_layer_matrix_name
lora_paramr"   r"   r#   unet_lora_state_dict   s   

rq   modelc                 C   s>   t | ts| g} | D ]}| D ]}|jr|||_qq
dS )z
    Casts the training parameters of the model to the specified data type.

    Args:
        model: The PyTorch model whose parameters will be cast.
        dtype: The data type to which the model parameters will be cast.
    N)
isinstancelist
parametersrequires_gradr+   data)rr   dtypemparamr"   r"   r#   cast_training_params   s   
r{   rk   prefixtext_encoderc                    s4    fdd|   D }tt|}t||dd dS )aD  
    Sets the `lora_state_dict` into `text_encoder` coming from `transformers`.

    Args:
        lora_state_dict: The state dictionary to be set.
        prefix: String identifier to retrieve the portion of the state dict that belongs to `text_encoder`.
        text_encoder: Where the `lora_state_dict` is to be set.
    c                    s*   i | ]\}}|  r| d  |qS ) )
startswithreplace.0kvr|   r"   r#   
<dictcomp>  s
    z5_set_state_dict_into_text_encoder.<locals>.<dictcomp>default)adapter_nameN)rj   r   r   r   )rk   r|   r}   text_encoder_state_dictr"   r   r#   !_set_state_dict_into_text_encoder   s
   
r   modules_to_savec                 C   s:   i }|   D ]\}}|d ur|jd  || d< q|S )Nr   _lora_adapter_metadata)rj   peft_configto_dict)r   	metadatasmodule_namerm   r"   r"   r#   _collate_lora_metadata	  s   r   cpuweighting_scheme
batch_size
logit_mean	logit_std
mode_scaler(   	generatorc                 C   s   | dkrt j|||f||d}t jj|}|S | dkr=t j|f||d}d| |t tj| d d d |   }|S t j|f||d}|S )a  
    Compute the density for sampling the timesteps when doing SD3 training.

    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

    SD3 paper reference: https://huggingface.co/papers/2403.03206v1.
    logit_normal)meanstdsizer(   r   mode)r   r(   r   r   r)   )	r   normalnn
functionalsigmoidrandcosmathpi)r   r   r   r   r   r(   r   ur"   r"   r#   %compute_density_for_timestep_sampling  s   ,r   c                 C   sX   | dkr|d   }|S | dkr%dd|  d|d   }dtj|  }|S t|}|S )z
    Computes loss weighting scheme for SD3 training.

    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

    SD3 paper reference: https://huggingface.co/papers/2403.03206v1.
    
sigma_sqrtg       cosmapr   r)   )r,   r   r   r   	ones_like)r   sigmas	weightingbotr"   r"   r#   compute_loss_weighting_for_sd3,  s   
r   c                   C   sx   t   tj rtj  dS tjj rtj  dS t r't	j
  dS ttdr8tj r:tj  dS dS dS )zV
    Runs garbage collection. Then clears the cache of the available accelerator.
    xpuN)gccollectr   r!   is_availableempty_cachebackendsmpsr   	torch_npur   rg   r   r"   r"   r"   r#   free_memory>  s   
r   T)offloadmodulesr   c              
   g   s    |r/t dd |D  }|rdd |D }nt|dksJ |d jg}|D ]}||  q'zdV  W |rEt||D ]\}}|| q;dS dS |rZt||D ]
\}}|| qPw w )a  
    Context manager that, if offload=True, moves each module to `device` on enter, then moves it back to its original
    device on exit.

    Args:
        device (`str` or `torch.Device`): Device to move the `modules` to.
        offload (`bool`): Flag to enable offloading.
    c                 s   s    | ]}t |tV  qd S N)rs   r   r   ry   r"   r"   r#   	<genexpr>Y  s    z!offload_models.<locals>.<genexpr>c                 S   s   g | ]	}t | jqS r"   )nextru   r(   r   r"   r"   r#   
<listcomp>\      z"offload_models.<locals>.<listcomp>r   r   N)anyr-   r(   r+   zip)r(   r   r   is_modeloriginal_devicesry   orig_devr"   r"   r#   offload_modelsN  s*   
r   c                 C   s   | st d|  d}g }|D ]d}td|}|s#t d| dz;t|d}t|d}|dks:|dkr>t d	|d
 dksJ|d
 dkrVtd| d| d |	||f W q t yu } zt d| d| |d}~ww |s|t d|S )zGParses a string defining buckets into a list of (height, width) tuples.zBucket string cannot be empty.;z^\s*(\d+)\s*,\s*(\d+)\s*$zInvalid bucket format: 'z'. Expected 'height,width'.r   r)   r   z,Bucket dimensions must be positive integers.   zBucket dimension (,z.) not divisible by 8. This might cause issues.z Invalid integer in bucket pair 'z': Nz.No valid buckets found in the provided string.)
rI   stripsplitrematchintgroupwarningswarnappend)buckets_strbucket_pairsparsed_bucketspair_strr   heightwidther"   r"   r#   parse_buckets_stringn  s.   r   c           	      C   sJ   t d}d}t|D ]\}\}}t| | ||  }||kr"|}|}q
|S )z6Finds the closes bucket to the given height and width.infN)r,   	enumerateabs)	hwbucket_options
min_metricbest_bucket_idx
bucket_idxbucket_hbucket_wmetricr"   r"   r#   find_nearest_bucket  s   r   c                 C   s   dd |   D S )Nc                 S   s2   i | ]\}}|t |tjr|   n|qS r"   )rs   r   TensorrY   r   
contiguousr   r"   r"   r#   r     s   2 z&_to_cpu_contiguous.<locals>.<dictcomp>)rj   )state_dictsr"   r"   r#   _to_cpu_contiguous  s   r   c                 C   sT   i }t | jdd}|du rtd| jj}|du r tj|d< |S |jp%tj|d< |S )zT
    Extract and convert FSDP config from Accelerator into PyTorch FSDP kwargs.
    fsdp_pluginNzLAccelerate isn't configured to handle FSDP. Please update your installation.sharding_strategy)rh   staterI   r   r   
FULL_SHARDr   )acceleratorkwargs
fsdp_stater   r"   r"   r#    get_fsdp_kwargs_from_accelerator  s   
r   use_orig_paramslimit_all_gathersfsdp_kwargstransformer_layer_clsc                 C   s   t t}|du rt| jjjd }|d|j  tt|hd}||r)t	|dnd|||d}	|r7|	
| t| fi |	}
|
S )uY  
    Wrap a model with FSDP using common defaults and optional transformer auto-wrapping.

    Args:
        model: Model to wrap
        device: Target device (e.g., accelerator.device)
        offload: Whether to enable CPU parameter offloading
        use_orig_params: Whether to use original parameters
        limit_all_gathers: Whether to limit all gathers
        fsdp_kwargs: FSDP arguments (sharding_strategy, etc.) — usually from Accelerate config
        transformer_layer_cls: Classes for auto-wrapping (if not using policy from fsdp_kwargs)

    Returns:
        FSDP-wrapped model
    Nr   z8transformer_layer_cls is not provided, auto-inferred as )r   )offload_params)	device_idcpu_offloadr   r   auto_wrap_policy)r   __name__typerr   language_modellayersinfor   r
   r   updateFSDP)rr   r(   r   r   r   r   r   loggerr   rW   
fsdp_modelr"   r"   r#   wrap_with_fsdp  s   
r  c                   @   sB  e Zd ZdZ									d.d	eejj d
edede	de
dee	B dee	B de
dedB deeef dB fddZed/d0ddZdd Zde	defddZe d	eejj fddZd	eejj ddfdd Zd1d!d"Zd2d1d#d$Zdefd%d&Zd	eejj ddfd'd(Zd	eejj ddfd)d*Zd+eddfd,d-ZdS )3EMAModelz6
    Exponential Moving Average of models weights
    H.?        r   Fr&   UUUUUU?Nru   decay	min_decayupdate_after_stepuse_ema_warmup	inv_gammapowerforeach	model_clsmodel_configc                 K   s  t |tjjrd}tdd|dd | }d}|dddur-d	}tdd|dd |d }|d
ddurCd}td
d|dd |d
 }t|}dd |D | _|dddurid}tdd|dd | j	|d d d| _
|| _|| _|| _|| _|| _|| _d| _d| _|| _|	| _|
| _dS )ar  
        Args:
            parameters (Iterable[torch.nn.Parameter]): The parameters to track.
            decay (float): The decay factor for the exponential moving average.
            min_decay (float): The minimum decay factor for the exponential moving average.
            update_after_step (int): The number of steps to wait before starting to update the EMA weights.
            use_ema_warmup (bool): Whether to use EMA warmup.
            inv_gamma (float):
                Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True.
            power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True.
            foreach (bool): Use torch._foreach functions for updating shadow parameters. Should be faster.
            device (str | torch.device | None): The device to store the EMA weights on. If None, the EMA
                        weights will be stored on CPU.

        @crowsonkb's notes on EMA Warmup:
            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
            at 215.4k steps).
        zzPassing a `torch.nn.Module` to `ExponentialMovingAverage` is deprecated. Please pass the parameters of the module instead.z9passing a `torch.nn.Module` to `ExponentialMovingAverage`1.0.0Fstandard_warnT	max_valueNzCThe `max_value` argument is deprecated. Please use `decay` instead.	min_valuezGThe `min_value` argument is deprecated. Please use `min_decay` instead.c                 S   s   g | ]}|   qS r"   )clonerY   r   pr"   r"   r#   r   '  s    z%EMAModel.__init__.<locals>.<listcomp>r(   z=The `device` argument is deprecated. Please use `to` instead.r'   r   )rs   r   r   Moduler   ru   getrt   shadow_paramsr+   temp_stored_paramsr	  r
  r  r  r  r  optimization_stepcur_decay_valuer  r  r  )selfru   r	  r
  r  r  r  r  r  r  r  r   deprecation_messager"   r"   r#   __init__  sJ   #
zEMAModel.__init__rR   c                 C   s@   |j |dd\}}||}| | ||j|d}|| |S )NT)return_unused_kwargs)r  r  r  )from_configfrom_pretrainedru   rW   load_state_dict)clspathr  r  _
ema_kwargsrr   	ema_modelr"   r"   r#   r%  =  s
   

zEMAModel.from_pretrainedc                 C   sr   | j d u r	td| jd u rtd| j | j}|  }|dd  |jdi | | |  |	| d S )NzJ`save_pretrained` can only be used if `model_cls` was defined at __init__.zM`save_pretrained` can only be used if `model_config` was defined at __init__.r  r"   )
r  rI   r  r$  ri   popregister_to_configcopy_toru   save_pretrained)r   r(  rr   ri   r"   r"   r#   r/  G  s   

zEMAModel.save_pretrainedr  c                 C   sn   t d|| j d }|dkrdS | jr!dd|| j  | j   }nd| d|  }t|| j}t || j}|S )zN
        Compute the decay factor for the exponential moving average.
        r   r   r  
   )maxr  r  r  r  minr	  r
  )r   r  stepr  r"   r"   r#   	get_decayV  s   zEMAModel.get_decayc           
   	   C   s  t |tjjrd}tdd|dd | }t|}|  jd7  _| | j}|| _	d| }t
 }| jrt rEtjj rEtjj|d d}|E dd	 |D }d
d	 t| j|D }t|t|k rxtjdd	 t| j|D dd	 |D dd tj|t|||d W d    d S 1 sw   Y  d S t| j|D ]9\}}	t rtjj rtjj|	d d}| |	jr||||	   n||	 W d    n1 sw   Y  qd S )NzPassing a `torch.nn.Module` to `ExponentialMovingAverage.step` is deprecated. Please pass the parameters of the module instead.z>passing a `torch.nn.Module` to `ExponentialMovingAverage.step`r  Fr  r   )modifier_rankc                 S   s   g | ]}|j r|qS r"   rv   r   rz   r"   r"   r#   r         z!EMAModel.step.<locals>.<listcomp>c                 S   s   g | ]	\}}|j r|qS r"   r6  r   s_paramrz   r"   r"   r#   r     s
    
c                 S   s   g | ]	\}}|j s|qS r"   r6  r9  r"   r"   r#   r     r   c                 S   s   g | ]}|j s|qS r"   r6  r7  r"   r"   r#   r     r8  T)non_blocking)r4   )rs   r   r   r  r   ru   rt   r  r4  r  
contextlibnullcontextr  r   transformersintegrations	deepspeedis_deepspeed_zero3_enabledzeroGatheredParametersr   r  r-   _foreach_copy__foreach_sub__foreach_subrv   sub_copy_)
r   ru   r!  r	  one_minus_decaycontext_managerparams_grads_params_gradr:  rz   r"   r"   r#   r3  i  sZ   
"
zEMAModel.stepc                 C   sj   t |}| jrtdd |D dd t| j|D  dS t| j|D ]\}}|j||j	j q#dS )aa  
        Copy current averaged parameters into given collection of parameters.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
                updated with the stored moving averages. If `None`, the parameters with which this
                `ExponentialMovingAverage` was initialized will be used.
        c                 S      g | ]}|j qS r"   rw   r7  r"   r"   r#   r         z$EMAModel.copy_to.<locals>.<listcomp>c                 S   s   g | ]\}}| |jjqS r"   )r+   r(   rw   r9  r"   r"   r#   r     s    N)
rt   r  r   rD  r   r  rw   rH  r+   r(   )r   ru   r:  rz   r"   r"   r#   r.    s   	zEMAModel.copy_toc                 C   s   dd | j D | _ dS )z
        Move internal buffers of the ExponentialMovingAverage to pinned memory. Useful for non-blocking transfers for
        offloading EMA params to the host.
        c                 S   s   g | ]}|  qS r"   )
pin_memoryr  r"   r"   r#   r     s    z'EMAModel.pin_memory.<locals>.<listcomp>Nr  r   r"   r"   r#   rP    s   zEMAModel.pin_memoryc                    s    fdd| j D | _ dS )z
        Move internal buffers of the ExponentialMovingAverage to `device`.

        Args:
            device: like `device` argument to `torch.Tensor.to`
        c                    s2   g | ]}|  r|j d n|j dqS )r(   rx   r;  )r(   r;  )is_floating_pointr+   r  rS  r"   r#   r     s    zEMAModel.to.<locals>.<listcomp>NrQ  )r   r(   rx   r;  r"   rS  r#   r+     s   zEMAModel.toc              	   C   s&   | j | j| j| j| j| j| j| jdS )z
        Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during
        checkpointing to save the ema state dict.
        r	  r
  r  r  r  r  r  r  rU  rR  r"   r"   r#   ri     s   	zEMAModel.state_dictc                 C   s   dd |D | _ dS )z
        Saves the current parameters for restoring later.

        Args:
            parameters: Iterable of `torch.nn.Parameter`. The parameters to be temporarily stored.
        c                 S   s   g | ]
}|    qS r"   )rY   r   r  r7  r"   r"   r#   r     s    z"EMAModel.store.<locals>.<listcomp>N)r  )r   ru   r"   r"   r#   store  s   zEMAModel.storec                 C   sj   | j du r	td| jrtdd |D dd | j D  nt| j |D ]\}}|j|j q$d| _ dS )aG  
        Restore the parameters stored with the `store` method. Useful to validate the model with EMA parameters
        without: affecting the original optimization process. Store the parameters before the `copy_to()` method. After
        validation (or model saving), use this to restore the former parameters.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
                updated with the stored parameters. If `None`, the parameters with which this
                `ExponentialMovingAverage` was initialized will be used.
        NzGThis ExponentialMovingAverage has no `store()`ed weights to `restore()`c                 S   rM  r"   rN  r7  r"   r"   r#   r     rO  z$EMAModel.restore.<locals>.<listcomp>c                 S   rM  r"   rN  )r   c_paramr"   r"   r#   r     rO  )r  RuntimeErrorr  r   rD  r   rw   rH  )r   ru   rW  rz   r"   r"   r#   restore  s   

zEMAModel.restoreri   c                 C   sh  t |}|d| j| _| jdk s| jdkrtd|d| j| _t| jts-td|d| j| _t| jt	s?td|d	| j
| _
t| j
t	sQtd
|d| j| _t| jtsctd|d| j| _t| jtt	fswtd|d| j| _t| jtt	fstd|dd}|dur|| _t| jtstdtdd | jD stddS dS )a  
        Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing to save the
        ema state dict.

        Args:
            state_dict (dict): EMA state. Should be an object returned
                from a call to :meth:`state_dict`.
        r	  r  r&   zDecay must be between 0 and 1r
  zInvalid min_decayr  zInvalid optimization_stepr  zInvalid update_after_stepr  zInvalid use_ema_warmupr  zInvalid inv_gammar  zInvalid powerr  Nzshadow_params must be a listc                 s   s    | ]	}t |tjV  qd S r   )rs   r   r   r  r"   r"   r#   r   /  s    z+EMAModel.load_state_dict.<locals>.<genexpr>z!shadow_params must all be Tensors)copydeepcopyr  r	  rI   r
  rs   r,   r  r   r  r  boolr  r  r  rt   all)r   ri   r  r"   r"   r#   r&    s>   

zEMAModel.load_state_dict)	r  r  r   Fr&   r  FNN)F)rR   r  )rR   N)NNF)r   
__module____qualname____doc__r   r   r   	Parameterr,   r   r\  r   dictstrr"  classmethodr%  r/  r4  rU   r3  r.  rP  r+   ri   rV  rY  r&  r"   r"   r"   r#   r    sZ    
	

T	9
	r  )r&   )NNNr   Nr   )TTTNN)Sr<  rZ  r   r   r   r   r   r   	functoolsr   typingr   r   numpyr   r   rh   torch.distributed.fsdpr   r   r	   r  torch.distributed.fsdp.wrapr
   modelsr   	pipelinesr   
schedulersr   utilsr   r   r   r   r   r   r   r   r>  r?  r@  rA  accelerate.loggingr   peftr   torchvisionr   r   r   r$   r7   rc  rK   r   r,   tuplerc   rb  rq   float32r   r  rt   r{   r   r   r(   	Generatorr   r   r   r\  r   r   r   r   r   setr   r  r  r"   r"   r"   r#   <module>   s    (%4	
6$
&
,
3