o
    پi><                     @   s  d dl mZmZ d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZ d dlmZ d dlm Z  ee!Z"dejj#dej$dejj#fddZ%					d1de&ej' de(e)ef de*e) dej+de,de,dej-dej-de.de.d ej-dB d!e.d"e.dejj'fd#d$Z/de dg dd%de.d&e.d'edB d(e
dB d)e*ee)ej'ge.f  d!e.ddfd*d+Z0			d2d,eejj'B d-ee1e)ej$f ddf dej+dej-dB d"e.de.d.ee)ge1e)eef f dB defd/d0Z2dS )3    )Callable	Generator)chain)AnyN)nn)
DeviceMeshinit_device_mesh)distribute_tensor)CPUOffloadPolicy
FSDPModuleMixedPrecisionPolicyfully_shard)_IncompatibleKeys)get_param_names_mappinghf_to_custom_state_dictset_default_torch_dtype)safetensors_weights_iterator)current_platform)init_logger)set_mixed_precision_policyactual_paramtensorreturnc                 C   sR   | j }z
|j||dd}W n ty   |||}Y nw |j| j d|_|S )NFrequires_grad)	__class____new__	TypeError__dict__updater   )r   r   cls	new_param r"   b/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/loader/fsdp_load.py_make_param_like(   s   r$   FT	model_clsinit_paramsweight_dir_listdevicehsdp_replicate_dimhsdp_shard_dimparam_dtypereduce_dtypecpu_offloadfsdp_inferenceoutput_dtypepin_cpu_memorystrictc              	   C   s^  |r|nt j}t|||
dd}t|||
|d t|$ t d | di |}W d   n1 s3w   Y  W d   n1 sBw   Y  |	}t rTd}t	d |rv|| }|	s`|}d}t
tj||fdd	}t||d
|||j|d t|}t|j}t|||||||d t| | D ]\}}|jrtd| dt|t jjrd|_q|S )a  Load a model with optional FSDP (Fully Sharded Data Parallel) support.

    Args:
        param_dtype: Data type for model parameters, also used for:
            - Model initialization context (set_default_torch_dtype)
            - FSDP mixed precision policy
            - Weight loading and casting
        reduce_dtype: Data type for gradient reduction in FSDP mixed precision.
        strict: If True, enforce strict state dict loading (all keys must match).
    F)cast_forward_inputs)r+   r,   r/   	mp_policymetaNz6Disabling FSDP for MPS platform as it's not compatible   )	replicateshard)
mesh_shapemesh_dim_namesT)r-   reshard_after_forwardr3   meshfsdp_shard_conditionsr0   )r1   r-   param_names_mappingzUnexpected param or buffer z on meta device.r"   )torchbfloat16r   r   r   r(   r   is_mpsloggerinfor   device_typeshard_model_fsdp_shard_conditionsr   r   r=   %load_model_from_full_model_state_dictr   named_parametersnamed_buffersis_metaRuntimeError
isinstancer   	Parameterr   )r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   default_torch_dtyper3   modeluse_fsdp
world_sizedevice_meshweight_iteratorparam_names_mapping_fnnpr"   r"   r#   maybe_load_fsdp_model8   sp    


	rV   )r:   r3   r;   r<   r0   r:   r3   r;   r<   c          	         s   |du s
t |dkrtdt| j dS |||d}|r$t|d|d< d}tt|  D ]\ t	 fdd|D rJt
 fi | |d	7 }q.|dkrStd
t
| fi | dS )ao  
    Utility to shard a model with FSDP using the PyTorch Distributed fully_shard API.

    This method will over the model's named modules from the bottom-up and apply shard modules
    based on whether they meet any of the criteria from shard_conditions.

    Args:
        model (TransformerDecoder): Model to shard with FSDP.
        cpu_offload (bool): If set to True, FSDP will offload parameters, gradients, and optimizer
            states to CPU.
        reshard_after_forward (bool): Whether to reshard parameters and buffers after
            the forward pass. Setting this to True corresponds to the FULL_SHARD sharding strategy
            from FSDP1, while setting it to False corresponds to the SHARD_GRAD_OP sharding strategy.
        mesh (Optional[DeviceMesh]): Device mesh to use for FSDP sharding under multiple parallelism.
            Default to None.
        fsdp_shard_conditions (List[Callable[[str, nn.Module], bool]]): A list of functions to determine
            which modules to shard with FSDP.
        pin_cpu_memory (bool): If set to True, FSDP will pin the CPU memory of the offloaded parameters.

    Nr   zPThe FSDP shard condition list is empty or None. No modules will be sharded in %s)r:   r;   r3   )
pin_memoryoffload_policyc                    s   g | ]}| qS r"   r"   ).0shard_conditionmrT   r"   r#   
<listcomp>   s    zshard_model.<locals>.<listcomp>r5   zXNo layer modules were sharded. Please check if shard conditions are working as expected.)lenrA   warningtype__name__r
   reversedlistnamed_modulesanyr   
ValueError)	rN   r-   r:   r3   r;   r<   r0   fsdp_kwargsnum_layers_shardedr"   r[   r#   rD      s.   rD   rN   full_sd_iteratorr=   c                    s  |   }t|  }t||\}	}
t| tp tdd | D }t|		 }i }|D ]}|	| }|
|}|du rP|s>|rFtd| dtd| d q+|rT|n|j}t|ds|j||d	}|
|}|durrt|d
dnd}|dur|dus~J tj|||d	}t|dd}t||}| s| sd}||_||| |j}n|}|r|s| }n|j||d	}t||j|j}|r|d}d}tj||d||< q+|
| _t |	 t |	  }|rtd| g d}|D ]k t fdd|D st!d | td  d| d|
 }d v s d v r$tj"}ntj#}t|ds?||||d	}|r>|s>| }n||||d	}t||j|j}|rU| }t|| < q| j$||ddS )a$  
    Converting full state dict into a sharded state dict
    and loading it into FSDP model (if training) or normal huggingface model
    Args:
        model (Union[FSDPModule, torch.nn.Module]): Model to generate fully qualified names for cpu_state_dict
        full_sd_iterator (Generator): an iterator yielding (param_name, tensor) pairs
        device (torch.device): device used to move full state dict tensors
        param_dtype (torch.dtype): dtype used to move full state dict tensors. If none, respect original dtype from checkpoint
        strict (bool): flag to check if to load the model in strict mode
        cpu_offload (bool): flag to check if FSDP offload is enabled
        param_names_mapping (Optional[Callable[[str], str]]): a function that maps full param name to sharded param name
    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    c                 s   s    | ]}t |d V  qdS )rQ   N)hasattr)rY   rU   r"   r"   r#   	<genexpr>   s    

z8load_model_from_full_model_state_dict.<locals>.<genexpr>Nz
Parameter zQ not found in custom model state dict. The hf to custom mapping may be incorrect.zParameter 'zY' from checkpoint not found in model; skipping. This is expected for optional parameters.rQ   )r(   dtypeweight_loaderr   Fcpur   z0Found unloaded parameters in meta state dict: %s)gate_compresswcscaleswtscalebiasc                 3   s    | ]}| v V  qd S )Nr"   )rY   patternnew_param_namer"   r#   rk   U  s    z3Unsupported new parameter: %s. Allowed patterns: %szNew parameter 'z9' is not supported. Currently only parameters containing z are allowed.rp   rq   T)r1   assign)%
state_dictdictrG   r   rK   r   re   valuessortedkeysgetrf   rA   r_   rl   rj   togetattrr>   
empty_liker$   is_floating_point
is_complexr   datarn   r	   rQ   
placementsr   rL   reverse_param_names_mappingseterror	ones_like
zeros_likeload_state_dict)rN   ri   r(   r+   r1   r-   r=   meta_sd
param_dictcustom_param_sdr   is_fsdp_modelsorted_param_names
sharded_sdtarget_param_namefull_tensormeta_sharded_paramtarget_dtyper   rm   sharded_tensorr   
temp_paramunused_keysALLOWED_NEW_PARAM_PATTERNS	init_liker"   rt   r#   rF      s   








rF   )FFNTT)FFN)3collections.abcr   r   	itertoolsr   typingr   r>   r   torch.distributedr   r   torch.distributed._tensorr	   torch.distributed.fsdpr
   r   r   r   torch.nn.modules.moduler   *sglang.multimodal_gen.runtime.loader.utilsr   r   r   1sglang.multimodal_gen.runtime.loader.weight_utilsr   'sglang.multimodal_gen.runtime.platformsr   1sglang.multimodal_gen.runtime.utils.logging_utilsr   sglang.multimodal_gen.utilsr   ra   rA   rL   Tensorr$   r`   Modulerx   strrc   r(   intrl   boolrV   rD   tuplerF   r"   r"   r"   r#   <module>   s   

	

`	
F
