o
    ÔÙ¾i><  ã                   @   s  d dl mZmZ d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZ d dlmZ d dlm Z  ee!ƒZ"dejj#dej$dejj#fdd„Z%					d1de&ej' de(e)ef de*e) dej+de,de,dej-dej-de.de.d ej-dB d!e.d"e.dejj'fd#d$„Z/deƒ dg dd%œde.d&e.d'edB d(e
dB d)e*ee)ej'ge.f  d!e.ddfd*d+„Z0			d2d,eejj'B d-ee1e)ej$f ddf dej+dej-dB d"e.de.d.ee)ge1e)eef f dB defd/d0„Z2dS )3é    )ÚCallableÚ	Generator)Úchain)ÚAnyN)Únn)Ú
DeviceMeshÚinit_device_mesh)Údistribute_tensor)ÚCPUOffloadPolicyÚ
FSDPModuleÚMixedPrecisionPolicyÚfully_shard)Ú_IncompatibleKeys)Úget_param_names_mappingÚhf_to_custom_state_dictÚset_default_torch_dtype)Úsafetensors_weights_iterator)Úcurrent_platform)Úinit_logger)Úset_mixed_precision_policyÚactual_paramÚtensorÚreturnc                 C   sR   | j }z
|j||dd}W n ty   | ||¡}Y nw |j | j¡ d|_|S )NF©Úrequires_grad)Ú	__class__Ú__new__Ú	TypeErrorÚ__dict__Úupdater   )r   r   ÚclsÚ	new_param© r"   úb/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/loader/fsdp_load.pyÚ_make_param_like(   s   ÿr$   FTÚ	model_clsÚinit_paramsÚweight_dir_listÚdeviceÚhsdp_replicate_dimÚhsdp_shard_dimÚparam_dtypeÚreduce_dtypeÚcpu_offloadÚfsdp_inferenceÚoutput_dtypeÚpin_cpu_memoryÚstrictc              	   C   s^  |r|nt j}t|||
dd}t|||
|d t|ƒ$ t  d¡ | di |¤Ž}W d  ƒ n1 s3w   Y  W d  ƒ n1 sBw   Y  |	}t ¡ rTd}t 	d¡ |rv|| }|	s`|}d}t
tj||fdd	}t||d
|||j|d t|ƒ}t|jƒ}t|||||||d t| ¡ | ¡ ƒD ]\}}|jr¢td|› dƒ‚t|t jjƒr¬d|_q“|S )aÔ  Load a model with optional FSDP (Fully Sharded Data Parallel) support.

    Args:
        param_dtype: Data type for model parameters, also used for:
            - Model initialization context (set_default_torch_dtype)
            - FSDP mixed precision policy
            - Weight loading and casting
        reduce_dtype: Data type for gradient reduction in FSDP mixed precision.
        strict: If True, enforce strict state dict loading (all keys must match).
    F)Úcast_forward_inputs)r+   r,   r/   Ú	mp_policyÚmetaNz6Disabling FSDP for MPS platform as it's not compatibleé   )Ú	replicateÚshard)Ú
mesh_shapeÚmesh_dim_namesT)r-   Úreshard_after_forwardr3   ÚmeshÚfsdp_shard_conditionsr0   )r1   r-   Úparam_names_mappingzUnexpected param or buffer z on meta device.r"   )ÚtorchÚbfloat16r   r   r   r(   r   Úis_mpsÚloggerÚinfor   Údevice_typeÚshard_modelÚ_fsdp_shard_conditionsr   r   r=   Ú%load_model_from_full_model_state_dictr   Únamed_parametersÚnamed_buffersÚis_metaÚRuntimeErrorÚ
isinstancer   Ú	Parameterr   )r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   Údefault_torch_dtyper3   ÚmodelÚuse_fsdpÚ
world_sizeÚdevice_meshÚweight_iteratorÚparam_names_mapping_fnÚnÚpr"   r"   r#   Úmaybe_load_fsdp_model8   sp   ÿüÿ€ 
üù

ù	€rV   )r:   r3   r;   r<   r0   r:   r3   r;   r<   c          	         sº   |du s
t |ƒdkrt dt| ƒj¡ dS |||dœ}|r$t|d|d< d}tt|  ¡ ƒƒD ]\‰‰ t	‡ ‡fdd„|D ƒƒrJt
ˆ fi |¤Ž |d	7 }q.|dkrStd
ƒ‚t
| fi |¤Ž dS )ao  
    Utility to shard a model with FSDP using the PyTorch Distributed fully_shard API.

    This method will over the model's named modules from the bottom-up and apply shard modules
    based on whether they meet any of the criteria from shard_conditions.

    Args:
        model (TransformerDecoder): Model to shard with FSDP.
        cpu_offload (bool): If set to True, FSDP will offload parameters, gradients, and optimizer
            states to CPU.
        reshard_after_forward (bool): Whether to reshard parameters and buffers after
            the forward pass. Setting this to True corresponds to the FULL_SHARD sharding strategy
            from FSDP1, while setting it to False corresponds to the SHARD_GRAD_OP sharding strategy.
        mesh (Optional[DeviceMesh]): Device mesh to use for FSDP sharding under multiple parallelism.
            Default to None.
        fsdp_shard_conditions (List[Callable[[str, nn.Module], bool]]): A list of functions to determine
            which modules to shard with FSDP.
        pin_cpu_memory (bool): If set to True, FSDP will pin the CPU memory of the offloaded parameters.

    Nr   zPThe FSDP shard condition list is empty or None. No modules will be sharded in %s)r:   r;   r3   )Ú
pin_memoryÚoffload_policyc                    s   g | ]}|ˆˆ ƒ‘qS r"   r"   )Ú.0Úshard_condition©ÚmrT   r"   r#   Ú
<listcomp>Ç   s    zshard_model.<locals>.<listcomp>r5   zXNo layer modules were sharded. Please check if shard conditions are working as expected.)ÚlenrA   ÚwarningÚtypeÚ__name__r
   ÚreversedÚlistÚnamed_modulesÚanyr   Ú
ValueError)	rN   r-   r:   r3   r;   r<   r0   Úfsdp_kwargsÚnum_layers_shardedr"   r[   r#   rD   ”   s.   þý€ÿrD   rN   Úfull_sd_iteratorr=   c                    sÊ  |   ¡ }t|  ¡ ƒ}t||ƒ\}	}
t| tƒp tdd„ | ¡ D ƒƒ}t|	 	¡ ƒ}i }|D ]¨}|	| }| 
|¡}|du rP|s>|rFtd|› dƒ‚t d|› d¡ q+|rT|n|j}t|dƒs²|j||d	}| 
|¡}|durrt|d
dƒnd}|dur§|dus~J ‚tj|||d	}t|ddƒ}t||ƒ}| ¡ s›| ¡ s›d}||_|||ƒ |j}n|}|r±|s±| ¡ }n|j||d	}t||j|jƒ}|rÈ| d¡}d}tj||d||< q+|
| _t | 	¡ ƒt | 	¡ ƒ }|rët d|¡ g d¢}|D ]k‰ t‡ fdd„|D ƒƒst !dˆ |¡ tdˆ › d|› dƒ‚| 
ˆ ¡}dˆ v s dˆ v r$tj"}ntj#}t|dƒs?||||d	}|r>|s>| ¡ }n||||d	}t||j|jƒ}|rU| ¡ }t |¡|ˆ < qñ| j$||ddS )a$  
    Converting full state dict into a sharded state dict
    and loading it into FSDP model (if training) or normal huggingface model
    Args:
        model (Union[FSDPModule, torch.nn.Module]): Model to generate fully qualified names for cpu_state_dict
        full_sd_iterator (Generator): an iterator yielding (param_name, tensor) pairs
        device (torch.device): device used to move full state dict tensors
        param_dtype (torch.dtype): dtype used to move full state dict tensors. If none, respect original dtype from checkpoint
        strict (bool): flag to check if to load the model in strict mode
        cpu_offload (bool): flag to check if FSDP offload is enabled
        param_names_mapping (Optional[Callable[[str], str]]): a function that maps full param name to sharded param name
    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    c                 s   s    | ]}t |d ƒV  qdS )rQ   N)Úhasattr)rY   rU   r"   r"   r#   Ú	<genexpr>÷   s   € 

ÿz8load_model_from_full_model_state_dict.<locals>.<genexpr>Nz
Parameter zQ not found in custom model state dict. The hf to custom mapping may be incorrect.zParameter 'zY' from checkpoint not found in model; skipping. This is expected for optional parameters.rQ   )r(   ÚdtypeÚweight_loaderr   FÚcpur   z0Found unloaded parameters in meta state dict: %s)Úgate_compressÚwcscalesÚwtscaleÚbiasc                 3   s    | ]}|ˆ v V  qd S )Nr"   )rY   Úpattern©Únew_param_namer"   r#   rk   U  s   € z3Unsupported new parameter: %s. Allowed patterns: %szNew parameter 'z9' is not supported. Currently only parameters containing z are allowed.rp   rq   T)r1   Úassign)%Ú
state_dictÚdictrG   r   rK   r   re   ÚvaluesÚsortedÚkeysÚgetrf   rA   r_   rl   rj   ÚtoÚgetattrr>   Ú
empty_liker$   Úis_floating_pointÚ
is_complexr   Údatarn   r	   rQ   Ú
placementsr   rL   Úreverse_param_names_mappingÚsetÚerrorÚ	ones_likeÚ
zeros_likeÚload_state_dict)rN   ri   r(   r+   r1   r-   r=   Úmeta_sdÚ
param_dictÚcustom_param_sdr„   Úis_fsdp_modelÚsorted_param_namesÚ
sharded_sdÚtarget_param_nameÚfull_tensorÚmeta_sharded_paramÚtarget_dtyper   rm   Úsharded_tensorr   Ú
temp_paramÚunused_keysÚALLOWED_NEW_PARAM_PATTERNSÚ	init_liker"   rt   r#   rF   Õ   sÊ   ÿÿ

ÿ
ÿ

ÿýÿ
ÿÿ
€ý
ÿýÿÿ
ÿ€ÿýrF   )FFNTT)FFN)3Úcollections.abcr   r   Ú	itertoolsr   Útypingr   r>   r   Útorch.distributedr   r   Útorch.distributed._tensorr	   Útorch.distributed.fsdpr
   r   r   r   Útorch.nn.modules.moduler   Ú*sglang.multimodal_gen.runtime.loader.utilsr   r   r   Ú1sglang.multimodal_gen.runtime.loader.weight_utilsr   Ú'sglang.multimodal_gen.runtime.platformsr   Ú1sglang.multimodal_gen.runtime.utils.logging_utilsr   Úsglang.multimodal_gen.utilsr   ra   rA   rL   ÚTensorr$   r`   ÚModulerx   Ústrrc   r(   Úintrl   ÚboolrV   rD   ÚtuplerF   r"   r"   r"   r#   Ú<module>   sÀ   ÿÿ
þóÿ
þýüûúùø	÷
öõôó
ò`øýüûúùø	
÷Fù
ÿþýüûúùø