o
    پizy                     @   s  d dl Z d dlZd dlmZmZ d dlZd dlmZmZ d dl	m
Z
 e jG dd dZe jG dd dZejgZd	d
gZddgZededdgddgZededddgddededdgddgZededddedddgdddgZededddedddgddgZededddedddgddgZdejd ejd!ejfd"d#Z	dRd$ejd%ejd&ejd'ejd(ejd)ejd*ed!ejfd+d,Z	dSd$ejd%ejd&ejd'ejd(ejd-ejd.eej d!ejfd/d0Z	
		1	1	2	3dTd4ee d5ed6ejd7ed8ed9ee d:ed;ed<ed=efd>d?Z 	
	1	1	2	3dUd4ee d5ed6ejd7ed8ed:ed;ed<ed=efd@dAZ!dBdC Z"g dDZ#d4ee dEee fdFdGZ$	1	H	I	1	I	dVdJee dKedLed:edMed;ed9ee fdNdOZ%	H	I	I	dWdJee dLed:ed;ed9ee f
dPdQZ&dS )X    N)ListOptional)HFRunner	SRTRunner)calculate_rouge_lc                   @   s:   e Zd ZU eed< dZeed< dZeed< dZeed< dS )LoRAAdaptornameNprefill_tolerancedecode_tolerancerouge_l_tolerance)	__name__
__module____qualname__str__annotations__r	   floatr
   r    r   r   J/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/lora_utils.pyr      s
   
 r   c                   @   s   e Zd ZU eed< ee ed< dZeed< dZ	e
ed< dZe
ed< dZe
ed	< dZeed
< dZee ed< dZeed< dd ZdS )LoRAModelCasebaseadaptors   tp_size皙?r	   r
   g      ?r   max_loras_per_batchNmax_loaded_lorasFskip_long_promptc                 C   s:   t | j| jkrtd| j dt | j d| j dd S )Nz
For base 'z', number of adaptors (z") must be <= max_loras_per_batch ())lenr   r   
ValueErrorr   )selfr   r   r   __post_init__   s   zLoRAModelCase.__post_init__)r   r   r   r   r   r   r   r   intr	   r   r
   r   r   r   r   r   boolr!   r   r   r   r   r      s   
 r   tritoncsgmv,AI is a field of computer science focused on  
    ### Instruction:
    Tell me about llamas and alpacas
    ### Response:
    Llamas are large, long-necked animals with a woolly coat. They have two toes on each foot instead of three like other camelids (camels, dromedaries). Llamas live in the Andean mountains of South America where they graze on grasses and shrubs. Alpaca is another name for domesticated llama. The word "alpaca" comes from an Incan language meaning "golden fleece." Alpacas look very similar to llamas but are smaller than their wild relatives. Both species were used by ancient people as pack animals and for meat. Today both llamas and alpacas are raised primarily for their fiber which can be spun into yarn or knitted into clothing.
    ### Question 2:
    What do you know about llamas?
    ### Answer:
    z meta-llama/Llama-3.1-8B-Instructz3algoprog/fact-generation-llama-3.1-8b-instruct-lorar   r   )r   r   r   z2Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16r   )r   r	   zmeta-llama/Llama-2-7b-hfzwinddude/wizardLM-LlaMA-LoRA-7B   z*RuterNorway/Llama-2-7b-chat-norwegian-LoRag333333?   )r   r   r   r   zQwen/Qwen3-4Bznissenj/Qwen3-4B-lora-v2z&TanXS/Qwen3-4B-LoRA-ZH-WebNovelty-v0.0abreturnc                 C   s    t |  | }|| jS )z?Matrix multiplication with mixed precision handling for float16)torchmatmulr   todtype)r+   r,   resultr   r   r   safe_matmul   s   r3   xweightsweight_indicesseq_lengths
lora_rankslora_scalings
num_slicesc                 C   s
  |  dkr| jd }tj|d| j| jdS | j\}}|j\}}	}|	| }
tj|||
 | j| jd}d}t|||| || D ]C\}}}}|dkrJq?|dkr~| ||| ddf }||d|| ddf }t|| }|| |||| d|| f< ||7 }q?|S )ae  
    Simple sequence-level reference implementation of SGMV shrink operation.

    Args:
        x: (total_seq_len, input_dim) - Input activations
        weights: (num_loras, num_slices * max_rank, input_dim) - LoRA A weights
        weight_indices: LoRA idx for each sequence
        seq_lengths: Length of each sequence
        lora_ranks: LoRA rank for each LoRA adapters
        lora_scalings: LoRA scaling for each LoRA adapters
        num_slices: Number of slices (3 for QKV, 2 for gate_up, 1 for others)

    Returns:
        output: (total_seq_len, num_slices * max_rank) - Intermediate activations
    r   r1   deviceN)	numelshaper.   zerosr1   r<   zipr3   t)r4   r5   r6   r7   r8   r9   r:   total_seq_len_weight_out_dimmax_rankoutputtoken_offsetlora_idxseq_lenrankscalingx_seqw_seqr2   r   r   r   reference_sgmv_shrink   s4   


rN   slice_offsetsbase_outputc                 C   s  |  dkr$| jd }t|dkr|d  nd}tj||| j| jdS | j\}}	t|d }
|dur8| }n|d  }tj||| j| jd}d}t	|||| D ]k\}}}|dkr]qS|dkr| ||| d|
| f }t
|
D ]F}|| }|d | }||  }||d   }|dd||f }||||d|f }t|| }|||| ||f  |7  < qs||7 }qS|S )aG  
    Simple sequence-level reference implementation of SGMV expand operation.

    Args:
        x: (total_seq_len, num_slices * max_rank) - Intermediate activations
        weights: (num_loras, output_dim, max_rank) - LoRA B weights
        weight_indices: LoRA idx for each sequence
        seq_lengths: Length of each sequence
        lora_ranks: LoRA rank for each LoRA adapters
        slice_offsets: Tensor defining slice boundaries
        base_output: Optional base output to accumulate into

    Returns:
        output: (total_seq_len, total_output_dim) - Final output
    r   r;   r   N)r=   r>   r   itemr.   r?   r1   r<   cloner@   ranger3   rA   )r4   r5   r6   r7   r8   rO   rP   rB   total_output_dimrC   r:   rF   rG   rH   rI   rJ   rL   	slice_idxslice_start_inputslice_end_inputslice_start_outputslice_end_outputx_slicew_slicer2   r   r   r   reference_sgmv_expand   sV   





r]   F)\(? prompts
model_casetorch_dtypemax_new_tokensbackendenable_lora_overlap_loadingdisable_cuda_graphdisable_radix_cachemem_fraction_statictest_tagc
           $      C   s  |j }
dg }}tt| D ]}||j|  |d t|j }qdd |D }td|	 d|j  d| d| d	d
d | D  d| d t|
|d|jdd |jD ||j|j	||||d}|j
| ||d}W d   n1 srw   Y  t|
|d|j|d}|j
| |d}W d   n1 sw   Y  t|
|dd}|j
| ||d}|j
| |d}W d   n1 sw   Y  tt| D ]/}|| }|jdur|jn|j}|jdur|jn|j}|jdur|jn|j}t|j| }t|j| }tt|| }td| t|j| }t|j| }tt|| }td| |j|  }|j|  } t|g| gd }!td|! td| td|  t|j| }"t|j| }#tdtt|#|  tdtt|"|  |jd dkrtt|| |k sJ d|
 d| d| d | d dd!  d"	|jd dkrtt|| |k sJ d#|
 d| d| d | d dd!  d"	|!|k rtd$|! d%| d&|
 d| d| d | d dd!  d"qdS )'a_  
    Input a batch of prompts, and run lora tests one by one with several generate requests
    (each request will have bs=1).
    For prompt0, prompt1, ..., promptN,
    we will use adaptor0, adaptor1, ..., adaptorN included in model case,
    We will then compare the outputs of HF and SRT with and without LoRA.
    If number of prompts is larger than number of adaptors,
    the prompt i will use adaptor i % (number of adaptors).

    Args:
        prompts (List[str]): The batch of prompts to test.
        model_case (LoRAModelCase): The model case to test.
        torch_dtype (torch.dtype): The torch dtype to use.
        max_new_tokens (int): The maximum number of new tokens to generate.
        backend (str): The lora backend to use.
        disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
        mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
        test_tag (str, optional): The tag to use for the test. Defaults to "".
    r   r   c                 S      g | ]}|j qS r   r(   .0adaptorr   r   r   
<listcomp>B      z,run_lora_test_one_by_one.<locals>.<listcomp>
========== Testing 
 on base '' with backend=, dtype= --- Using prompts c                 S      g | ]}|d d qS N2   r   rl   pr   r   r   rn   F       with adaptors:  ---
generationc                 S      g | ]
}|j d ur|j qS Nr(   rk   r   r   r   rn   M      )rb   
model_typer   
lora_pathsre   r   r   lora_backendrf   rg   rh   rc   r   Nrb   r   r   rh   rc   rb   r   zMax prefill diff (HF vs SRT):zMax decode diff (HF vs SRT):ROUGE-L score:SRT output:
HF output:z(Max diff (SRT base vs SRT LoRA prefill):z&Max diff (HF base vs HF LoRA prefill):d   z$Prefill logprobs mismatch for base '', adaptor 'z', backend '', prompt: 'rw   ...'z#Decode logprobs mismatch for base 'ROUGE-L score  below tolerance  for base ')r   rT   r   appendr   printr   r   r   r   forwardr   r	   r
   r   r.   tensortop_input_logprobsmaxabstop_output_logprobsoutput_strsstripr   r>   allAssertionError)$r`   ra   rb   rc   rd   re   rf   rg   rh   ri   	base_pathir   rC   adaptor_names
srt_runnersrt_outputssrt_no_lora_outputs	hf_runner
hf_outputshf_no_lora_outputsrm   prefill_tol
decode_tol	rouge_tol
hf_prefillsrt_prefillmax_prefill_diff	hf_decode
srt_decodemax_decode_diffsrt_output_strhf_output_strrouge_scorehf_no_lora_prefillsrt_no_lora_prefillr   r   r   run_lora_test_one_by_one  s    
		








r   c	                 C   s  |j }	dg }
}tt| D ]}||j|
  |
d t|j }
qdd |D }td| d|j  d| d| d	d
d | D  d| d t|	|d|jdd |jD |j|j	||||d}|j
| ||d}W d   n1 sqw   Y  t|	|d|j|d}|j
| |d}W d   n1 sw   Y  t|	|dd}|j| ||d}W d   n1 sw   Y  t|	|dd}|j| |d}W d   n1 sw   Y  tt| D ]}
|j|
  }|j|
  }t|g|gd }td| td| td| td|j|
   td|j|
   |j|
 d|j|
 dks9J |j|
 d|j|
 df|j|
 d|j|
 dks\J |j|
 d|j|
 dfqdS )a  
    Run lora tests as a batch.
    For prompt0, prompt1, ..., promptN,
    we will use adaptor0, adaptor1, ..., adaptorN included in model case,
    We will then compare the outputs of HF and SRT with LoRA.
    If number of prompts is larger than number of adaptors,
    the prompt i will use adaptor i % (number of adaptors).

    Args:
        prompts (List[str]): The batch of prompts to test.
        model_case (LoRAModelCase): The model case to test.
        torch_dtype (torch.dtype): The torch dtype to use.
        max_new_tokens (int): The maximum number of new tokens to generate.
        backend (str): The lora backend to use.
        disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to False.
        mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
        test_tag (str, optional): The tag to use for the test. Defaults to "".
    r   r   c                 S   rj   r   r(   rk   r   r   r   rn     ro   z*run_lora_test_by_batch.<locals>.<listcomp>rp   rq   rr   rs   rt   c                 S   ru   rv   r   rx   r   r   r   rn     rz   r{   r|   r}   c                 S   r~   r   r(   rk   r   r   r   rn     r   )
rb   r   r   r   r   r   r   rf   rg   rh   r   Nr   r   r   r   r   r   zSRT no lora output:zHF no lora output: )r   rT   r   r   r   r   r   r   r   r   batch_forwardr   r   r   r   r   )r`   ra   rb   rc   rd   rf   rg   rh   ri   r   r   r   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   run_lora_test_by_batch  s   





r   c                  C   s(   d} t |  t|  tj|  d S )N*   )randomseedr.   manual_seedcudamanual_seed_all)r   r   r   r   ensure_reproducibility"  s   

r   )r'   z
    ### Instruction:
    Write a poem about the transformers Python library.
    Mention the word "large language models" in that poem.
    ### Response:
    The Transformers are large language models,
    They're used to make predictions on text.
    r&   z Computer science is the study ofzWrite a short story.z+What are the main components of a computer?lora_adapter_pathsc                 C   s   t d t | t | t | gd |d |d gft | t | t | g|d |d d gft | t | t | gg dfgS )Nr   r   r   NNN)r   r   choice)r`   r   r   r   r   "create_multiple_batch_test_samplesB  s.   
r   torch_nativeTmodel_casesuse_spec_decodingattention_backendenable_deterministic_inferencec                 C   s$  | D ]}t D ]}d}	|j}
dd |jD }t|dksJ tt|}td|
 d| d t  |s6i ndd	dd
d}t|
f|d|d |d g|t|d |j	d||||d|}t  t
|
|ddd}| | t|D ]}\}\}}td|d  d| d|  |j||	|d}|j||	|d}tddd |jD  tddd |jD  t|j|jD ]0\}}| }| }|j}t|g|gd }||k rtd| d| d|
 d| d| dqtd |d  d! qqW d    n1 sw   Y  W d    n	1 s	w   Y  qqd S )"N    c                 S   rj   r   r(   rl   r+   r   r   r   rn     ro   z:run_lora_multiple_batch_on_model_cases.<locals>.<listcomp>r)   z.
========== Testing multiple batches on base '	', dtype=r|   NGRAM      )speculative_algorithmspeculative_num_draft_tokens'speculative_ngram_min_match_window_size'speculative_ngram_max_match_window_sizer}   r   r   T)rb   r   r   re   r   r   sleep_on_idler   r   rf   rg   )rb   r   patch_model_do_sample_falsez
--- Running Batch z --- prompts: z, lora_paths: r   zSRT outputs:c                 S      g | ]}|qS r   r   rl   sr   r   r   rn         zHF outputs:c                 S   r   r   r   r   r   r   r   rn     r   r   r   r   r   r   r   
--- Batch z Comparison Passed --- )TORCH_DTYPESr   r   r   r   TEST_MULTIPLE_BATCH_PROMPTSr   r   r   r   r   	enumerater   r   r   r@   r   r   r   r   )r   r   r   rf   r   rg   re   ra   rb   rc   r   r   batches	spec_argsr   r   r   r`   r   r   r   srt_outhf_outsrt_strhf_strr   r   r   r   r   &run_lora_multiple_batch_on_model_cases}  s   
	


 r   c                    sF   ddt dtjf fdd}| D ]}tD ]}||| qqdS )a  
    Test that SRT correctly handles batch splitting with multiple LoRA adapters.

    When the number of distinct adapters (including None for base model) exceeds
    max_loras_per_batch, SRT internally splits requests into microbatches.

    This test validates:
    1. SRT can process batches that trigger internal splitting without errors
    2. Different adapters don't produce all identical outputs (i.e., at least one
       output differs, indicating adapters are being applied correctly)

    Args:
        model_cases: List of LoRAModelCase configurations to test
        attention_backend: Attention backend to use
        disable_cuda_graph: Whether to disable CUDA graph
        disable_radix_cache: Whether to disable radix cache
    r)   ra   rb   c                    s  dd | j D }t|ksJ d dd}| j}td| d| d td	 gd
 }|d |d	 |d gf||d	 d |d gf||d	 |d d gf|d |d d gf||d	 |d |d	 gf|g dfg}t  t||d|| jddf}t|D ]X\}\}	}
td|d  d td|
  |j	|	||
d}t
|
}t|dkrdd |jD  t fdd D }|rJ d| d|d  d|
 dtd|d  d q~W d    d S 1 sw   Y  d S )Nc                 S   rj   r   r(   r   r   r   r   rn     ro   zPrun_lora_batch_splitting_equivalence_test.<locals>._run_test.<locals>.<listcomp>zNeed at least z adapters for this test@   z-
========== Testing batch splitting on base 'r   z ==========r      r   r   r}   T)
rb   r   r   re   r   r   r   r   rf   rg   z
--- Batch r|   z  Adapters: r   r)   c                 S   s   g | ]}|  qS r   )r   r   r   r   r   rn   >  s    c                 3   s    | ]	}| d  kV  qdS )r   Nr   )rl   outall_outputsr   r   	<genexpr>?  s    zOrun_lora_batch_splitting_equivalence_test.<locals>._run_test.<locals>.<genexpr>zFEvery output was identical despite using different adapters for base 'z	', batch z: adapters=z). Expected at least one output to differ.r   z passed ---)r   r   r   r   r   r   r   r   r   r   setr   r   )ra   rb   r   rc   r   r`   
test_casesr   	batch_idxbatch_promptsr   r   unique_adaptersall_identicalr   rf   rg   re   r   r   r   	_run_test  s   
"z<run_lora_batch_splitting_equivalence_test.<locals>._run_testN)r   r.   r1   r   )r   r   rf   rg   re   r   ra   rb   r   r   r   )run_lora_batch_splitting_equivalence_test  s    Pr   )r   r   )r%   NFFr^   r_   )r%   FFr^   r_   )Fr   TFTN)r   TTN)'dataclassesr   typingr   r   r.   sglang.test.runnersr   r   sglang.test.test_utilsr   	dataclassr   r   float16r   BACKENDSDEFAULT_PROMPTSCI_LORA_MODELSALL_OTHER_LORA_MODELSCI_MULTI_LORA_MODELSALL_OTHER_MULTI_LORA_MODELSLORA_MODELS_QWEN3Tensorr3   r"   rN   r]   r   r1   r#   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s   





C
T	

 	
p
=
c