o
    TiH                     @   s&   d dl Z d	ddZdd Zdd ZdS )
    NFc                    s@   dd fddfdd  fdd}t || |d	S )
a   Quantize bert-style transformer layers with DeepSpeed's transformer layer
    Arguments:
        orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
            e.g., transformers.models.bert.modeling_bert.BertLayer or transformers.BertLayer
        model (torch.nn.Module): user's nn.module representing their model

        megatron (bool): megatron model-parallel implementation (this is supported for inference only)
        preln (bool): does the original layer implementation do pre or post layer norm?

        Note: For Bert kind of models, we inject based on the DeepSpeed-Example models, if not setting huggingface flag.

    Returns:
        Updated nn.module with quantized transformer layers
    c                 S   s   |  tjS N)totorchint8)weight r   [/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/module_quantize.pyquantize_weight   s   z3quantize_transformer_layer.<locals>.quantize_weightc                    sd    | j jjj| j jj_ | j jjj| j jj_ | jjjj| jjj_ | jjjj| jjj_d S r   )	attentionquery_key_valuer   datadensemlpdense_h_to_4hdense_4h_to_hlayer)r	   r   r   megatron_layer_quantize   s   z;quantize_transformer_layer.<locals>.megatron_layer_quantizec                    s   | j jjjj| j jjj_| j jjjj| j jjj_| j jjjj| j jjj_| j jjjj| j jjj_ rG| j	j
jj| j	j
j_n| j	jjj| j	jj_| jjjj| jjj_d S r   )r
   selfqueryr   r   keyvalueoutputr   intermediate	dense_actr   )prelnr	   r   r   bert_layer_quantize"   s   z7quantize_transformer_layer.<locals>.bert_layer_quantizec                    s   r|  | S  |  | S r   r   )child)r   megatronr   r   r   quantize_fn-   s
   z/quantize_transformer_layer.<locals>.quantize_fn)model
orig_classr   )quantize_module)orig_layer_implr    r   r   r   r   )r   r   r   r   r	   r   quantize_transformer_layer	   s
   
r$   c                 C   s   ||i}t | |S r   )_quantize_module)r    r!   r   policyr   r   r   r"   :   s   
r"   c                 C   sT   |   D ]#\}}|j|v r"t|}t| |||j | t| |}qt|| q| S r   )named_children	__class__reprsetattrgetattrr%   )r    policiesnamer   orignewr   r   r   r%   ?   s   
r%   )FF)r   r$   r"   r%   r   r   r   r   <module>   s   
1