o
    biY                     @   s`   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 edG dd	 d	eZdS )
    N)tree)keras_export)_convert_loss_to_function)Model)serialization_libzkeras.distillation.Distillerc                       s   e Zd ZdZ			d- fdd	Zdd Zd	d
 Zdd Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zd. fd d!	Zd/d"d#Z	$d0d%d&Z fd'd(Z fd)d*Zed+d, Z  ZS )1	Distillera  Distillation model for transferring knowledge from teacher to student.

    Knowledge distillation transfers knowledge from a large, complex model
    (teacher) to a smaller, simpler model (student). The student learns
    from both ground truth labels and the teacher's predictions, often
    achieving better performance than training on labels alone.

    Arguments:
        teacher: A trained `keras.Model` that serves as the knowledge source.
            The teacher model is frozen during distillation.
        student: A `keras.Model` to be trained through distillation.
        distillation_losses: List of distillation losses to apply. Can be a
            single distillation loss or a list of distillation losses like
            `keras.distillation.LogitsDistillation`,
            `keras.distillation.FeatureDistillation`, or custom distillation
            losses.
        distillation_loss_weights: List of weights for each distillation loss.
            Must have the same length as `distillation_losses`. If `None`,
            equal weights are used.
        student_loss_weight: Weight for the student's supervised loss component.
            Must be between 0 and 1. Defaults to 0.5.
        name: Name for the distiller model. Defaults to `"distiller"`.
        **kwargs: Additional keyword arguments passed to the parent `Model`
            class.

    Attributes:
        student: The student model being trained. Access this to get the trained
            student model for independent use after distillation training.
        teacher: The teacher model providing knowledge. This model is frozen
            during training.

    Examples:

    ```python
    # Basic distillation with KerasHub models
    import keras_hub as hub

    teacher = hub.models.CausalLM.from_preset("gemma_2b_en")
    student = hub.models.CausalLM.from_preset(
        "gemma_1.1_2b_en", load_weights=False
    )

    # Single distillation loss
    distiller = Distiller(
        teacher=teacher,
        student=student,
        distillation_losses=LogitsDistillation(temperature=3.0),
    )

    # Compile the distiller (like any Keras model)
    distiller.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Train the distiller
    distiller.fit(x_train, y_train, epochs=10)

    # Access the trained student model
    trained_student = distiller.student

    # Multiple distillation losses
    distiller = Distiller(
        teacher=teacher,
        student=student,
        distillation_losses=[
            LogitsDistillation(temperature=3.0),
            FeatureDistillation(
                teacher_layer_name="dense_1",
                student_layer_name="dense_1"
            )
        ],
        distillation_loss_weights=[1.0, 0.5],
    )

    # Compile with custom settings
    distiller.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    ```
    N      ?	distillerc           	         sT  t  jdd|i| | || || _|| _t|ttfs'tdt	| |dk s/|dkr6td| || _
|d u rAtdt|ttfsQ|g| _dg| _n*|| _|d u radgt| | _nt|t|krxtdt| dt| d	|| _| jD ]	}| ||| q~|   d
| j_tjjdd| _tjjdd| _tjjdd| _d S )Nnamez*student_loss_weight must be a number, got               ?z5student_loss_weight must be between 0.0 and 1.0, got z'distillation_losses' cannot be `None`. Provide a distillation loss (e.g., LogitsDistillation or FeatureDistillation) or a list of distillation losses.z%Number of distillation_loss_weights (z,) must match number of distillation_losses ()Fstudent_lossr
   distillation_loss
total_loss )super__init___validate_modelsteacherstudent
isinstanceintfloat
ValueErrortypestudent_loss_weightlisttupledistillation_lossesdistillation_loss_weightslen)_validate_distillation_loss_compatibility _create_multi_feature_extractors	trainablekerasmetricsMeanstudent_loss_trackerdistillation_loss_trackertotal_loss_tracker)	selfr   r   r    r!   r   r
   kwargsr   	__class__r   T/home/ubuntu/.local/lib/python3.10/site-packages/keras/src/distillation/distiller.pyr   `   sb   



zDistiller.__init__c                 C   sd   t |tjstdt| t |tjstdt| | || | || | || dS )z8Validate that teacher and student models are compatible.z#Teacher must be a keras.Model, got z#Student must be a keras.Model, got N)r   r&   r   r   r   _validate_input_compatibility_validate_output_compatibility_validate_dtype_compatibilityr,   r   r   r   r   r0   r      s   zDistiller._validate_modelsc              	   C   sx   t |t |krtd| d| d| dt||D ]\}}|dur9|dur9||kr9td| d| d| dqdS )z&Assert that two shapes are compatible.Teacher and student z, shapes have different dimensions. Teacher: , Student: .Nz# shapes are incompatible. Teacher: z. All dimensions must match.)r"   r   zip)r,   shape1shape2contextdim1dim2r   r   r0   _assert_shapes_are_compatible   s&   z'Distiller._assert_shapes_are_compatiblec                 C   s(   ||krt d| d| d| ddS )z4Assert that teacher and student dtypes are the same.r5   z dtypes must match. Teacher: r6   r7   N)r   )r,   teacher_dtypestudent_dtyper;   r   r   r0   _assert_same_dtype   s   zDistiller._assert_same_dtypec                    Z   t |dr
t |dsdS t|d}t|d}|du s|du r dS t fdd|| dS )z?Validate that teacher and student have compatible input shapes.inputsNc                         | j|jdS Ninputr>   shapetisir,   r   r0   <lambda>       
z9Distiller._validate_input_compatibility.<locals>.<lambda>hasattrgetattrr   map_structure)r,   r   r   teacher_inputsstudent_inputsr   rL   r0   r1         


z'Distiller._validate_input_compatibilityc                    rB   )z@Validate that teacher and student have compatible output shapes.outputsNc                    rD   NoutputrG   tosorL   r   r0   rM      rN   z:Distiller._validate_output_compatibility.<locals>.<lambda>rO   )r,   r   r   teacher_outputsstudent_outputsr   rL   r0   r2      rU   z(Distiller._validate_output_compatibilityc                    s   t |dr
t |dsdS |jdu s|jdu rdS t fdd|j|j t |dr/t |ds1dS |jdu s;|jdu r=dS t fdd|j|j dS )z=Validate that teacher and student have compatible data types.rC   Nc                    rD   rE   rA   dtyperI   rL   r   r0   rM      s    z9Distiller._validate_dtype_compatibility.<locals>.<lambda>rV   c                    rD   rW   r^   rY   rL   r   r0   rM     rN   )rP   rC   r   rR   rV   r4   r   rL   r0   r3      s$   

z'Distiller._validate_dtype_compatibilityc                 C   s   | || dS )zZValidate that the distillation loss is compatible with teacher
        and student models.N)validate_model_compatibility)r,   r   r   r   r   r   r0   r#     s   z3Distiller._validate_distillation_loss_compatibilityc                 C   s   g }g }| j D ](}t|dr|jr|j|vr||j t|dr/|jr/|j|vr/||j q| | j|| _| | j|| _	dS )z?Create feature extractors for efficient multi-layer extraction.teacher_layer_namestudent_layer_nameN)
r    rP   ra   appendrb   _create_feature_extractorr   _teacher_feature_extractorr   _student_feature_extractor)r,   teacher_layer_namesstudent_layer_namesr   r   r   r0   r$     s<   

z*Distiller._create_multi_feature_extractorsc                 C   s   |sdS t |dr|jdu rtd|j dt|tjr$|jd j}n|j}d|i}|D ]}|j	|d}|j||< q-tj
|j||j dd	S )
ar  Create a feature extractor for a model.

        Arguments:
            model: The model to create an extractor for.
            layer_names: List of layer names to extract features from.

        Returns:
            Feature extractor model or `None` if no layer names provided.

        Raises:
            ValueError: If model has no symbolic inputs/outputs.
        NrC   z$Cannot create feature extractor for z-. The model has no symbolic inputs attribute.final_outputr   _multi_feature_extractor)rC   rV   r
   )rP   rC   r   r
   r   r&   
SequentiallayersrX   	get_layerr   )r,   modellayer_namesrj   rV   
layer_namelayerr   r   r0   rd   =  s$   
z#Distiller._create_feature_extractorc                 C   s*   | j dur| j |ddS d| j|ddiS )z6Extract all teacher features in a single forward pass.NFtrainingrj   )re   r   )r,   xr   r   r0   _extract_all_teacher_featuresc  s   
z'Distiller._extract_all_teacher_featuresc                 C   s    | j dur| j |ddS d|iS )z6Extract all student features in a single forward pass.NTrs   rj   )rf   )r,   ru   y_predr   r   r0   _extract_all_student_featuresj  s   
z'Distiller._extract_all_student_featuresc                 C   sF   |r|j pd}n|jpd}||vrtd| dt|  || S )z8Get the specific features needed by a distillation loss.rj   zLayer 'z.' not found in extracted features. Available: )ra   rb   r   r   keys)r,   r   all_features
is_teacherrq   r   r   r0   _get_distillation_loss_featuresq  s   

z)Distiller._get_distillation_loss_featuresadamc                    sj   |du rt dtt|| _|| _|dur&t|ttfs&t dt	| t
 jd|d|d| dS )a  Compile the distiller with proper integration.

        Arguments:
            optimizer: Optimizer for training the student model.
            loss: Student loss function for the student's supervised learning.
                Can be a string identifier or a loss function instance.
            metrics: Additional metrics to track during training.
            **kwargs: Additional arguments passed to parent compile.
        Nz'loss' cannot be `None`.z%metrics must be a list or tuple, got )	optimizerlossr'   r   )r   r   rR   r   _student_loss_student_loss_for_serializationr   r   r   r   r   compile)r,   r~   r   r'   r-   r.   r   r0   r     s   

zDistiller.compilec                 K   s   | j |fd|i|S )z)Forward pass returns student predictions.rt   )r   )r,   rC   rt   r-   r   r   r0   call  s   zDistiller.callTc                 C   s  |du r
| ||d}d}| j dkrK|durKtdd | j||}t|}t|dkr5tjtj	|n|d }t
|drKt|jdkrKtj|}d}	| j d	k r| |}
| ||}t| j| jD ]{\}}t
|d
r|jdurz| j||
dd}| j||dd}W n( ty } ztdt|j d|j d|j d| |d}~ww |
d }|}||| |||}t
|drt|jdkrtd|jj d|j dtj|	tj||}	qdtjtj| j |tjtjd	| j |	}| j | | j! |	 | j" | |S )aM  Compute combined distillation loss.

        Arguments:
            x: Input data.
            y: Target data.
            y_pred: Model predictions.
            sample_weight: Sample weights (currently unused).
            training: Whether the model is in training mode.

        Returns:
            Combined loss tensor.
        Nrs   r   c                 S   s
   | ||S Nr   )loo_predr   r   r0   rM     s   
 z(Distiller.compute_loss.<locals>.<lambda>   r   rH   r   ra   T)r{   FzFailed to extract features for z targeting teacher layer 'z' and student layer 'z'. Original error: rj   zDistillation loss z' returned a non-scalar loss with shape z6. The compute_loss method must return a scalar tensor.)#r   r   rR   r   flattenr"   r&   opssumstackrP   rH   meanrv   rx   r8   r    r!   ra   r|   r   RuntimeErrorr   __name__rb   validate_outputscompute_lossr/   addmultiplysubtractr)   update_stater*   r+   )r,   ru   yrw   sample_weightrt   r   loss_valuesflat_lossesr   teacher_featuresstudent_featuresdistillation_loss_fnweight distillation_loss_teacher_output distillation_loss_student_outputecurrent_distillation_lossr   r   r   r0   r     s   




	zDistiller.compute_lossc                    s,   t    | j  | j  | j  dS )zReset all metrics.N)r   reset_metricsr)   reset_stater*   r+   rL   r.   r   r0   r   )  s   


zDistiller.reset_metricsc                    sD   t   }|t| jt| jdd | jD | j| j	d |S )z$Get configuration for serialization.c                 S      g | ]}t |qS r   )r   serialize_keras_object.0r   r   r   r0   
<listcomp>;      z(Distiller.get_config.<locals>.<listcomp>)r   r   r    r!   r   )
r   
get_configupdater   r   r   r   r    r!   r   )r,   configr.   r   r0   r   0  s    
zDistiller.get_configc                 C   sP   |  }t|d |d< t|d |d< dd |d D |d< | di |S )z#Create instance from configuration.r   r   c                 S   r   r   )r   deserialize_keras_objectr   r   r   r0   r   Q  r   z)Distiller.from_config.<locals>.<listcomp>r    Nr   )copyr   r   )clsr   r   r   r0   from_configE  s   
zDistiller.from_config)Nr   r	   )r}   NNr   )NNNNT)r   
__module____qualname____doc__r   r   r>   rA   r1   r2   r3   r#   r$   rd   rv   rx   r|   r   r   r   r   r   classmethodr   __classcell__r   r   r.   r0   r   	   s6    ZO$&

 r   )r&   	keras.srcr   keras.src.api_exportr   (keras.src.distillation.distillation_lossr   keras.src.models.modelr   keras.src.savingr   r   r   r   r   r0   <module>   s    