o
    bi9                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ dd Z	edG d	d
 d
Z
edG dd de
ZedG dd de
ZdS )    N)tree)keras_export)serialization_lib)trackingc                 C   sB   | du rdS t | trtj| }|du rtd|  d|S | S )a/  Convert a loss string identifier to a loss function.

    Arguments:
        loss_item: Either a string identifier, a loss function instance,
            or `None`.

    Returns:
        A loss function instance, or `None`.

    Raises:
        ValueError: If the loss string identifier is unknown.
    NzUnknown loss function: 'z'.)
isinstancestrkeraslossesget
ValueError)	loss_itemloss_fn r   \/home/ubuntu/.local/lib/python3.10/site-packages/keras/src/distillation/distillation_loss.py_convert_loss_to_function   s   
r   z#keras.distillation.DistillationLossc                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	DistillationLossa  Base class for distillation loss computation.

    Distillation losses define how to compute the distillation loss
    between teacher and student outputs. Each loss implements a specific
    approach to knowledge transfer, from simple logits matching to feature-based
    distillation.

    To create custom distillation losses, subclass this class and
    override the `compute_loss` method.
    c                 K   s   t d)a  Compute distillation loss between teacher and student outputs.

        This method should implement the specific distillation logic for
        transferring knowledge from teacher to student.

        Arguments:
            teacher_outputs: Outputs from the teacher model. Can be a single
                tensor or a list/tuple of tensors for multi-output models.
            student_outputs: Outputs from the student model. Can be a single
                tensor or a list/tuple of tensors for multi-output models.
            **kwargs: Additional arguments for custom distillation_loss.
        Returns:
            Distillation loss tensor.
        z&Subclasses must implement compute_loss)NotImplementedError)selfteacher_outputsstudent_outputskwargsr   r   r   compute_loss-   s   zDistillationLoss.compute_lossc                 C   s   t j|| dS )a  Validate that teacher and student outputs are compatible.

        Arguments:
            teacher_outputs: Outputs from the teacher model.
            student_outputs: Outputs from the student model.
        Raises:
            ValueError: If outputs are not compatible.
        N)r   r   assert_same_structure)r   r   r   r   r   r   validate_outputs>   s   	z!DistillationLoss.validate_outputsc                 C   s   dS )a  Validate that teacher and student models are compatible.

        Arguments:
            teacher: The teacher model.
            student: The student model.
        Raises:
            ValueError: If models are not compatible with this distillation
                loss.
        Nr   )r   teacherstudentr   r   r   validate_model_compatibilityI   s   
z-DistillationLoss.validate_model_compatibilityN)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r       s
    r   z&keras.distillation.FeatureDistillationc                       sV   e Zd ZdZej	dddZdd Z fdd	Zd
d Z	dd Z
edd Z  ZS )FeatureDistillationa6  Feature distillation loss.

    Feature distillation transfers knowledge from intermediate layers of the
    teacher model to corresponding layers of the student model. This approach
    helps the student learn better internal representations and often leads
    to better performance compared to logits-only distillation.

    Arguments:
        loss: Loss function to use for feature distillation. Can be:
            - String identifier (e.g., 'mse', 'cosine_similarity', 'mae')
            - Keras loss instance
            - Nested structure of losses matching the layer output structure
            - `None` to skip distillation for that output (useful for
              multi-output models where you only want to distill some outputs)
            At least one loss must be non-`None`. Defaults to 'mse'.
        teacher_layer_name: Name of the teacher layer to extract features from.
            If `None`, uses the final output. Defaults to `None`.
        student_layer_name: Name of the student layer to extract features from.
            If `None`, uses the final output. Defaults to `None`.

    Examlpe(s):

    ```python
    # Basic feature distillation from final outputs
    distillation_loss = FeatureDistillation(loss="mse")

    # Distill from specific intermediate layers
    distillation_loss = FeatureDistillation(
        loss="mse",
        teacher_layer_name="dense_1",
        student_layer_name="dense_1"
    )

    # Use cosine similarity for different feature sizes
    distillation_loss = FeatureDistillation(
        loss="cosine_similarity",
        teacher_layer_name="conv2d_2",
        student_layer_name="conv2d_1"
    )

    # With custom loss instance
    distillation_loss = FeatureDistillation(
        loss=keras.losses.MeanAbsoluteError()
    )

    # For multi-output models
    distillation_loss = FeatureDistillation(
        loss=["mse", "cosine_similarity"]
    )

    # For multi-output models, only distill some outputs
    distillation_loss = FeatureDistillation(
        loss=["mse", None, "cosine_similarity"]  # Skip middle output
    )
    ```
    mseNc                 C   sD   || _ || _tt|| _t| j}tdd |D r tdd S )Nc                 s       | ]}|d u V  qd S Nr   .0lr   r   r   	<genexpr>       z/FeatureDistillation.__init__.<locals>.<genexpr>zXThe `loss` argument in `FeatureDistillation` must contain at least one non-`None` value.)	teacher_layer_namestudent_layer_namer   map_structurer   lossflattenallr   )r   r-   r*   r+   flat_lossesr   r   r   __init__   s   zFeatureDistillation.__init__c              
   C   s  | j dus
| jdurAt|d p|jdu }t|d p|jdu }|s$|rAg }|r-|d |r4|d d|}td| d| j durcz	|j| j d W n tyb } ztd	| d}~ww | jdurz
|j| jd W dS  ty } ztd
| d}~ww dS )zYValidate that teacher and student models are compatible for feature
        distillation.Ninputsr   r   z and z\FeatureDistillation with specific layer names requires Functional or Sequential models. The z model(s) appear to be subclassed (no symbolic inputs/outputs). Either use Functional/Sequential models, or use FeatureDistillation without layer names (to distill final outputs only), or use LogitsDistillation instead.)namezIn teacher model: zIn student model: )r*   r+   hasattrr2   appendjoinr   	get_layer)r   r   r   teacher_is_subclassedstudent_is_subclassedsubclassed_models
models_strer   r   r   r      sB   







z0FeatureDistillation.validate_model_compatibilityc              
      sd   t  || z
t| j| W dS  ty1 } ztdt| j dt| d| d}~ww )z>Validate that outputs are compatible for feature distillation.z)Loss structure mismatch. Loss structure: z, Output structure: z	. Error: N)superr   r   r   r-   r   	structure)r   r   r   r<   	__class__r   r   r      s   
z$FeatureDistillation.validate_outputsc                 K   s8   dd }t || j||}t |}tjtj|S )aT  Compute feature distillation loss using extracted features.

        Arguments:
            teacher_outputs: Extracted features from teacher layer.
            student_outputs: Extracted features from student layer.
            **kwargs: Additional arguments (ignored).
        Returns:
            Scalar distillation loss tensor.
        c                 S   s"   | d u rdS t j| ||}|S )N        )r   opsmean)r   teacher_featuresstudent_featuresr-   r   r   r   
apply_loss   s   z4FeatureDistillation.compute_loss.<locals>.apply_lossr   r,   r-   r.   r   rB   sumstack)r   r   r   r   rF   loss_valuesr0   r   r   r   r      s   

z FeatureDistillation.compute_lossc                 C   s   t j| j| j| jdS )$Get configuration for serialization.)r-   r*   r+   )r   r	   	serializer-   r*   r+   r   r   r   r   
get_config   s   zFeatureDistillation.get_configc                 C   *   |  }tj|d |d< | di |S z#Create instance from configuration.r-   Nr   copyr   r	   deserializeclsconfigr   r   r   from_config      zFeatureDistillation.from_config)r"   NN)r   r   r   r    r    no_automatic_dependency_trackingr1   r   r   r   rN   classmethodrW   __classcell__r   r   r?   r   r!   V   s    9,r!   z%keras.distillation.LogitsDistillationc                   @   s@   e Zd ZdZej		dddZdd Zdd	 Ze	d
d Z
dS )LogitsDistillationaH  Distillation loss that transfers knowledge from final model outputs.

    This distillation loss applies temperature scaling to the teacher's logits
    before computing the loss between teacher and student predictions. It's the
    most common approach for knowledge distillation.

    Arguments:
        temperature: Temperature for softmax scaling. Higher values produce
            softer probability distributions that are easier for the student to
            learn. Typical values range from 3-5. Defaults to 3.0.
        loss: Loss function to use for distillation. Can be:
            - String identifier (e.g., 'kl_divergence',
              'categorical_crossentropy')
            - Keras loss instance
            - Nested structure of losses matching the model output structure
            - `None` to skip distillation for that output (useful for
              multi-output models where you only want to distill some outputs)
            At least one loss must be non-`None`. Defaults to 'kl_divergence'.

    Examlpe(s):

    ```python
    # Basic logits distillation with KL divergence
    distillation_loss = LogitsDistillation(temperature=3.0)

    # With categorical crossentropy loss
    distillation_loss = LogitsDistillation(
        temperature=4.0,
        loss="categorical_crossentropy"
    )

    # With custom loss instance
    distillation_loss = LogitsDistillation(
        temperature=4.0,
        loss=keras.losses.CategoricalCrossentropy(from_logits=True)
    )

    # For multi-output models
    distillation_loss = LogitsDistillation(
        temperature=3.0,
        loss=["kl_divergence", "categorical_crossentropy"]
    )

    # For multi-output models, only distill some outputs
    distillation_loss = LogitsDistillation(
        temperature=3.0,
        loss=["kl_divergence", None]  # Skip second output
    )
    ```
          @kl_divergencec                 C   st   || _ tt|| _t| j}tdd |D rtdt| j t	t
fs/tdt| j  | j dkr8tdd S )Nc                 s   r#   r$   r   r%   r   r   r   r(   C  r)   z.LogitsDistillation.__init__.<locals>.<genexpr>z%At least one loss must be non-`None`.z"temperature must be a number, got rA   ztemperature must be positive.)temperaturer   r,   r   r-   r.   r/   r   r   intfloattype)r   r_   r-   r0   r   r   r   r1   9  s   
zLogitsDistillation.__init__c           	         sd   t  fdd|}t  fdd|} fdd}t | j||}t |}tjtj|S )a  Compute distillation loss using the configured loss function.

        Arguments:
            teacher_outputs: Logits from teacher model. Can be a single tensor,
                list/tuple of tensors, or dict of tensors.
            student_outputs: Logits from student model. Can be a single tensor,
                list/tuple of tensors, or dict of tensors.
            **kwargs: Additional arguments (ignored).
        Returns:
            Distillation loss tensor.
        c                       t j|  jS r$   r   rB   divider_   xrM   r   r   <lambda>[      z1LogitsDistillation.compute_loss.<locals>.<lambda>c                    rc   r$   rd   rf   rM   r   r   rh   ^  ri   c                    sl   | d u rdS t | tjjr-tjj|dd}tjj|dd}tj| ||}| jd  S tj| ||S )NrA   )axis   )r   r   r	   KLDivergencerB   softmaxrC   r_   )r   teacher_logitsstudent_logitsteacher_probsstudent_probsr-   rM   r   r   rF   b  s   z3LogitsDistillation.compute_loss.<locals>.apply_lossrG   )	r   r   r   r   teacher_scaledstudent_scaledrF   rJ   r0   r   rM   r   r   M  s   

zLogitsDistillation.compute_lossc                 C   s   | j t| jdS )rK   )r_   r-   )r_   r   serialize_keras_objectr-   rM   r   r   r   rN   z  s   
zLogitsDistillation.get_configc                 C   rO   rP   rQ   rT   r   r   r   rW     rX   zLogitsDistillation.from_configN)r]   r^   )r   r   r   r    r   rY   r1   r   rN   rZ   rW   r   r   r   r   r\     s    3-r\   )r   	keras.srcr   keras.src.api_exportr   keras.src.savingr   keras.src.utilsr   r   r   r!   r\   r   r   r   r   <module>   s    5 .