o
    	۷i-                     @   s$  d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlmZ eeZeeddG dd deZeG dd deZdddZG dd dejZ G dd dejZ!eddG dd deZ"ddgZ#dS ) zPyTorch VitPose model.    )	dataclass)OptionalUnionN)nn   )BackboneOutput)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)load_backbone)can_return_tuple   )VitPoseConfigz6
    Class for outputs of pose estimation models.
    )custom_introc                   @   sj   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dS )VitPoseEstimatorOutputaH  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossheatmaps.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler    r    r    b/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/vitpose/modeling_vitpose.pyr   %   s   
 r   c                   @   s@   e Zd ZU eed< dZdZdZdee	j
e	je	jf fddZdS )	VitPosePreTrainedModelconfigvitpixel_valuesTmodulec                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS dS )zInitialize the weightsg        )meanstdNg      ?)
isinstancer   LinearConv2dinittrunc_normal_weightdatator   float32r#   initializer_rangedtypebiaszero_	LayerNormfill_)selfr&   r    r    r!   _init_weightsD   s   

z$VitPosePreTrainedModel._init_weightsN)r   r   r   r   r   base_model_prefixmain_input_namesupports_gradient_checkpointingr   r   r*   r+   r6   r9   r    r    r    r!   r"   =   s   
 "r"   gaussian-heatmapc                 C   s   |dvrt d| jdkrt d| j\}}}}d}|dkr7d}| dddddd	f  | dddddd	f< | |d
|||} |  }| D ]$\}	}
| dd|
d	f |dd|	d	f< | dd|	d	f |dd|
d	f< qH|||||f}|d
}|S )a  Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
    )r=   combined-targetz9target_type should be gaussian-heatmap or combined-target   zCoutput_flipped should be [batch_size, num_keypoints, height, width]r   r>   r   N.)
ValueErrorndimshapereshapeclonetolistflip)output_flipped
flip_pairstarget_type
batch_sizenum_keypointsheightwidthchannelsoutput_flipped_backleftrightr    r    r!   	flip_backS   s"   
. "
rS   c                       sH   e Zd ZdZdef fddZddejdeej dejfd	d
Z	  Z
S )VitPoseSimpleDecoderz
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    r#   c                    sH   t    t | _tj|jddd| _tj|j	j
|jdddd| _d S )NbilinearF)scale_factormodealign_cornersr   r   kernel_sizestridepadding)super__init__r   ReLU
activationUpsamplerV   
upsamplingr+   backbone_confighidden_size
num_labelsconvr8   r#   	__class__r    r!   r^      s   

zVitPoseSimpleDecoder.__init__Nhidden_staterI   returnc                 C   s4   |  |}| |}| |}|d urt||}|S N)r`   rb   rf   rS   r8   rj   rI   r   r    r    r!   forward   s   



zVitPoseSimpleDecoder.forwardrl   r   r   r   r   r   r^   r   Tensorr   rn   __classcell__r    r    rh   r!   rT   {   s    *	rT   c                       sB   e Zd ZdZdef fddZd
dejdeej fdd	Z	  Z
S )VitPoseClassicDecoderz
    Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    r#   c                    s   t    tj|jjdddddd| _td| _t	 | _
tjddddddd| _td| _t	 | _tjd|jdddd| _d S )	N   r?      r   F)rZ   r[   r\   r4   r   rY   )r]   r^   r   ConvTranspose2drc   rd   deconv1BatchNorm2d
batchnorm1r_   relu1deconv2
batchnorm2relu2r+   re   rf   rg   rh   r    r!   r^      s   


zVitPoseClassicDecoder.__init__Nrj   rI   c                 C   s\   |  |}| |}| |}| |}| |}| |}| |}|d ur,t||}|S rl   )rv   rx   ry   rz   r{   r|   rf   rS   rm   r    r    r!   rn      s   







zVitPoseClassicDecoder.forwardrl   ro   r    r    rh   r!   rr      s    $rr   z?
    The VitPose model with a pose estimation head on top.
    c                       sl   e Zd Zdef fddZee			ddejde	ej de	ej de	ej d	e
e d
efddZ  ZS )VitPoseForPoseEstimationr#   c                    s|   t  | t|| _t| jjdstdt| jjds!tdt| jjds,td|jr3t|nt	|| _
|   d S )Nrd   z0The backbone should have a hidden_size attribute
image_sizez0The backbone should have an image_size attribute
patch_sizez/The backbone should have a patch_size attribute)r]   r^   r   backbonehasattrr#   rA   use_simple_decoderrT   rr   head	post_initrg   rh   r    r!   r^      s   
z!VitPoseForPoseEstimation.__init__Nr%   dataset_indexrI   labelskwargsrk   c                 K   s   d}|dur
t d| jj|fd|i|}|jd }|jd }	| jjjd | jjjd  }
| jjjd | jjjd  }|	ddd}|
|	d|
| }| j||d}t|||j|jd	S )
ac  
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. For the single dataset index 0 refers to the corresponding dataset. For the multiple datasets index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).
        flip_pairs (`torch.tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
        ```NzTraining is not yet supportedr   r@   r   r   rt   )rI   )r   r   r   r   )NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsrC   r#   rc   r~   r   permuterD   
contiguousr   r   r   r   )r8   r%   r   rI   r   r   r   outputssequence_outputrK   patch_heightpatch_widthr   r    r    r!   rn      s.   '

z VitPoseForPoseEstimation.forward)NNN)r   r   r   r   r^   r   r   r   rp   r   r	   r   r   rn   rq   r    r    rh   r!   r}      s(    r}   )r=   )$r   dataclassesr   typingr   r   r   r   modeling_outputsr   modeling_utilsr   processing_utilsr	   utilsr
   r   r   r   utils.backbone_utilsr   utils.genericr   configuration_vitposer   
get_loggerr   loggerr   r"   rS   ModulerT   rr   r}   __all__r    r    r    r!   <module>   s8   

(&V