o
    wi0                     @   s  d Z ddlmZ ddlmZmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ d
dlmZ eeZeeddG dd deZeG dd de
ZdddZG dd dejZG dd dejZeddG dd deZddgZdS )zPyTorch VitPose model.    )	dataclass)OptionalUnionN)nn   )PreTrainedModel)ModelOutputauto_docstringlogging)load_backbone   )VitPoseConfigz6
    Class for outputs of pose estimation models.
    )custom_introc                   @   sj   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dS )VitPoseEstimatorOutputaH  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossheatmaps.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler    r   r   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/vitpose/modeling_vitpose.pyr   '   s   
 r   c                   @   s>   e Zd ZeZdZdZdZdee	j
e	je	jf ddfddZdS )	VitPosePreTrainedModelvitpixel_valuesTmodulereturnNc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS dS )zInitialize the weightsg        )meanstdNg      ?)
isinstancer   LinearConv2dinittrunc_normal_weightdatator   float32configinitializer_rangedtypebiaszero_	LayerNormfill_)selfr!   r   r   r   _init_weightsF   s   

z$VitPosePreTrainedModel._init_weights)r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr   r   r&   r'   r3   r6   r   r   r   r   r   ?   s    &r   gaussian-heatmapc                 C   s   |dvrt d| jdkrt d| j\}}}}d}|dkr7d}| dddddd	f  | dddddd	f< | |d
|||} |  }| D ]$\}	}
| dd|
d	f |dd|	d	f< | dd|	d	f |dd|
d	f< qH|||||f}|d
}|S )a  Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
    )r;   combined-targetz9target_type should be gaussian-heatmap or combined-target   zCoutput_flipped should be [batch_size, num_keypoints, height, width]r   r<   r   N.)
ValueErrorndimshapereshapeclonetolistflip)output_flipped
flip_pairstarget_type
batch_sizenum_keypointsheightwidthchannelsoutput_flipped_backleftrightr   r   r   	flip_backU   s"   
. "
rQ   c                       sD   e Zd ZdZd
 fddZddejdeej dejfdd	Z  Z	S )VitPoseSimpleDecoderz
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    r"   Nc                    sH   t    t | _tj|jddd| _tj|j	j
|jdddd| _d S )NbilinearF)scale_factormodealign_cornersr   r   kernel_sizestridepadding)super__init__r   ReLU
activationUpsamplerT   
upsamplingr'   backbone_confighidden_size
num_labelsconvr5   r.   	__class__r   r   r\      s   

zVitPoseSimpleDecoder.__init__hidden_staterG   c                 C   s4   |  |}| |}| |}|d urt||}|S N)r^   r`   rd   rQ   r5   rh   rG   r   r   r   r   forward   s   



zVitPoseSimpleDecoder.forward)r"   Nri   )
r   r   r   r   r\   r   Tensorr   rk   __classcell__r   r   rf   r   rR   }   s    *	rR   c                       sB   e Zd ZdZdef fddZd
dejdeej fdd	Z	  Z
S )VitPoseClassicDecoderz
    Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    r.   c                    s   t    tj|jjdddddd| _td| _t	 | _
tjddddddd| _td| _t	 | _tjd|jdddd| _d S )	N   r=      r   F)rX   rY   rZ   r1   r   rW   )r[   r\   r   ConvTranspose2dra   rb   deconv1BatchNorm2d
batchnorm1r]   relu1deconv2
batchnorm2relu2r'   rc   rd   re   rf   r   r   r\      s   


zVitPoseClassicDecoder.__init__Nrh   rG   c                 C   s\   |  |}| |}| |}| |}| |}| |}| |}|d ur,t||}|S ri   )rr   rt   ru   rv   rw   rx   rd   rQ   rj   r   r   r   rk      s   







zVitPoseClassicDecoder.forwardri   )r   r   r   r   r   r\   r   rl   r   rk   rm   r   r   rf   r   rn      s    $rn   z?
    The VitPose model with a pose estimation head on top.
    c                       s   e Zd Zdeddf fddZe						ddejdeej deej d	eej d
ee	 dee	 dee	 de
eef fddZ  ZS )VitPoseForPoseEstimationr.   r"   Nc                    s|   t  | t|| _t| jjdstdt| jjds!tdt| jjds,td|jr3t|nt	|| _
|   d S )Nrb   z0The backbone should have a hidden_size attribute
image_sizez0The backbone should have an image_size attribute
patch_sizez/The backbone should have a patch_size attribute)r[   r\   r   backbonehasattrr.   r?   use_simple_decoderrR   rn   head	post_initre   rf   r   r   r\      s   
z!VitPoseForPoseEstimation.__init__r    dataset_indexrG   labelsoutput_attentionsoutput_hidden_statesreturn_dictc                 C   sF  |dur|n| j j}|dur|n| j j}|dur|n| j j}d}|dur(td| jj|||||d}	|r:|	jd n|	d d }
|
jd }| j j	j
d | j j	jd  }| j j	j
d | j j	jd  }|
ddd|d|| }
| j|
|d}|s|r|f|	dd  }n	|f|	dd  }|dur|f| S |S t|||	j|	jd	S )
ac  
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. For the single dataset index 0 refers to the corresponding dataset. For the multiple datasets index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).
        flip_pairs (`torch.tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
        ```NzTraining is not yet supported)r   r   r   r   r>   r   r   rp   )rG   )r   r   r   r   )r.   use_return_dictr   r   NotImplementedErrorr|   forward_with_filtered_kwargsfeature_mapsrA   ra   rz   r{   permuterB   
contiguousr   r   r   r   )r5   r    r   rG   r   r   r   r   r   outputssequence_outputrI   patch_heightpatch_widthr   outputr   r   r   rk      s@   (	
z VitPoseForPoseEstimation.forward)NNNNNN)r   r   r   r   r\   r	   r   rl   r   boolr   r   r   rk   rm   r   r   rf   r   ry      s4    
	ry   )r;   )r   dataclassesr   typingr   r   r   torch.utils.checkpointr   modeling_utilsr   utilsr   r	   r
   utils.backbone_utilsr   configuration_vitposer   
get_loggerr   loggerr   r   rQ   ModulerR   rn   ry   __all__r   r   r   r   <module>   s4   

(&g