o
    ei,                     @   s   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ eeZeeddG dd deZeG dd deZdddZG dd dejZ G dd dejZ!eddG dd deZ"ddgZ#dS ) zPyTorch VitPose model.    )	dataclassN)nn   )initialization)load_backbone)BackboneOutput)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuple   )VitPoseConfigz6
    Class for outputs of pose estimation models.
    )custom_introc                   @   sj   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dS )VitPoseEstimatorOutputaH  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
    heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
        Heatmaps as predicted by the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossheatmaps.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler    r   r   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vitpose/modeling_vitpose.pyr   $   s   
 r   c                   @   sJ   e Zd ZU eed< dZdZdZdZe	
 dejejB ejB fddZd	S )
VitPosePreTrainedModelconfigvitpixel_values)imageTmodulec                 C   sr   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS dS )zInitialize the weightsg        )meanstdN)
isinstancer   LinearConv2dinittrunc_normal_weightr"   initializer_rangebiaszeros_	LayerNormones_)selfr&   r   r   r    _init_weightsD   s   
z$VitPosePreTrainedModel._init_weightsN)r   r   r   r   r   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointingr   no_gradr   r*   r+   r2   r5   r   r   r   r    r!   <   s   
 "r!   gaussian-heatmapc                 C   s   |dvrt d| jdkrt d| j\}}}}d}|dkr7d}| dddddd	f  | dddddd	f< | |d
|||} |  }| D ]$\}	}
| dd|
d	f |dd|	d	f< | dd|	d	f |dd|
d	f< qH|||||f}|d
}|S )a  Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
    )r;   combined-targetz9target_type should be gaussian-heatmap or combined-target   zCoutput_flipped should be [batch_size, num_keypoints, height, width]r   r<   r   N.)
ValueErrorndimshapereshapeclonetolistflip)output_flipped
flip_pairstarget_type
batch_sizenum_keypointsheightwidthchannelsoutput_flipped_backleftrightr   r   r    	flip_backP   s"   
. "
rQ   c                       sH   e Zd ZdZdef fddZddejdejdB dejfd	d
Z  Z	S )VitPoseSimpleDecoderz
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    r"   c                    sH   t    t | _tj|jddd| _tj|j	j
|jdddd| _d S )NbilinearF)scale_factormodealign_cornersr   r   kernel_sizestridepadding)super__init__r   ReLU
activationUpsamplerT   
upsamplingr+   backbone_confighidden_size
num_labelsconvr4   r"   	__class__r   r    r\   ~   s   

zVitPoseSimpleDecoder.__init__Nhidden_staterG   returnc                 C   s4   |  |}| |}| |}|d urt||}|S N)r^   r`   rd   rQ   r4   rh   rG   r   r   r   r    forward   s   



zVitPoseSimpleDecoder.forwardrj   
r   r   r   r   r   r\   r   Tensorrl   __classcell__r   r   rf   r    rR   x   s    *	rR   c                       sB   e Zd ZdZdef fddZd
dejdejdB fdd	Z  Z	S )VitPoseClassicDecoderz
    Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    r"   c                    s   t    tj|jjdddddd| _td| _t	 | _
tjddddddd| _td| _t	 | _tjd|jdddd| _d S )	N   r=      r   F)rX   rY   rZ   r0   r   rW   )r[   r\   r   ConvTranspose2dra   rb   deconv1BatchNorm2d
batchnorm1r]   relu1deconv2
batchnorm2relu2r+   rc   rd   re   rf   r   r    r\      s   


zVitPoseClassicDecoder.__init__Nrh   rG   c                 C   s\   |  |}| |}| |}| |}| |}| |}| |}|d ur,t||}|S rj   )rt   rv   rw   rx   ry   rz   rd   rQ   rk   r   r   r    rl      s   







zVitPoseClassicDecoder.forwardrj   rm   r   r   rf   r    rp      s    $rp   z?
    The VitPose model with a pose estimation head on top.
    c                       sl   e Zd Zdef fddZee			ddejdejdB dejdB dejdB d	e	e
 d
efddZ  ZS )VitPoseForPoseEstimationr"   c                    s|   t  | t|| _t| jjdstdt| jjds!tdt| jjds,td|jr3t|nt	|| _
|   d S )Nrb   z0The backbone should have a hidden_size attribute
image_sizez0The backbone should have an image_size attribute
patch_sizez/The backbone should have a patch_size attribute)r[   r\   r   backbonehasattrr"   r?   use_simple_decoderrR   rp   head	post_initre   rf   r   r    r\      s   
z!VitPoseForPoseEstimation.__init__Nr$   dataset_indexrG   labelskwargsri   c                 K   s   d}|dur
t d| jj|fd|i|}|jd }|jd }	| jjjd | jjjd  }
| jjjd | jjjd  }|	ddd}|
|	d|
| }| j||d}t|||j|jd	S )
a  
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. For the single dataset index 0 refers to the corresponding dataset. For the multiple datasets index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).
        flip_pairs (`torch.tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
        ```NzTraining is not yet supportedr   r>   r   r   rr   )rG   )r   r   r   r   )NotImplementedErrorr~   forward_with_filtered_kwargsfeature_mapsrA   r"   ra   r|   r}   permuterB   
contiguousr   r   r   r   )r4   r$   r   rG   r   r   r   outputssequence_outputrI   patch_heightpatch_widthr   r   r   r    rl      s.   )

z VitPoseForPoseEstimation.forward)NNN)r   r   r   r   r\   r   r   r   rn   r	   r   r   rl   ro   r   r   rf   r    r{      s(    r{   )r;   )$r   dataclassesr   r   r    r   r,   backbone_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr	   utilsr
   r   r   r   utils.genericr   configuration_vitposer   
get_loggerr   loggerr   r!   rQ   ModulerR   rp   r{   __all__r   r   r   r    <module>   s8   

(&X