o
    eiL                    @   s&  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZmZmZm Z  ddl!m"Z" e #e$Z%		dvdej&dej'dej'dej'dej'dB de(dB de(dee fddZ)G dd dej&Z*G dd  d ej&Z+		!	dwd"ej'd#e(d$e,dB d%e-d&e.f
d'd(Z/		dxd"ej'd)e,e.B d$e,dB d&e.fd*d+Z0G d,d- d-ej&Z1G d.d/ d/ej&Z2G d0d1 d1ej&Z3eG d2d3 d3eZ4G d4d5 d5ej&Z5G d6d7 d7ej&Z6G d8d9 d9e4Z7eed:d;G d<d= d=eZ8eed>d;G d?d@ d@eZ9eedAd;G dBdC dCeZ:eedDd;G dEdF dFeZ;eedGd;G dHdI dIeZ<eedJd;G dKdL dLeZ=dMej>j?dNej'dOej'fdPdQZ@dydRej'dSej'dB dOej'fdTdUZAG dVdW dWej&ZBG dXdY dYej&ZCG dZd[ d[ej&ZDG d\d] d]ej&ZEeG d^d_ d_e4ZFG d`da daej&ZGedbd;G dcdd dde4ZHG dedf dfej&ZIedgd;G dhdi die4ZJedjd;G dkdl dlej&ZKedmd;G dndo doe4ZLG dpdq dqej&ZMedrd;G dsdt dte4ZNg duZOdS )zzPyTorch PatchTST model.    N)Callable)	dataclass)nn   )initialization)ACT2CLS)is_deepspeed_zero3_enabled)FlashAttentionKwargs)BaseModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)NegativeBinomialOutputNormalOutputStudentTOutput)ModelOutputTransformersKwargsauto_docstringlogging   )PatchTSTConfig        modulequerykeyvalueattention_maskscalingdropoutkwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )N         r   dim)ptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr   r&   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_output r0   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/patchtst/modeling_patchtst.pyeager_attention_forward'   s   
r2   c                       s   e Zd ZdZ					ddededed	ed
edededB f fddZ			dde	j
de	j
dB de	j
dB dedB dee dee	j
e	j
dB ee	j
 dB f fddZ  ZS )PatchTSTAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr   
is_decoderbias	is_causalconfigc                    s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).r!   r7   )super__init__r4   r5   r   head_dimr9   
ValueErrorr   r6   r8   r   Lineark_projv_projq_projout_proj)selfr4   r5   r   r6   r7   r8   r9   	__class__r0   r1   r=   G   s&   



zPatchTSTAttention.__init__hidden_stateskey_value_statesr   output_attentionsr   returnc                 K   s  |du}|j dd \}}|r|j d n|}	||d| jf}
||	d| jf}| |j|
 dd}|r4|n|}| |j| dd}| |j| dd}t| j	j
t}|| ||||f| jsbdn| j| j|d|\}}|||d }| |}||dfS )z#Input shape: Batch x Time x ChannelNr    r   r"   r   )r   r   rJ   )shaper>   rC   viewr*   rA   rB   r   get_interfacer9   _attn_implementationr2   r&   r   r   reshaper-   rD   )rE   rH   rI   r   rJ   r   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer/   r.   r0   r0   r1   forwardf   s8   	


zPatchTSTAttention.forward)r   FTFN)NNF)__name__
__module____qualname____doc__intfloatboolr   r=   r(   Tensorr   r	   tupler\   __classcell__r0   r0   rF   r1   r3   D   sL    "	r3   c                       6   e Zd ZdZdef fddZdejfddZ  Z	S )PatchTSTBatchNormzP
    Compute batch normalization over the sequence length (time) dimension.
    r9   c                    s"   t    tj|j|jd| _d S )Neps)r<   r=   r   BatchNorm1dd_modelnorm_eps	batchnormrE   r9   rF   r0   r1   r=      s   
zPatchTSTBatchNorm.__init__inputsc                 C   s"   | dd}| |}| ddS )a  
        Parameters:
            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
                input for Batch norm calculation
        Returns:
            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
        r   r"   )r*   rn   )rE   rp   outputr0   r0   r1   r\      s   
zPatchTSTBatchNorm.forward
r]   r^   r_   r`   r   r=   r(   rd   r\   rf   r0   r0   rF   r1   rh      s    rh   Frp   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                 C   s*  |dk s|dkrt d| d| j\}}}}| j}	t|d|  }
|r5tj|d||	d}|d|d}n	tj||||	d}tj||||	d}d|ddddd|
f< tj|dd}tj|dd}tj	|d|d	}|
dddd|}|durd|dd|ddddf< | | |}||d
 fS )a  random_masking: Mask the input considering the control variables.

    Args:
        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
            The input tensor to mask.
        mask_ratio (`float`):
            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
        unmasked_channel_indices (list, *optional*):
            Indices of channels that will not be masked.
        channel_consistent_masking (bool, *optional*, defaults to `False`):
            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
            across channels.
        mask_value (int, *optional*, defaults to 0):
            Define the value of masked patches for pretraining.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
        n]
    r   r   zMask ratio z has to be between 0 and 1.deviceNr    r#   )r$   index.r   )r?   rL   rx   ra   r(   randrepeatonesargsortgather	unsqueezemasked_fillrc   )rp   rs   rt   ru   rv   
batch_sizenum_channelssequence_lengthnum_featuresrx   len_keepnoisemaskids_shuffleids_restoreinputs_maskr0   r0   r1   random_masking   s&   r   num_forecast_mask_patchesc                 C   s  t |tr|g}dd |D }| j\}}}}tj|||| jd}	g }
d}t|}t||D ](\}}|dks9||krAtd| dt|| | }|
	|||g ||7 }q-t
|
dd d	}
||k rq|
d d
 ||  |
d d
< n||kr|
d d
 ||  |
d d
< d}|
D ]\}}}|| }d|	||dd| df< |}qt|	jd }|	| }	|	dddd|}	|durd|	dd|ddddf< | |	 |}||	d fS )a  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

    Parameters:
        inputs (`torch.Tensor`):
            Input of shape `(bs, num_channels, num_patch, patch_length)`
        num_forecast_mask_patches (`list`):
            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
        unmasked_channel_indices (`list`, *optional*):
            Indices of channels that are not masked.
        mask_value (`int`, *optional*, defaults to 0):
            Values in the masked patches will be filled by `mask_value`.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
    c                 S   s   g | ]}d qS )r   r0   .0_r0   r0   r1   
<listcomp>  s    z$forecast_masking.<locals>.<listcomp>rw   r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                 S   s   | d S )Nr"   r0   )xr0   r0   r1   <lambda>  s    z"forecast_masking.<locals>.<lambda>)r   r"   r    r   Nrz   )
isinstancera   rL   r(   zerosrx   sumzipr?   appendsortedrandpermr   r|   r   rc   )rp   r   rt   rv   forecast_mask_ratiosr   r   r   r   r   t_listtotal_lengthtotal_ratiopatch_lengthratiotemp_lenbatch1	patch_lenr   batch2permr   r0   r0   r1   forecast_masking   sB   


r   c                       rg   )PatchTSTPatchifyz
    A class to patchify the time series sequence into different patches

    Returns:
        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
    r9   c                    s   t    |j| _|j| _|j| _| j| jkr$td| j d| j dt| j| j| j | j d | _| j| j| jd   }| j| | _	d S )NzSequence length (z+) has to be greater than the patch length ()r   )
r<   r=   context_lengthr   r   patch_strider?   maxnum_patchessequence_start)rE   r9   new_sequence_lengthrF   r0   r1   r=   6  s   
 zPatchTSTPatchify.__init__past_valuesc                 C   sp   |j d }|| jkrtd| d| j d|dd| jdddf }|jd| j| jd}|dd }|S )a!  
        Parameters:
            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
                Input for patchification

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
        zInput sequence length (z%) doesn't match model configuration (r:   N)	dimensionr'   step)	rL   r   r?   r   unfoldr   r   r*   r-   )rE   r   r   rq   r0   r0   r1   r\   G  s   
	
zPatchTSTPatchify.forwardrr   r0   r0   rF   r1   r   .  s    r   c                       rg   )PatchTSTMaskinga  
    Class to perform random or forecast masking.

    Parameters:
        config (`PatchTSTConfig`): model config
    Returns:
        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
            Masked patched input
        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
            Bool tensor indicating True on masked points
    r9   c                    sX   t    |j| _|j| _|j| _|j| _|j| _|j| _| jd ur*t| j| _d S d S N)	r<   r=   random_mask_ratioru   	mask_typer   rt   rv   r   ro   rF   r0   r1   r=   k  s   

zPatchTSTMasking.__init__patch_inputc                 C   sr   | j dkrt|| j| j| j| jd\}}n| j dkr(t|| j| j| jd\}}n	td| j  d|	 }||fS )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input

        Return:
            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
                Masked patched input
            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
                Bool tensor indicating True on masked points

        random)rp   rs   rt   ru   rv   forecast)rp   r   rt   rv   zInvalid mask type .)
r   r   r   rt   ru   rv   r   r   r?   rc   )rE   r   masked_inputr   r0   r0   r1   r\   v  s$   

zPatchTSTMasking.forwardrr   r0   r0   rF   r1   r   ^  s    r   c                       s@   e Zd ZdZdef fddZd
dejdedB fdd	Z	  Z
S )PatchTSTEncoderLayerz 
    PatchTST encoder layer
    r9   c              
      s  t    |j| _t|j|j|j|d| _|jdkr t	
|jnt	 | _|jdkr0t|| _n|jdkr@t	j|j|jd| _nt|j d| jr~|jdkrVt	
|jnt	 | _|jdkrft|| _n|jdkrvt	j|j|jd| _nt|j dt	t	j|j|j|jdt|j  |jdkrt	
|jnt	 t	j|j|j|jd| _|jdkrt	
|jnt	 | _|jdkrt|| _n|jdkrt	j|j|jd| _nt|j d|j| _d S )N)r4   r5   r   r9   r   rn   	layernormri   z$ is not a supported norm layer type.r;   ) r<   r=   channel_attentionr3   rl   num_attention_headsattention_dropout	self_attnpath_dropoutr   DropoutIdentitydropout_path1	norm_typerh   norm_sublayer1	LayerNormrm   r?   dropout_path2norm_sublayer2
Sequentialr@   ffn_dimr7   r   activation_function
ff_dropoutffdropout_path3norm_sublayer3pre_normro   rF   r0   r1   r=     sD   
 

 


 

zPatchTSTEncoderLayer.__init__Nhidden_staterJ   c                 C   s  |j \}}}}||| ||}| jr(| j| ||d\}}}	|| | }n| j||d\}}}	| || | }|||||}| jr|dd	 }||| ||}| jrp| j| 
||d\}}
}	|| | }n| j||d\}}
}	| 
|| | }|||||}|dd	 }||| ||}| jr|| | | | }n| || | | }|||||}|f}|r|| jr||
fn|f7 }|S )a  
        Parameters:
            hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*):
                Past values of the time series
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
        Return:
            `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`

        )rH   rJ   r"   r   )rL   rM   r   r   r   r   rP   r   r*   r-   r   r   r   r   r   )rE   r   rJ   r   num_input_channelsr   rl   r/   r.   r   channel_attn_weightsoutputsr0   r0   r1   r\     sF   

zPatchTSTEncoderLayer.forwardr   )r]   r^   r_   r`   r   r=   r(   rd   rc   r\   rf   r0   r0   rF   r1   r     s    "2r   c                   @   sT   e Zd ZU eed< dZdZdZdZdZ	dZ
dZe dejfdd	Zdd
dZdS )PatchTSTPreTrainedModelr9   modelr   )timeFTr   c                 C   s~  t |trjt| jj| jj| jj | jj d }| jjr(tj	|j
dd |d7 }|| j|}t raddl}|jj|jdd |j dkrNt|j| W d   n
1 sXw   Y  dS dS t|j| dS t |tjtjfrt|j t|j t|dddurt|j t|j t|j dS dS t |tjrtj	|jd| jjd	 |jdurt|j dS dS dS )
z$
        Initialize weights
        r   g{Gz?)stdr   N)modifier_rankrunning_meanr   )meanr   ) r   PatchTSTPositionalEncodingr   r9   r   r   r   use_cls_tokeninitnormal_	cls_token_init_per   	deepspeedzeroGatheredParametersposition_encnumelcopy_r   r   rk   zeros_r7   ones_weightgetattrr   running_varnum_batches_trackedr@   init_std)rE   r   r   r   r   r0   r0   r1   _init_weights0  s@   
$
z%PatchTSTPreTrainedModel._init_weightsc                 C   s   t |tr
||_d S d S r   )r   PatchTSTEncodergradient_checkpointing)rE   r   r   r0   r0   r1   _set_gradient_checkpointingT  s   

z3PatchTSTPreTrainedModel._set_gradient_checkpointingN)F)r]   r^   r_   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr(   no_gradr   Moduler   r   r0   r0   r0   r1   r   %  s   
 #r   c                       2   e Zd Zdef fddZdejfddZ  ZS )PatchTSTEmbeddingr9   c                    sl   t    |j| _|j| _| jrt|j|j| _d S t	 | _t
|jD ]}| jt|j|j q%d S r   )r<   r=   r   share_embeddingr   r@   r   rl   input_embedding
ModuleListranger   )rE   r9   r   rF   r0   r1   r=   Z  s   

zPatchTSTEmbedding.__init__r   c                    sj    j d }|jkrtdj d| djr  }|S  fddt|D }tj|dd}|S )a%  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input for embedding
        return:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
        r   z&The defined number of input channels (zQ) in the config has to be the same as the number of channels in the batch input (r   c              	      s2   g | ]}j |  d d |d d d d f qS r   )r  r   ir   rE   r0   r1   r   x  s   2 z-PatchTSTEmbedding.forward.<locals>.<listcomp>r#   )rL   r   r?   r  r  r  r(   stack)rE   r   r   
embeddingsr0   r	  r1   r\   f  s   
	


zPatchTSTEmbedding.forward	r]   r^   r_   r   r=   r(   rd   r\   rf   r0   r0   rF   r1   r  Y  s    r  c                       sV   e Zd ZdZdedef fddZedededej	fddZ
d	ejfd
dZ  ZS )r   z'
    Class for positional encoding
    r9   r   c                    sz   t    |j| _|j| _|jr!ttddd|j| _	|d7 }| 
||| _|jdkr6t|j| _d S t | _d S )Nr   r   )r<   r=   r   r   r   	Parameterr(   r   rl   r   r   r   positional_dropoutr   r   rE   r9   r   rF   r0   r1   r=     s   
z#PatchTSTPositionalEncoding.__init__rK   c                 C   s   | j dkrtjt|| jdd}|S | j dkrst|| j}td|d}t	td| jdt
d| j   }t|| |d d dd df< t|| |d d dd df< ||  }|| d	  }tj|d
d}|S t| j  d)Nr   Trequires_gradsincosr   r   r"   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)positional_encoding_typer   r  r(   randnrl   r   aranger   expmathlogsincosr   r   r?   )r9   r   r   positiondiv_termr0   r0   r1   r     s    

(  
z#PatchTSTPositionalEncoding._init_per   c                 C   s   | j r8| || jdd d d f  }| j| jd dd d f  }||jd | jdd}tj||fdd}|S | || j }|S )Nr   r   r    r"   r#   )	r   r  r   r   expandrL   r   r(   cat)rE   r   r   
cls_tokensr   r0   r0   r1   r\     s    z"PatchTSTPositionalEncoding.forward)r]   r^   r_   r`   r   ra   r=   staticmethodr   r  r   r(   rd   r\   rf   r0   r0   rF   r1   r   }  s    r   c                	       sT   e Zd ZdZdedef fddZ		ddejde	dB d	e	dB d
e
fddZ  ZS )r   z
    PatchTST Encoder
    r9   r   c                    sT   t    d| _t | _t || _t fddt	 j
D | _|   d S )NFc                    s   g | ]}t  qS r0   )r   r  r9   r0   r1   r         z,PatchTSTEncoder.__init__.<locals>.<listcomp>)r<   r=   r   r  embedderr   positional_encoderr   r  r  num_hidden_layerslayers	post_initr  rF   r"  r1   r=     s   
 zPatchTSTEncoder.__init__Nr   output_hidden_statesrJ   rK   c           
      K   s   |dur|n| j j}|dur|n| j j}| |}| |}|r"dnd}|r(dnd}| jD ]}|r6||f }|||d}	|	d }|rI||	d f }q-t|||dS )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Past values of the time series
            output_hidden_states (bool, optional): Indicates if hidden states should be outputted.
            output_attentions (bool, optional): Indicates if attentions should be outputted.

        return:
            `BaseModelOutput`
        Nr0   )r   rJ   r   r   )last_hidden_staterH   
attentions)r9   rJ   r)  r$  r%  r'  r
   )
rE   r   r)  rJ   r   r   encoder_statesall_attentionsencoder_layerlayer_outputsr0   r0   r1   r\     s    



zPatchTSTEncoder.forwardNN)r]   r^   r_   r`   r   ra   r=   r(   rd   rc   r
   r\   rf   r0   r0   rF   r1   r     s    r   zG
    Base class for model's outputs, with potential hidden states.
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZe	ej dB ed< dZ
e	ej dB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed	< dS )
PatchTSTModelOutputa>  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of
        the model at the output of each layer plus the optional initial embedding outputs.
    mask (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*):
        Bool masked tensor indicating which patches are masked
    loc (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    scale (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*):
        Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
        Patched input to the Transformer
    Nr*  rH   r+  r   locscaler   )r]   r^   r_   r`   r*  r(   FloatTensorr   rH   re   r+  r   r3  r4  r   r0   r0   r0   r1   r2    s   
 r2  z4
    Output type of [`PatchTSTForPretraining`].
    c                   @   b   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dS )PatchTSTForPretrainingOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    prediction_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction outputs of the time series modeling heads.
    Nlossprediction_outputrH   r+  )r]   r^   r_   r`   r8  r(   r5  r   r9  rH   re   r+  r0   r0   r0   r1   r7       
 r7  z3
    Output type of [`PatchTSTForRegression`].
    c                   @   r6  )PatchTSTForRegressionOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Regression outputs of the time series modeling heads.
    Nr8  regression_outputsrH   r+  )r]   r^   r_   r`   r8  r(   r5  r   r<  rH   re   r+  r0   r0   r0   r1   r;  )  r:  r;  z3
    Output type of [`PatchTSTForPrediction`].
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dZejdB ed< dZejdB ed< dS )	PatchTSTForPredictionOutputa!  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        MSE loss.
    prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, -1)`):
        Prediction outputs of the time series modeling heads.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
        Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
        Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
    Nr8  prediction_outputsrH   r+  r3  r4  )r]   r^   r_   r`   r8  r(   r5  r   r>  rH   re   r+  r3  r4  r0   r0   r0   r1   r=  =  s   
 r=  z7
    Output type of [`PatchTSTForClassification`].
    c                   @   r6  )PatchTSTForClassificationOutputa  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the masked language modeling loss and the next sequence prediction
        (classification) loss.
    prediction_logits (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
        Prediction scores of the PatchTST modeling head (scores before SoftMax).
    Nr8  prediction_logitsrH   r+  )r]   r^   r_   r`   r8  r(   r5  r   r@  rH   re   r+  r0   r0   r0   r1   r?  ]  s   
 r?  z
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.
    c                   @   s$   e Zd ZU dZdZejdB ed< dS )SamplePatchTSTOutputz
    sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, num_targets)`):
        Sampled values from the chosen distribution.
    N	sequences)r]   r^   r_   r`   rB  r(   r5  r   r0   r0   r0   r1   rA  r  s   
 rA  inputtargetrK   c                 C   s   |  | S )zc
    Computes the negative log likelihood loss from input distribution with respect to target.
    )log_prob)rC  rD  r0   r0   r1   nll  s   rF  input_tensorweightsc                 C   sr   |dur3t |dk| | t | }t j|r|j|dn| dd}|r-|j|d| S | | S | j|dS )aj  
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    Nr   r#         ?min)r(   where
zeros_likeclampr   r   )rG  rH  r$   weighted_tensorsum_weightsr0   r0   r1   weighted_average  s
   " rQ  c                	       P   e Zd ZdZdef fddZdejdejdeejejejf fdd	Z	  Z
S )
PatchTSTStdScalerz
    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
    subtracting from the mean and dividing by the standard deviation.
    r9   c                    sV   t    t|dr|jnd| _t|dr|jnd| _t|dr&|j| _d S d| _d S )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r<   r=   hasattrrT  r$   rU  rV  ro   rF   r0   r1   r=     s   
 zPatchTSTStdScaler.__init__dataobserved_indicatorrK   c                 C   sz   |j | j| jd}|d}|| j | j| jd| }|| | d j | j| jd| }t|| j }|| | ||fS )C  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        rU  rI  r"   )r   r$   rU  	clamp_minr(   sqrtrV  )rE   rX  rY  denominatorr3  variancer4  r0   r0   r1   r\     s   
"zPatchTSTStdScaler.forwardr]   r^   r_   r`   r   r=   r(   rd   re   r\   rf   r0   r0   rF   r1   rS    s    rS  c                	       rR  )
PatchTSTMeanScalerz
    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
    accordingly.
    r9   c                    sl   t    t|dr|jnd| _t|dr|jnd| _t|dr#|jnd| _t|dr1|j| _d S d | _d S )NrT  r   rU  TrV  绽|=default_scale)r<   r=   rW  rT  r$   rU  rV  rc  ro   rF   r0   r1   r=     s
   
 zPatchTSTMeanScaler.__init__rX  rY  rK   c           
      C   s   ||   j| jdd}|j| jdd}|tj|dd }| jdu r:|jdd}tj|ddd}t|| }n| jt| }t|dk||}tj|| j	d}|| }	| j
sa|j| jd}|	t||fS )rZ  Tr[  r   rJ  Nr   r#   )absr   r$   r(   rN  rc  squeeze	ones_likerL  rV  rU  rM  )
rE   rX  rY  ts_sumnum_observedr4  	batch_sumbatch_observationsrc  scaled_datar0   r0   r1   r\     s   
zPatchTSTMeanScaler.forwardr`  r0   r0   rF   r1   ra    s    ra  c                
       sX   e Zd ZdZdef fddZ	ddejdejdB deejejejf fd	d
Z	  Z
S )PatchTSTNOPScalerz|
    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
    r9   c                    s@   t    t|dr|jnd| _t|dr|j| _d S d| _d S )NrT  r   rU  T)r<   r=   rW  rT  r$   rU  ro   rF   r0   r1   r=     s   
 zPatchTSTNOPScaler.__init__NrX  rY  rK   c                 C   sB   t j|ddj| j| jd}t j|ddj| j| jd}|||fS )a  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        Fr  )r$   rU  )r(   rf  r   r$   rU  rM  )rE   rX  rY  r4  r3  r0   r0   r1   r\   	  s   
zPatchTSTNOPScaler.forwardr   r`  r0   r0   rF   r1   rl    s    rl  c                	       sL   e Zd Zdef fddZdejdejdeejejejf fddZ  Z	S )	PatchTSTScalerr9   c                    sR   t    |jdks|jdu rt|| _d S |jdkr"t|| _d S t|| _d S )Nr   Tr   )r<   r=   r   ra  scalerrS  rl  ro   rF   r0   r1   r=     s   

zPatchTSTScaler.__init__rX  rY  rK   c                 C   s   |  ||\}}}|||fS )a>  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Input for scaler calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, um_input_channels)`)
        )rn  )rE   rX  rY  r3  r4  r0   r0   r1   r\   $  s   
zPatchTSTScaler.forward)
r]   r^   r_   r   r=   r(   rd   re   r\   rf   r0   r0   rF   r1   rm    s    	rm  c                       sr   e Zd Zdef fddZ					ddejdejdB dejdB dedB d	edB d
edB dee	B fddZ
  ZS )PatchTSTModelr9   c                    sf   t  | t|| _t|| _|j| _| jj}| jr!t|| _	nt
 | _	t||d| _|   d S )N)r   )r<   r=   rm  rn  r   
patchifierdo_mask_inputr   r   maskingr   r   r   encoderr(  r  rF   r0   r1   r=   8  s   


zPatchTSTModel.__init__Nr   past_observed_maskfuture_valuesr)  rJ   return_dictrK   c              	   K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r't|}| ||\}}	}
| |}| jr@| 	|\}}n| 	|d}}| j
|||d}|sk|j|j|jf}|||	|
|f }tdd |D S t|j|j|j||	|
|dS )a  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.BoolTensor` of shape `(batch_size, prediction_length, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTModel

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> model = PatchTSTModel.from_pretrained("namctin/patchtst_etth1_pretrain")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> last_hidden_state = outputs.last_hidden_state
        ```N)r   r)  rJ   c                 s   s    | ]	}|d ur|V  qd S r   r0   )r   vr0   r0   r1   	<genexpr>      z(PatchTSTModel.forward.<locals>.<genexpr>)r*  rH   r+  r   r3  r4  r   )r9   use_return_dictrJ   r)  r(   rf  rn  rp  rq  rr  rs  r*  rH   r+  re   r2  )rE   r   rt  ru  r)  rJ   rv  r   scaled_past_valuesr3  r4  patched_valuesmasked_valuesr   encoder_outputr   r0   r0   r1   r\   J  s6   7

zPatchTSTModel.forwardNNNNN)r]   r^   r_   r   r=   r(   rd   rc   re   r2  r\   rf   r0   r0   rF   r1   ro  6  s,    	ro  c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	PatchTSTMaskPretrainHeadz-
    Pretraining head for mask modelling
    r9   c                    sH   t    |jdkrt|jnt | _t|j|j	| _
|j| _d S Nr   )r<   r=   head_dropoutr   r   r   r   r@   rl   r   linearr   ro   rF   r0   r1   r=     s   
 z!PatchTSTMaskPretrainHead.__init__	embeddingrK   c                 C   s:   |  | |}| jr|ddddddddf }|S )a  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True

        Nr   )r  r   r   )rE   r  r0   r0   r1   r\     s    z PatchTSTMaskPretrainHead.forwardrr   r0   r0   rF   r1   r    s    r  z*
    The PatchTST for pretrain model.
    c                       sf   e Zd Zdef fddZ				ddejdejdB dedB dedB d	edB d
ee	B fddZ
  ZS )PatchTSTForPretrainingr9   c                    s4   t  | d|_t|d| _t|| _|   d S )NTr"  )r<   r=   rq  ro  r   r  headr(  ro   rF   r0   r1   r=     s
   
zPatchTSTForPretraining.__init__Nr   rt  r)  rJ   rv  rK   c                 K   s   |dur|n| j j}| j||||dd}| |j}tjdd}	|	||j}
|
jdd|j	 
 |j	
 d  }|j}|sU|f|d	d
  }|durQ|f| }|S |}|S t||||jdS )a	  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPretraining

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Config for random mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='random',
        ...     random_mask_ratio=0.4,
        ...     use_cls_token=True,
        ... )
        >>> # Config for forecast mask pretraining
        >>> config = PatchTSTConfig(
        ...     num_input_channels=7,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     mask_type='forecast',
        ...     num_forecast_mask_patches=5,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForPretraining(config)

        >>> # during training, one provides both past and future values
        >>> outputs = model(past_values=batch["past_values"])

        >>> loss = outputs.loss
        >>> loss.backward()
        ```NTr   rt  r)  rJ   rv  none	reductionr    r#   rb  r   )r8  r9  rH   r+  )r9   rz  r   r  r*  r   MSELossr   r   r   r   rH   r7  r+  )rE   r   rt  r)  rJ   rv  r   model_outputx_hatr8  loss_valmasked_lossr,  r   r0   r0   r1   r\     s,   F
$
zPatchTSTForPretraining.forward)NNNN)r]   r^   r_   r   r=   r(   rd   rc   re   r7  r\   rf   r0   r0   rF   r1   r    s&    r  c                       r  )PatchTSTClassificationHeadr9   c                    sd   t    |j| _|j| _tjdd| _|jdkrt|jnt	 | _
t|j|j |j| _d S Nr   	start_dimr   )r<   r=   r   pooling_typer   Flattenflattenr  r   r   r   r@   r   rl   num_targetsr  ro   rF   r0   r1   r=   :  s   
 z#PatchTSTClassificationHead.__init__r  c                 C   s   | j r|dddddddf }n"| jdkr|jdd}n| jdkr+|jddj}n	td| j d| |}| | |}|S )	a[  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, num_targets)`

        Nr   r   r"   r#   r   pooling operator  is not implemented yet)	r   r  r   r   valuesr?   r  r  r   rE   r  pooled_embeddingrq   r0   r0   r1   r\   B  s   



z"PatchTSTClassificationHead.forwardr  r0   r0   rF   r1   r  9  s    r  z0
    The PatchTST for classification model.
    c                       st   e Zd Zdef fddZe					ddejdejdB dedB dedB d	edB d
edB de	e
B fddZ  ZS )PatchTSTForClassificationr9   c                    sB   t  | |jrtd d|_t|| _t|| _| 	  d S )N+Setting `do_mask_input` parameter to False.F)
r<   r=   rq  loggerwarningro  r   r  r  r(  ro   rF   r0   r1   r=   d  s   


z"PatchTSTForClassification.__init__Nr   target_valuesrt  r)  rJ   rv  rK   c                 K   s   |dur|n| j j}| j||||dd}| |j}	d}
|dur)t }||	|}
|sC|	f|dd  }|
dur?|
f| }|S |}|S t|
|	|j|j	dS )ac  
        past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
            Input sequence to the model
        target_values (`torch.Tensor`, *optional*):
            Labels associates with the `past_values`
        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:

            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForClassification

        >>> # classification task with two input channel2 and 3 classes
        >>> config = PatchTSTConfig(
        ...     num_input_channels=2,
        ...     num_targets=3,
        ...     context_length=512,
        ...     patch_length=12,
        ...     stride=12,
        ...     use_cls_token=True,
        ... )
        >>> model = PatchTSTForClassification(config=config)

        >>> # during inference, one only provides past values
        >>> past_values = torch.randn(20, 512, 2)
        >>> outputs = model(past_values=past_values)
        >>> labels = outputs.prediction_logits
        ```NTr  r   r   )r8  r@  rH   r+  )
r9   rz  r   r  r*  r   CrossEntropyLossr?  rH   r+  )rE   r   r  rt  r)  rJ   rv  r   r  y_hatr  r8  r   r0   r0   r1   r\   r  s2   -
z!PatchTSTForClassification.forwardr  )r]   r^   r_   r   r=   r   r(   rd   rc   re   r?  r\   rf   r0   r0   rF   r1   r  ^  s.    	r  z,
    The PatchTST for regression Model.
    c                       s8   e Zd Zd	dedef fddZdejfddZ  Z	S )
PatchTSTPredictionHeadNr9   r   c                    sD  t    |j| _|j| _|j| _|j| _| js| jr|j}n|j| }| jsvt | _	t | _
t | _t| jD ]8}| jtjdd |du rW| j	t||j n	| j	|| | j
|jdkrnt|jnt  q;dS tjdd| _|du rt||j| _n||| _|jdkrt|jnt | _dS )a  
        num_patches (`int`):
            The number of patches in the input sequence.
        distribution_output (`DistributionOutput`, *optional*):
            The distribution output layer for probabilistic forecasting. If None, a linear output layer is used.
        r"   r  Nr   )r<   r=   share_projectionr   r   r  rl   r   r  projectionsdropoutsflattensr  r   r  r@   prediction_lengthget_parameter_projectionr  r   r   r  
projectionr   )rE   r9   r   distribution_outputr>   r  rF   r0   r1   r=     s0   




($zPatchTSTPredictionHead.__init__r  c                 C   s  | j r|dddddddf }n| jdkr|jdd}n| jdkr+|jddj}n|}| jseg }t| jD ]%}| j| |dd|ddf }| j	| |}| j
| |}|| q7tj|dd}n| |}| |}| |}t|trtdd	 |D }|S |dd}|S )
aj  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, forecast_len, num_channels)`

        Nr   r   r"   r#   r   r   c                 s   s    | ]	}| d dV  qdS )r"   r   N)r*   )r   zr0   r0   r1   rx    ry  z1PatchTSTPredictionHead.forward.<locals>.<genexpr>)r   r  r   r   r  r  r  r   r  r  r  r   r(   r
  r  r   r  r   re   r*   )rE   r  r  rq   r  r0   r0   r1   r\     s.   


 



zPatchTSTPredictionHead.forwardr   )
r]   r^   r_   r   ra   r=   r(   rd   r\   rf   r0   r0   rF   r1   r    s    +r  z,
    The PatchTST for prediction model.
    c                       s   e Zd Zdef fddZ					ddejdejdB dejdB dedB d	edB d
edB dee	B fddZ
e 	ddejdejdB defddZ  ZS )PatchTSTForPredictionr9   c                    s   t  | |jrtd d|_t|| _|jdkrd | _n/|jdkr,t	|j
d| _n"|jdkr9t|j
d| _n|jdkrFt|j
d| _ntd|j t|| jjj| jd	| _|   d S )
Nr  Fmse	student_tr#   normalnegative_binomialUnknown distribution output )r  )r<   r=   rq  r  r  ro  r   r8  r  r   r  r   r   r?   r  rp  r   r  r(  ro   rF   r0   r1   r=   &  s$   





zPatchTSTForPrediction.__init__Nr   rt  ru  r)  rJ   rv  rK   c                 K   s   |dur|n| j j}| j||||dd}| |j}	d}
| jr"|	}n|	|j |j }|durQ| jrF| jj|	|j|jd}t	||}
t
|
}
ntjdd}|||}
|j}|j}|sq|f|dd  }|
durm|
f| }|S |}|S t|
||j|j||d	S )
aV	  
        Parameters:
            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
                Input sequence to the model
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*):
                Future target values associated with the `past_values`
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers
            output_attentions (`bool`, *optional*):
                Whether or not to return the output attention of all layers
            return_dict (`bool`, *optional*):
                Whether or not to return a `ModelOutput` instead of a plain tuple.

        Returns:
            `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
            `config.return_dict`=False)

        Examples:

        ```python
        >>> from huggingface_hub import hf_hub_download
        >>> import torch
        >>> from transformers import PatchTSTConfig, PatchTSTForPrediction

        >>> file = hf_hub_download(
        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
        ... )
        >>> batch = torch.load(file)

        >>> # Prediction task with 7 input channels and prediction length is 96
        >>> model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast")

        >>> # during training, one provides both past and future values
        >>> outputs = model(
        ...     past_values=batch["past_values"],
        ...     future_values=batch["future_values"],
        ... )

        >>> loss = outputs.loss
        >>> loss.backward()

        >>> # during inference, one only provides past values, the model outputs future values
        >>> outputs = model(past_values=batch["past_values"])
        >>> prediction_outputs = outputs.prediction_outputs
        ```NTr  r3  r4  r   r  r   r    )r8  r>  rH   r+  r3  r4  )r9   rz  r   r  r*  r  r4  r3  distributionrF  rQ  r   r  r=  rH   r+  )rE   r   rt  ru  r)  rJ   rv  r   r  r  r  	y_hat_outr  r8  r3  r4  r   r0   r0   r1   r\   C  sL   >



zPatchTSTForPrediction.forwardc                    sr   | j j}| |d|dd}| jr.| jj|j|j|jd  fddt|D }tj	|dd}n|j
d}t|d	S )
a   
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)`
            for multivariate predictions.
        NF)r   ru  rt  r)  r  c                       g | ]}   qS r0   sampler   r  r0   r1   r     r#  z2PatchTSTForPrediction.generate.<locals>.<listcomp>r   r#   rB  )r9   num_parallel_samplesr  r  r>  r3  r4  r  r(   r
  r   rA  rE   r   rt  r  r   samplesr0   r  r1   generate  s   
zPatchTSTForPrediction.generater  r   )r]   r^   r_   r   r=   r(   rd   rc   re   r=  r\   r   rA  r  rf   r0   r0   rF   r1   r     s>     	
nr  c                       s8   e Zd ZdZd	def fddZdejfddZ  Z	S )
PatchTSTRegressionHeadz
    Regression head
    Nr9   c                    s   t    |j| _|j| _|j| _|| _|j|j }t	j
dd| _|jdkr,t	|jnt	 | _|d u r?t	||j| _d S ||| _d S r  )r<   r=   output_rangey_ranger   r  r  r   rl   r   r  r  r  r   r   r   r@   r  r  r  )rE   r9   r  r>   rF   r0   r1   r=     s   
 zPatchTSTRegressionHead.__init__r  c                 C   s   | j r|dddddddf }n"| jdkr|jdd}n| jdkr+|jddj}n	td| j d| | |}| |}| j	du | j
du@ r_t|| j
d	 | j
d   | j
d  }|S )
aY  
        Parameters:
            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
                Embedding from the model
        Returns:
            `torch.Tensor` of shape `(bs, output_dim)`

        Nr   r   r"   r#   r   r  r  r   )r   r  r   r   r  r?   r   r  r  r  r  r(   sigmoidr  r0   r0   r1   r\     s   



(zPatchTSTRegressionHead.forwardr   rr   r0   r0   rF   r1   r    s    r  z,
    The PatchTST for regression model.
    c                       s   e Zd Zdef fddZe					ddejdejdB dejdB dedB d	edB d
edB de	e
B fddZe 	ddejdejdB defddZ  ZS )PatchTSTForRegressionr9   c                    s   t  | |jrtd d|_t|| _|jdkrd | _n/|jdkr,t	|j
d| _n"|jdkr9t|j
d| _n|jdkrFt|j
d| _ntd|j t|| j| _|   d S )	Nr  Fr  r  r#   r  r  r  )r<   r=   rq  r  r  ro  r   r8  r  r   r  r   r   r?   r  r  r(  ro   rF   r0   r1   r=     s    





zPatchTSTForRegression.__init__Nr   r  rt  r)  rJ   rv  rK   c                    s   |dur|n j j} j||||dd} |j}	d}
|durI jr> j|	}t fdd|	D }	t||}
t	|
}
nt
jdd}
|
|	|}
|sc|	f|dd	  }|
dur_|
f| }|S |}|S t|
|	|j|jd
S )a#  
        past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
            Input sequence to the model
        target_values (`torch.Tensor` of shape `(bs, num_input_channels)`):
            Target values associates with the `past_values`
        past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:

            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
            Whether or not to return a `ModelOutput` instead of a plain tuple.

        Examples:

        ```python
        >>> from transformers import PatchTSTConfig, PatchTSTForRegression

        >>> # Regression task with 6 input channels and regress 2 targets
        >>> model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression")

        >>> # during inference, one only provides past values, the model outputs future values
        >>> past_values = torch.randn(20, 512, 6)
        >>> outputs = model(past_values=past_values)
        >>> regression_outputs = outputs.regression_outputs
        ```NTr  c                 3   s     | ]}| d  jjV  qdS )r    N)rM   r9   r  )r   itemrE   r0   r1   rx  p  s    z0PatchTSTForRegression.forward.<locals>.<genexpr>r   r  r   r   )r8  r<  rH   r+  )r9   rz  r   r  r*  r  r  re   rF  rQ  r   r  r;  rH   r+  )rE   r   r  rt  r)  rJ   rv  r   r  r  r8  r  r   r0   r  r1   r\   9  s<   &


zPatchTSTForRegression.forwardc                    sb   | j j}| |d|dd}| j|j  fddt|D }tj|ddd|| j j	}t
|d	S )
a  
        Generate sequences of sample predictions from a model with a probability distribution head.

        Parameters:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.
            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
            samples, num_targets)`.
        NF)r   r  rt  r)  c                    r  r0   r  r   r  r0   r1   r     r#  z2PatchTSTForRegression.generate.<locals>.<listcomp>r   r#   r    r  )r9   r  r  r  r<  r  r(   r
  rM   r  rA  r  r0   r  r1   r    s   
zPatchTSTForRegression.generater  r   )r]   r^   r_   r   r=   r   r(   rd   rc   re   r;  r\   r   rA  r  rf   r0   r0   rF   r1   r    s@    	Jr  )ro  r   r  r  r  r  )Nr   )NFr   r  r0  )Pr`   r  collections.abcr   dataclassesr   r(   r    r   r   activationsr   integrations.deepspeedr   modeling_flash_attention_utilsr	   modeling_outputsr
   modeling_utilsr   r   processing_utilsr   time_series_utilsr   r   r   utilsr   r   r   r   configuration_patchtstr   
get_loggerr]   r  r   rd   rb   r2   r3   rh   listrc   ra   r   r   r   r   r   r   r  r   r   r2  r7  r;  r=  r?  rA  distributionsDistributionrF  rQ  rS  ra  rl  rm  ro  r  r  r  r  r  r  r  r  __all__r0   r0   r0   r1   <module>   s  


V
=
D0< 3$8?
"$7qp%X` >7 