"""PyTorch OpenAI ImageGPT model."""

import math
import os
from typing import Any, Optional, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
from ...utils import auto_docstring, logging, torch_float
from .configuration_imagegpt import ImageGPTConfig


logger = logging.get_logger(__name__)


def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path):
    """
    Load tf checkpoints in a pytorch model
    """
    try:
        import re

        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(imagegpt_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []

    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array.squeeze())

    for name, array in zip(names, arrays):
        name = name[6:]  # skip "model/"
        name = name.split("/")

        # adam_v and adam_m are variables used by AdamWeightDecayOptimizer to calculate m and v;
        # they are not required for using a pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ) or name[-1] in ["_step"]:
            logger.info("Skipping {}".format("/".join(name)))
            continue

        pointer = model
        if name[-1] not in ["wtet"]:
            pointer = getattr(pointer, "transformer")

        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
                scope_names = re.split(r"(\d+)", m_name)
            else:
                scope_names = [m_name]

            if scope_names[0] == "w" or scope_names[0] == "g":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "b":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            elif scope_names[0] in ["q_proj", "k_proj", "v_proj"]:
                pointer = getattr(pointer, "c_attn")
                pointer = getattr(pointer, "weight")
            elif len(name) == 3 and name[1] == "attn" and scope_names[0] == "c_proj":
                pointer = getattr(pointer, scope_names[0])
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "wtet":
                pointer = getattr(pointer, "lm_head")
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "sos":
                pointer = getattr(pointer, "wte")
                pointer = getattr(pointer, "weight")
            else:
                pointer = getattr(pointer, scope_names[0])
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        if (len(name) > 1 and name[1] == "attn") or name[-1] == "wtet" or name[-1] == "sos" or name[-1] == "wte":
            # the array is used to initialize only part of the pointer, so the sizes won't match
            pass
        else:
            try:
                assert pointer.shape == array.shape
            except AssertionError as e:
                e.args += (pointer.shape, array.shape)
                raise

        logger.info(f"Initialize PyTorch weight {name}")

        if name[-1] == "q_proj":
            pointer.data[:, : config.n_embd] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
        elif name[-1] == "k_proj":
            pointer.data[:, config.n_embd : 2 * config.n_embd] = torch.from_numpy(
                array.reshape(config.n_embd, config.n_embd)
            ).T
        elif name[-1] == "v_proj":
            pointer.data[:, 2 * config.n_embd :] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
        elif len(name) == 3 and name[1] == "attn" and name[2] == "c_proj":
            pointer.data = torch.from_numpy(array.reshape(config.n_embd, config.n_embd))
        elif name[-1] == "wtet":
            pointer.data = torch.from_numpy(array)
        elif name[-1] == "wte":
            pointer.data[: config.vocab_size - 1] = torch.from_numpy(array)
        elif name[-1] == "sos":
            pointer.data[-1] = torch.from_numpy(array)
        else:
            pointer.data = torch.from_numpy(array)

    return model


class ImageGPTLayerNorm(nn.Module):
    def __init__(self, hidden_size: tuple[int], eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.Tensor(hidden_size))

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        # RMS-style normalization: the input is not mean-centered
        tensor = tensor / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps)
        tensor = tensor * self.weight
        return tensor


class ImageGPTAttention(nn.Module):
    def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
        super().__init__()

        max_positions = config.max_position_embeddings
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights
        self.is_cross_attention = is_cross_attention

        # Layer-wise attention scaling, reordering, and upcasting
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])

        # Prune the Conv1D layers
        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)

        # Update hyperparameters
        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
        self.num_heads = self.num_heads - len(heads)
        self.pruned_heads = self.pruned_heads.union(heads)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if self.scale_attn_weights:
            attn_weights = attn_weights / torch_float(value.size(-1) ** 0.5)

        # Layer-wise attention scaling
        if self.scale_attn_by_inverse_layer_idx:
            attn_weights = attn_weights / float(self.layer_idx + 1)

        if not self.is_cross_attention:
            # only the "normal" attention layer implements the causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            # needs to be a tensor on the same device/dtype as attn_weights
            mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        # Downcast (if necessary) back to V's dtype (no-op outside mixed precision)
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
        # Use `torch.baddbmm` (a bit more efficient thanks to the alpha param for scaling -- from Megatron-LM)
        bsz, num_heads, q_seq_len, dk = query.size()
        _, _, k_seq_len, _ = key.size()

        # Preallocate attn_weights for `baddbmm`
        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)

        # Compute the scale factor
        scale_factor = 1.0
        if self.scale_attn_weights:
            scale_factor /= float(value.size(-1)) ** 0.5

        if self.scale_attn_by_inverse_layer_idx:
            scale_factor /= float(self.layer_idx + 1)

        # Upcast (turn off autocast) and reorder (scale K by 1 / root(dk))
        with torch.autocast(query.device.type, enabled=False):
            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

        if not self.is_cross_attention:
            # only the "normal" attention layer implements the causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            # needs to be a tensor on the same device/dtype as attn_weights
            mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        if attn_weights.dtype != torch.float32:
            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            attn_weights = attn_weights * head_mask

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(*new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple:
        is_cross_attention = encoder_hidden_states is not None
        batch_size, seq_len, _ = hidden_states.shape

        is_updated = False
        if layer_past is not None:
            if isinstance(layer_past, EncoderDecoderCache):
                is_updated = layer_past.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    curr_past_key_value = layer_past.cross_attention_cache
                else:
                    curr_past_key_value = layer_past.self_attention_cache
            else:
                curr_past_key_value = layer_past

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention:
            if not hasattr(self, "q_attn"):
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`."
                )
            query = self.q_attn(hidden_states)
            attention_mask = encoder_attention_mask
            if layer_past is not None and is_updated:
                # after the first generated id, we can re-use all cross-attention key/value states from the cache
                key = curr_past_key_value.layers[self.layer_idx].keys
                value = curr_past_key_value.layers[self.layer_idx].values
            else:
                key, value = self.c_attn(current_states).split(self.split_size, dim=2)
                key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
                value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
                if layer_past is not None:
                    # save the cross-attention key/value states and mark this layer's cache as filled
                    key, value = curr_past_key_value.update(key, value, self.layer_idx)
                    layer_past.is_updated[self.layer_idx] = True
        else:
            query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2)
            key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
            if layer_past is not None:
                # save the self-attention key/value states for fast auto-regressive generation
                key, value = curr_past_key_value.update(
                    key, value, self.layer_idx, {"cache_position": cache_position}
                )

        query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        if self.reorder_and_upcast_attn:
            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
        else:
            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        return attn_output, attn_weights


class ImageGPTMLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = Conv1D(intermediate_size, embed_dim)
        self.c_proj = Conv1D(embed_dim, intermediate_size)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ImageGPTBlock(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

        self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = ImageGPTAttention(config, layer_idx=layer_idx)
        self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        if config.add_cross_attention:
            self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx)
            self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.mlp = ImageGPTMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attn_output = attn_outputs[0]
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states
            hidden_states = self.ln_cross_attn(hidden_states)
            cross_attn_outputs = self.crossattention(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = residual + attn_output
            # add cross attentions if we output attention weights
            outputs = outputs + cross_attn_outputs[1:]

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        return (hidden_states,) + outputs


@auto_docstring
class ImageGPTPreTrainedModel(PreTrainedModel):
    config: ImageGPTConfig
    load_tf_weights = load_tf_weights_in_imagegpt
    base_model_prefix = "transformer"
    main_input_name = "input_ids"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ImageGPTBlock"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, Conv1D)):
            # Slightly different from the TF version, which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, ImageGPTLayerNorm):
            module.weight.data.fill_(1.0)

        # Reinitialize selected weights subject to the OpenAI GPT-2 paper scheme: scale the weights of
        # residual layers at initialization by a factor of 1/sqrt(N), where N is the number of residual layers.
        for name, p in module.named_parameters():
            if "c_proj" in name and "weight" in name:
                # Special scaled initialization: there are 2 layer norms per transformer block
                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))


@auto_docstring
class ImageGPTModel(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache:
            if past_key_values is None:
                past_key_values = EncoderDecoderCache(
                    DynamicCache(config=self.config), DynamicCache(config=self.config)
                )
            elif isinstance(past_key_values, tuple):
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers "
                    "v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if position_ids is None:
            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0)

        # Turn the 2D padding mask into an additive bias of shape (batch_size, 1, 1, to_seq_length):
        # 0.0 for positions to attend to, the dtype minimum for masked positions.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to (batch_size, num_heads, seq_length, seq_length)
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        # Prepare the head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = input_shape + (hidden_states.size(-1),)

        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            # Model parallel: move the masks to the device of the current layer
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                if attention_mask is not None:
                    attention_mask = attention_mask.to(hidden_states.device)
                if isinstance(head_mask, torch.Tensor):
                    head_mask = head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                past_key_values,
                attention_mask,
                head_mask[i],
                encoder_hidden_states,
                encoder_attention_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[2],)

            # Model parallel: if this is the last layer on its device, move to the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(*output_shape)
        # add the last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.transformer = ImageGPTModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False)

        # Model parallel
        self.model_parallel = False
        self.device_map = None

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 4 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
            cross_attentions=transformer_outputs.cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    """
)
class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = ImageGPTModel(config)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Any,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        # average-pool the hidden states along the sequence dimension
        pooled_hidden_states = hidden_states.mean(dim=1)
        # project from (batch_size, hidden_size) to (batch_size, num_labels)
        logits = self.score(pooled_hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = [
    "ImageGPTForCausalImageModeling",
    "ImageGPTForImageClassification",
    "ImageGPTModel",
    "ImageGPTPreTrainedModel",
    "load_tf_weights_in_imagegpt",
]