"""PyTorch OpenAI ImageGPT model."""

import math
from typing import Any

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import Conv1D
from ...utils import auto_docstring, logging, torch_float
from ...utils.generic import maybe_autocast
from .configuration_imagegpt import ImageGPTConfig


logger = logging.get_logger(__name__)


class ImageGPTLayerNorm(nn.Module):
    def __init__(self, hidden_size: tuple[int], eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.Tensor(hidden_size))

    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
        # input is not mean centered: ImageGPT normalizes by the RMS only
        tensor = tensor / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps)
        tensor = tensor * self.weight
        return tensor


class ImageGPTAttention(nn.Module):
    def __init__(self, config, is_cross_attention: bool | None = False, layer_idx: int | None = None):
        super().__init__()
        self.config = config
        max_positions = config.max_position_embeddings
        self.register_buffer(
            "bias",
            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                1, 1, max_positions, max_positions
            ),
            persistent=False,
        )

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.split_size = self.embed_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights
        self.is_cross_attention = is_cross_attention

        # Layer-wise attention scaling, reordering, and upcasting
        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
        self.layer_idx = layer_idx
        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn

        if self.is_cross_attention:
            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
        else:
            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

    def _attn(self, query, key, value, attention_mask=None):
        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        if self.scale_attn_weights:
            attn_weights = attn_weights / torch_float(value.size(-1) ** 0.5)

        # Layer-wise attention scaling
        if self.scale_attn_by_inverse_layer_idx:
            attn_weights = attn_weights / float(self.layer_idx + 1)

        if not self.is_cross_attention:
            # only the "normal" attention layer implements the causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            # needs to be a tensor on the same device/dtype for `torch.where`
            mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- no-op otherwise
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None):
        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
        bsz, num_heads, q_seq_len, dk = query.size()
        _, _, k_seq_len, _ = key.size()

        # Preallocate attn_weights for `baddbmm`
        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)

        # Compute scale factor
        scale_factor = 1.0
        if self.scale_attn_weights:
            scale_factor /= float(value.size(-1)) ** 0.5

        if self.scale_attn_by_inverse_layer_idx:
            scale_factor /= float(self.layer_idx + 1)

        # Upcast (turn off autocast) and reorder (scale K by 1 / root(dk))
        with maybe_autocast(query.device.type, enabled=False):
            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)

        if not self.is_cross_attention:
            # only the "normal" attention layer implements the causal mask
            query_length, key_length = query.size(-2), key.size(-2)
            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
            mask_value = torch.finfo(attn_weights.dtype).min
            mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
            attn_weights = torch.where(causal_mask, attn_weights, mask_value)

        if attention_mask is not None:
            # Apply the attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.Softmax(dim=-1)(attn_weights)

        if attn_weights.dtype != torch.float32:
            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights

    def _split_heads(self, tensor, num_heads, attn_head_size):
        """
        Splits hidden_size dim into attn_head_size and num_heads
        """
        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
        tensor = tensor.view(new_shape)
        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def _merge_heads(self, tensor, num_heads, attn_head_size):
        """
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        """
        tensor = tensor.permute(0, 2, 1, 3).contiguous()
        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
        return tensor.view(new_shape)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        use_cache: bool | None = False,
        output_attentions: bool | None = False,
        cache_position: torch.Tensor | None = None,
    ) -> tuple:
        is_cross_attention = encoder_hidden_states is not None
        bsz, seq_len, _ = hidden_states.shape

        if layer_past is not None:
            if isinstance(layer_past, EncoderDecoderCache):
                is_updated = layer_past.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value states from cache
                    curr_past_key_values = layer_past.cross_attention_cache
                else:
                    curr_past_key_values = layer_past.self_attention_cache
            else:
                curr_past_key_values = layer_past

        current_states = encoder_hidden_states if is_cross_attention else hidden_states

        if is_cross_attention:
            if not hasattr(self, "q_attn"):
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`."
                )
            query = self.q_attn(hidden_states)
            attention_mask = encoder_attention_mask
            if layer_past is not None and is_updated:
                # reuse k, v from the cross-attention cache
                key = curr_past_key_values.layers[self.layer_idx].keys
                value = curr_past_key_values.layers[self.layer_idx].values
            else:
                key, value = self.c_attn(current_states).split(self.split_size, dim=2)
                key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
                value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
        else:
            query, key, value = self.c_attn(current_states).split(self.split_size, dim=2)
            key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if layer_past is not None and (not is_cross_attention or not is_updated):
            # save all key/value states to cache to be re-used for fast auto-regressive generation
            cache_position = cache_position if not is_cross_attention else None
            key, value = curr_past_key_values.update(key, value, self.layer_idx, {"cache_position": cache_position})
            if is_cross_attention:
                layer_past.is_updated[self.layer_idx] = True

        query = query.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if self.reorder_and_upcast_attn:
            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask)
        else:
            attn_output, attn_weights = self._attn(query, key, value, attention_mask)

        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        return attn_output, attn_weights


class ImageGPTMLP(nn.Module):
    def __init__(self, intermediate_size, config):
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = Conv1D(intermediate_size, embed_dim)
        self.c_proj = Conv1D(embed_dim, intermediate_size)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ImageGPTBlock(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        hidden_size = config.hidden_size
        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

        self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = ImageGPTAttention(config, layer_idx=layer_idx)
        self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        if config.add_cross_attention:
            self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx)
            self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)

        self.mlp = ImageGPTMLP(inner_dim, config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_past: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        use_cache: bool | None = False,
        output_attentions: bool | None = False,
        cache_position: torch.Tensor | None = None,
    ) -> tuple:
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attn_output = attn_outputs[0]
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states
            hidden_states = self.ln_cross_attn(hidden_states)
            cross_attn_outputs = self.crossattention(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = residual + attn_output
            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        return (hidden_states,) + outputs


class ImageGPTPreTrainedModel(PreTrainedModel):
    config: ImageGPTConfig
    base_model_prefix = "transformer"
    main_input_name = "input_ids"
    input_modalities = ("image",)
    supports_gradient_checkpointing = True
    _no_split_modules = ["ImageGPTBlock"]

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights."""
        super()._init_weights(module)
        if isinstance(module, ImageGPTBlock):
            for name, p in module.named_parameters():
                if "c_proj" in name and "weight" in name:
                    # Special Scaled Initialization --> there are 2 LayerNorms per Transformer block
                    init.normal_(p, mean=0.0, std=self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
        elif isinstance(module, ImageGPTAttention):
            # re-create the (non-persistent) causal mask buffer
            max_positions = module.config.max_position_embeddings
            module.bias.copy_(
                torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
                    1, 1, max_positions, max_positions
                )
            )


@auto_docstring
class ImageGPTModel(ImageGPTPreTrainedModel):
    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)

        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
        self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        self.wte = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.Tensor | None = None,
        **kwargs: Any,
    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
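            For ImageGPT these are not text tokens: the image processor quantizes each pixel to an index in a fixed
            palette of color clusters, so an image reaches the model as a 1D sequence of cluster ids.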

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
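        >>> # last_hidden_states has shape (batch_size, sequence_length, hidden_size); for
        >>> # openai/imagegpt-small the 32 x 32 input yields a 1024-position sequence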
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + input_shape[-1], device=device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # Self-attention mask: broadcast the 2D mask to
        # [batch_size, num_heads, from_seq_length, to_seq_length] and turn it into an additive bias.
        if attention_mask is not None:
            if batch_size <= 0:
                raise ValueError("batch_size has to be defined and > 0")
            attention_mask = attention_mask.view(batch_size, -1)
            attention_mask = attention_mask[:, None, None, :]
            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.add_cross_attention and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_attention_mask = None

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

        if token_type_ids is not None:
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)

        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)

        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, block in enumerate(self.h):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                past_key_values,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (outputs[2],)

        hidden_states = self.ln_f(hidden_states)
        hidden_states = hidden_states.view(output_shape)

        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "transformer.wte.weight"}

    def __init__(self, config: ImageGPTConfig):
        super().__init__(config)
        self.transformer = ImageGPTModel(config)
        # the start-of-sequence (SOS) token is never predicted, hence vocab_size - 1 output classes
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        use_cache: bool | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        cache_position: torch.Tensor | None = None,
        **kwargs: Any,
    ) -> tuple | CausalLMOutputWithCrossAttentions:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 4 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
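        >>> # `output` has shape (batch_size, 1 + n_positions): the SOS context column plus one
        >>> # color-cluster id per pixel, which is why `output[:, 1:]` is decoded above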
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
            cross_attentions=transformer_outputs.cross_attentions,
        )


@auto_docstring(
    custom_intro="""
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                       s   e Zd Zdef fddZe											ddejdB dedB dejdB dejdB d	ejdB d
ejdB dejdB de	dB de	dB de	dB de	dB de
deeB fddZ  ZS )ImageGPTForImageClassificationr@   c                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S )NFr  )
r   r   
num_labelsr   r   r   r  r  scorer   r   r$   r&   r'   r     s
   
z'ImageGPTForImageClassification.__init__Nr   r   rj   r   r   r   r  r   r   r   r   r   r)   c                 K   s   |dur|n| j j}| j||||||||	|
|d
}|d }|jdd}| |}d}|dur6| ||| j }|sL|f|dd  }|durJ|f| S |S t|||j|j|j	dS )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
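        >>> # logits has shape (batch_size, config.num_labels); the classification head sees the
        >>> # hidden states average-pooled over the whole pixel sequence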
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        # average-pool the hidden states over the sequence dimension
        pooled_hidden_states = hidden_states.mean(dim=1)
        # project from (batch_size, hidden_size) to (batch_size, num_labels)
        logits = self.score(pooled_hidden_states)

        loss = None
        if labels is not None:
            # dispatch to the shared classification/regression loss helper
            # (MSE if num_labels == 1, cross-entropy otherwise)
            loss = self.loss_function(logits, labels, self.config)

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = [
    "ImageGPTForCausalImageModeling",
    "ImageGPTForImageClassification",
    "ImageGPTModel",
    "ImageGPTPreTrainedModel",
]