o
    	۷i~                     @   sj  d Z ddlmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ e e!Z"dd Z#dd Z$d'ddZ%G dd dej&Z'dd Z(G dd dej&Z)eG dd deZ*eG dd de*Z+edd G d!d" d"e*eZ,ed#d G d$d% d%e*Z-g d&Z.dS )(zPyTorch CTRL model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
CTRLConfigc                 C   s$   dt dd|d  |  }| | S )Nr   i'     )torchpow)posid_model_sizeangle_rates r   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defn(   s   r    c                 C   s   t tj| tjd|dtj|tjd|d|}t|d d dd df }t|d d dd df }tj||gdd}|S )Ndtyper   r   r   dim)	r    r   arangeint64to	unsqueezesincoscat)positionr   r"   
angle_radssinescosinespos_encodingr   r   r   positional_encoding-   s   r2   c              	   C   s   t | |dddd}|jd }|t| }|d ur7|d|d}	}
|||
|	 |
d |
f d 7 }|d ur?|| }t j|dd}|d urN|| }t ||}||fS )	Nr   r   r   r   r#   g     r$   )r   matmulpermuteshapenpsqrtsizesoftmax)qkvmaskattention_mask	head_mask	matmul_qkdkscaled_attention_logitsndnsattention_weightsoutputr   r   r   scaled_dot_product_attention<   s   
 rH   c                       sD   e Zd Zd fdd	Zdd Zdd Z						dd	d
Z  ZS )MultiHeadAttentionNc                    sp   t    || _|| _|| _t|| j | _t||| _	t||| _
t||| _t||| _t | _d S N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdensesetpruned_heads)selfr   rM   rN   	__class__r   r   rL   W   s   
zMultiHeadAttention.__init__c                 C   s   | j | j }t|dkrd S t|| j|| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt| | _|| j | _ | j
|| _d S )Nr   r   r$   )r   rM   lenr   rW   r   rR   rS   rT   rU   union)rX   headsattention_head_sizeindexr   r   r   prune_headsf   s   zMultiHeadAttention.prune_headsc                 C   s"   | |d| j| j}|g dS )Nr#   r   r   r   r   )reshaperM   rP   r5   )rX   x
batch_sizer   r   r   split_into_headsw   s   z#MultiHeadAttention.split_into_headsFc                 C   s   |j d }| |}| |}| |}| ||}| ||}| ||}|d ur7|||| jd|
i\}}t||||||}|d g d}|d }|	|d| j
}| |}||fS )Nr   cache_positionra   r   r#   )r6   rR   rS   rT   re   updaterN   rH   r5   rb   r   rU   )rX   r=   r<   r;   r>   
layer_pastr?   r@   	use_cacheoutput_attentionsrf   rd   rG   scaled_attentionattnoriginal_size_attentionr   r   r   forward{   s   




zMultiHeadAttention.forwardrJ   NNNFFN)__name__
__module____qualname__rL   r`   re   rn   __classcell__r   r   rY   r   rI   V   s    
rI   c                 C   s"   t t | |t  t || S rJ   )r   
SequentialrQ   ReLU)r   dffr   r   r   point_wise_feed_forward_network   s   "rw   c                       s4   e Zd Zd fdd	Z						d	ddZ  ZS )
EncoderLayer皙?Nc                    sb   t    t|||d| _t||| _tj|dd| _tj|dd| _	t
|| _t
|| _d S )NrN   gư>eps)rK   rL   rI   multi_head_attentionrw   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)rX   r   rM   rv   raterN   rY   r   r   rL      s   
zEncoderLayer.__init__Fc	                 C   s~   |  |}	| j|	|	|	|||||||d
}
|
d }| |}|| }| |}| |}| |}|| }|f|
dd   }|S )Nrh   r?   r@   ri   rj   rf   r   r   )r   r}   r   r   r~   r   )rX   rc   r>   rh   r?   r@   ri   rj   rf   normedattn_outputsattn_outputout1out2
ffn_outputoutputsr   r   r   rn      s,   




zEncoderLayer.forward)ry   Nro   )rp   rq   rr   rL   rn   rs   r   r   rY   r   rx      s    rx   c                   @   s"   e Zd ZU eed< dZdd ZdS )CTRLPreTrainedModelconfigtransformerc                 C   s   t |tjtfr"|jjjd| jjd |j	dur |j	j
  dS dS t |tjrE|jjjd| jjd |jdurC|jj|j 
  dS dS t |tjrZ|j	j
  |jjd dS dS )zInitialize the weights.g        )meanstdN      ?)
isinstancer   rQ   r   weightdatanormal_r   initializer_rangebiaszero_	Embeddingpadding_idxr   fill_)rX   moduler   r   r   _init_weights   s   

z!CTRLPreTrainedModel._init_weightsN)rp   rq   rr   r   __annotations__base_model_prefixr   r   r   r   r   r      s   
 r   c                       s   e Zd Z fddZdd Zdd Zdd Ze																								dd
ee	j
 dee dee	j dee	j
 dee	j
 dee	j dee	j dee dee dee dee dee	j deee	j ef fddZ  ZS )	CTRLModelc                    s   t     j| _ j| _t j| jtj	| _
t j j| _t j| _t fddt jD | _tj j jd| _|   d S )Nc              	      s&   g | ]}t  j j j j|d qS )rz   )rx   n_embdn_headrv   resid_pdrop).0r   r   r   r   
<listcomp>   s    z&CTRLModel.__init__.<locals>.<listcomp>r{   )rK   rL   r   r   n_layer
num_layersr2   n_positionsr   floatr1   r   r   
vocab_sizewr   
embd_pdropdropout
ModuleListrangehr   layer_norm_epsilon	layernorm	post_initrX   r   rY   r   r   rL      s   
zCTRLModel.__init__c                 C   s   | j S rJ   r   )rX   r   r   r   get_input_embeddings  s   zCTRLModel.get_input_embeddingsc                 C   s
   || _ d S rJ   r   )rX   new_embeddingsr   r   r   set_input_embeddings  s   
zCTRLModel.set_input_embeddingsc                 C   s(   |  D ]\}}| j| j| qdS )zv
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsr   r}   r`   )rX   heads_to_prunelayerr]   r   r   r   _prune_heads  s   zCTRLModel._prune_headsN	input_idspast_key_valuesr?   token_type_idsposition_idsr@   inputs_embedsri   rj   output_hidden_statesreturn_dictrf   returnc                 K   sD  |	dur|	n| j j}	|dur|n| j j}|
dur|
n| j j}
|dur$|n| j j}|dur4|dur4td|durP| || | }|d|d }|j	d }n|durb| dd }|j	d }ntd|durm|j
n|j
}|r||du r|t| j d}|rt|trtd t|}|dur| nd}|du rtj||d | tj|d}|d}|dur|dkrtd	||d}|d
d}|j| jd}d| t| jj }| || j j}|dur|d|d }| |}|t| j9 }nd}|du r| |}|d }t t!|| || d
|}|t| j9 }| j"|| _"| j"|ddf }|| | }| #|}|
rEdnd}|	rLdnd}t$| j%D ])\}}|
r_||f }||||||| ||	|d}|d }|	r{||d
 f7 }qS| &|}|
r||f }|stdd ||||fD S t'||||dS )aE  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CTRLModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 5, 1280]
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   r   z5You have to specify either input_ids or inputs_embedsr   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.)r"   devicez$batch_size has to be defined and > 0r   r   r!   r   r   r   c                 s   s    | ]	}|d ur|V  qd S rJ   r   )r   r=   r   r   r   	<genexpr>  s    z$CTRLModel.forward.<locals>.<genexpr>)last_hidden_stater   hidden_states
attentions)(r   rj   ri   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr9   viewr6   r   r
   r   tupleloggerwarning_oncefrom_legacy_cacheget_seq_lengthr   r&   longr)   r(   r"   finfominget_head_maskr   r   r7   r8   r   triuonesr1   r   	enumerater   r   r   )rX   r   r   r?   r   r   r@   r   ri   rj   r   r   rf   kwargsinput_shaperd   r   past_lengthtoken_type_embedsseq_lenr>   
pos_embedsr   all_hidden_statesall_attentionsr   r   r   r   r   r   rn     s   1




"





zCTRLModel.forwardNNNNNNNNNNNN)rp   rq   rr   rL   r   r   r   r   r   r   
LongTensorr	   FloatTensorboolTensorr   r   r   rn   rs   r   r   rY   r   r      sZ    	
r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                        s   e Zd ZdgZ fddZe													ddeej dee	 deej
 deej d	eej d
eej
 deej
 deej dee dee dee dee deej deeej ef fddZdddZ  ZS )CTRLLMHeadModelzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NTr   )
rK   rL   r   r   r   rQ   r   r   lm_headr   r   rY   r   r   rL     s   
zCTRLLMHeadModel.__init__Nr   r   r?   r   r   r@   r   labelsri   rj   r   r   rf   r   c                 K   s   |dur|n| j j}| j||||||||	|
|||d}|d }| |}d}|dur8| j||fd| j ji|}|sN|f|dd  }|durL|f| S |S t|||j|j|j	dS )a
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLLMHeadModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> sequence_ids = model.generate(inputs["input_ids"])
        >>> sequences = tokenizer.batch_decode(sequence_ids)
        >>> sequences
        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> round(outputs.loss.item(), 2)
        9.21

        >>> list(outputs.logits.shape)
        [1, 5, 246534]
        ```N)r   r?   r   r   r@   r   ri   rj   r   r   rf   r   r   r   )losslogitsr   r   r   )
r   r   r   r   loss_functionr   r   r   r   r   )rX   r   r   r?   r   r   r@   r   r   ri   rj   r   r   rf   r   transformer_outputsr   	lm_logitsr   rG   r   r   r   rn     sJ   <
zCTRLLMHeadModel.forwardc           
      K   s   |d ur#|  }|jd |kr|}n|jd d }|d d |d f }|||d}|dd  | D ]\}}	||vrGtd| d |	||< q3|S )Nr   )r   r   ri   r   z	Warning: z is not a recognized input.)r   r6   popr   print)
rX   r   r   ri   r   r   remove_prefix_lengthmodel_inputskeyvaluer   r   r   prepare_inputs_for_generation-  s   z-CTRLLMHeadModel.prepare_inputs_for_generation)NNNNNNNNNNNNNNN)rp   rq   rr   _tied_weights_keysrL   r   r   r   r   r	   r   r   r   r   r   r   rn   r   rs   r   r   rY   r   r     s^    	
er   a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                       s   e Zd Z fddZe												ddeej dee deej	 deej deej d	eej	 d
eej	 deej dee
 dee
 dee
 dee
 deeej ef fddZ  ZS )CTRLForSequenceClassificationc                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S )NFr   )
rK   rL   
num_labelsr   r   r   rQ   r   
classifierr   r   rY   r   r   rL   V  s
   
z&CTRLForSequenceClassification.__init__Nr   r   r?   r   r   r@   r   r   ri   rj   r   r   r   c                 C   sJ  |dur|n| j j}| j||||||||	|
||d}|d }| |}|dur1|jdd \}}n	|jdd \}}| j jdu rH|dkrHtd| j jdu rQd}n1|durv|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d	 |t	j||jd
|f }d}|dur| j jdu r| jdkrd| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jdkrt }| jdkr|| | }n,|||}n&| j jdkrt }||d| j|d}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|jdS )a2  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> import torch

        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> labels = torch.tensor(1)
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)
        0.93
        ```

        Example of multi-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained(
        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
        ... )

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> num_labels = len(model.config.id2label)
        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
        ...     torch.float
        ... )
        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()  # doctest: +IGNORE_RESULT
        ```N)
r   r?   r   r   r@   r   ri   rj   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r#   )r   r"   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   
regressionsingle_label_classificationmulti_label_classification)r   r   r   r   )r   r   r   r   r6   pad_token_idr   r(   r   r   int32r&   argmaxr   r   rZ   rp   problem_typer   r"   r   rO   r   squeezer   r   r   r   r   r   )rX   r   r   r?   r   r   r@   r   r   ri   rj   r   r   r   r   r   rd   sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctrG   r   r   r   rn   _  sx   h



"


z%CTRLForSequenceClassification.forwardr   )rp   rq   rr   rL   r   r   r   r   r	   r   r   r   r   r   r   rn   rs   r   r   rY   r   r   J  sT    		
r   )r   r   r   r   r   )/__doc__typingr   r   numpyr7   r   r   torch.nnr   r   r   cache_utilsr	   r
   
generationr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   configuration_ctrlr   
get_loggerrp   r   r    r2   rH   ModulerI   rw   rx   r   r   r   r   __all__r   r   r   r   <module>   sH   

G2 M  ?