o
    eij                     @   sR  d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ eeZdd Zdd Z d&ddZ!G dd dej"Z#dd Z$G dd dej"Z%eG dd deZ&eG dd de&Z'eddG d d! d!e&eZ(ed"dG d#d$ d$e&Z)g d%Z*dS )'zPyTorch CTRL model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)CacheDynamicCache)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPastSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging   )
CTRLConfigc                 C   s$   dt dd|d  |  }| | S )Nr   i'     )torchpow)posid_model_sizeangle_rates r   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ctrl/modeling_ctrl.py
angle_defn%   s   r   c                 C   s   t tj| tjd|dtj|tjd|d|}t|d d dd df }t|d d dd df }tj||gdd}|S )Ndtyper   r   r   dim)	r   r   arangeint64to	unsqueezesincoscat)positionr   r   
angle_radssinescosinespos_encodingr   r   r   positional_encoding*   s   r.   c              	   C   s   t | |dddd}|jd }|t| }|d ur7|d|d}}	|||	| |	d |	f d 7 }|d ur?|| }t j|dd}
t |
|}||
fS )	Nr   r   r   r   r   g     r    )r   matmulpermuteshapenpsqrtsizesoftmax)qkvmaskattention_mask	matmul_qkdkscaled_attention_logitsndnsattention_weightsoutputr   r   r   scaled_dot_product_attention9   s   
 rC   c                       s:   e Zd Zd	 fdd	Zdd Z					d
ddZ  ZS )MultiHeadAttentionNc                    sh   t    || _|| _|| _t|| j | _t||| _	t||| _
t||| _t||| _d S N)super__init__	num_headsr   	layer_idxintdepthr   LinearWqWkWvdense)selfr   rH   rI   	__class__r   r   rG   P   s   
zMultiHeadAttention.__init__c                 C   s"   | |d| j| j}|g dS )Nr   r   r   r   r   )reshaperH   rK   r1   )rQ   x
batch_sizer   r   r   split_into_heads^   s   z#MultiHeadAttention.split_into_headsFc
                 C   s   |j d }
| |}| |}| |}| ||
}| ||
}| ||
}|d ur7|||| jd|	i\}}t|||||}|d g d}|d }|	|
d| j
}| |}||fS )Nr   cache_positionrT   r   r   )r2   rM   rN   rO   rX   updaterI   rC   r1   rU   r   rP   )rQ   r9   r8   r7   r:   
layer_pastr;   	use_cacheoutput_attentionsrY   rW   rB   scaled_attentionattnoriginal_size_attentionr   r   r   forwardb   s   




zMultiHeadAttention.forwardrE   NNFFN)__name__
__module____qualname__rG   rX   ra   __classcell__r   r   rR   r   rD   O   s    
rD   c                 C   s"   t t | |t  t || S rE   )r   
SequentialrL   ReLU)r   dffr   r   r   point_wise_feed_forward_network   s   "rj   c                       s2   e Zd Zd fdd	Z					d	ddZ  ZS )
EncoderLayer皙?Nc                    sb   t    t|||d| _t||| _tj|dd| _tj|dd| _	t
|| _t
|| _d S )NrI   gư>eps)rF   rG   rD   multi_head_attentionrj   ffnr   	LayerNorm
layernorm1
layernorm2Dropoutdropout1dropout2)rQ   r   rH   ri   raterI   rR   r   r   rG      s   
zEncoderLayer.__init__Fc                 C   s|   |  |}| j|||||||||d	}	|	d }
| |
}
||
 }| |}| |}| |}|| }|f|	dd   }|S )Nr[   r;   r\   r]   rY   r   r   )rs   rp   rv   rt   rq   rw   )rQ   rV   r:   r[   r;   r\   r]   rY   normedattn_outputsattn_outputout1out2
ffn_outputoutputsr   r   r   ra      s*   





zEncoderLayer.forward)rl   Nrb   )rc   rd   re   rG   ra   rf   r   r   rR   r   rk      s    rk   c                       s*   e Zd ZU eed< dZ fddZ  ZS )CTRLPreTrainedModelconfigtransformerc                    s<   t  | t|trt|jt|jj	|j
tj d S d S rE   )rF   _init_weights
isinstance	CTRLModelinitcopy_r-   r.   r   n_positionsr   r   float)rQ   modulerR   r   r   r      s   
z!CTRLPreTrainedModel._init_weights)rc   rd   re   r   __annotations__base_model_prefixr   rf   r   r   rR   r   r      s   
 r   c                       s   e Zd Z fddZdd Zdd Ze											ddejdB d	e	dB d
ej
dB dejdB dejdB dej
dB dedB dedB dedB dedB dejdB deej eB fddZ  ZS )r   c                    s   t     j| _ j| _t j j| _	t
 j| _t fddt jD | _tj j jd| _| jdt j| jtjdd |   d S )Nc              	      s&   g | ]}t  j j j j|d qS )rm   )rk   n_embdn_headri   resid_pdrop).0r   r   r   r   
<listcomp>   s    z&CTRLModel.__init__.<locals>.<listcomp>rn   r-   F)
persistent)rF   rG   r   r   n_layer
num_layersr   	Embedding
vocab_sizewru   
embd_pdropdropout
ModuleListrangehrr   layer_norm_epsilon	layernormregister_bufferr.   r   r   r   	post_initrQ   r   rR   r   r   rG      s   
zCTRLModel.__init__c                 C   s   | j S rE   r   )rQ   r   r   r   get_input_embeddings   s   zCTRLModel.get_input_embeddingsc                 C   s
   || _ d S rE   r   )rQ   new_embeddingsr   r   r   set_input_embeddings   s   
zCTRLModel.set_input_embeddingsN	input_idspast_key_valuesr;   token_type_idsposition_idsinputs_embedsr\   r]   output_hidden_statesreturn_dictrY   returnc              
   K   s
  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	|
dur$|
n| j j}
|dur4|dur4td|durP| || | }|d|d }|j	d }n|durb| dd }|j	d }ntd|durm|j
n|j
}|r||du r|t| j d}|dur| nd}|du rtj||d | tj|d}|d}|dur|dkrtd||d}|d	d
}|j| jd}d| t| jj }|dur|d|d }| |}|t| j9 }nd}|du r| |}|d }tt|| || d	|}|t| j9 }| j|| _| j|ddf }|| | }| |}|	r+dnd}|r2dnd}t| jD ]&\}}|	rE||f }||||||||d}|d }|r^||d	 f7 }q9| |}|	rm||f }|
s}t dd ||||fD S t!||||dS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, CTRLModel
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 5, 1280]
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   r   z5You have to specify either input_ids or inputs_embedsr   )r   devicez$batch_size has to be defined and > 0r   r   r   g      ?r   ry   c                 s   s    | ]	}|d ur|V  qd S rE   r   )r   r9   r   r   r   	<genexpr>h  s    z$CTRLModel.forward.<locals>.<genexpr>)last_hidden_stater   hidden_states
attentions)"r   r]   r\   r   use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr5   viewr2   r   r	   get_seq_lengthr   r"   longr%   r$   r   finfominr   r3   r4   r   triuonesr-   r   	enumerater   r   tupler   )rQ   r   r   r;   r   r   r   r\   r]   r   r   rY   kwargsinput_shaperW   r   past_lengthtoken_type_embedsseq_lenr:   
pos_embedsr   all_hidden_statesall_attentionsr   r   r   r   r   r   ra      s   $


"

	


zCTRLModel.forwardNNNNNNNNNNN)rc   rd   re   rG   r   r   r   r   
LongTensorr   FloatTensorboolTensorr   r   ra   rf   r   r   rR   r   r      sR    	
r   z
    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       s   e Zd ZddiZ fddZe													ddejdB dedB d	ej	dB d
ejdB dejdB dej	dB dejdB de
dB de
dB de
dB de
dB dejdB deejB deej eB fddZ	d fdd	Z  ZS )CTRLLMHeadModelzlm_head.weightztransformer.w.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NTbias)
rF   rG   r   r   r   rL   r   r   lm_headr   r   rR   r   r   rG   }  s   
zCTRLLMHeadModel.__init__Nr   r   r   r;   r   r   r   labelsr\   r]   r   r   rY   logits_to_keepr   c                 K   s   |dur|n| j j}| j||||||||	|
||d}|d }t|tr)t| dn|}| |dd|ddf }d}|durM| j||fd| j ji|}|sc|f|dd  }|dura|f| S |S t	|||j
|j|jdS )ag  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLLMHeadModel

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> sequence_ids = model.generate(inputs["input_ids"])
        >>> sequences = tokenizer.batch_decode(sequence_ids)
        >>> sequences
        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']

        >>> outputs = model(**inputs, labels=inputs["input_ids"])
        >>> round(outputs.loss.item(), 2)
        9.21

        >>> list(outputs.logits.shape)
        [1, 5, 246534]
        ```N)
r   r;   r   r   r   r\   r]   r   r   rY   r   r   r   )losslogitsr   r   r   )r   r   r   r   rJ   slicer   loss_functionr   r   r   r   r   )rQ   r   r   r;   r   r   r   r   r\   r]   r   r   rY   r   r   transformer_outputsr   slice_indicesr   r   rB   r   r   r   ra     sJ   1zCTRLLMHeadModel.forwardFc                    s,   t  j|f|||d|}|dd  |S )N)r   r\   is_first_iterationr   )rF   prepare_inputs_for_generationpop)rQ   r   r   r\   r   r   model_inputsrR   r   r   r     s   	z-CTRLLMHeadModel.prepare_inputs_for_generation)NNNNNNNNNNNNr   )NNF)rc   rd   re   _tied_weights_keysrG   r   r   r   r   r   r   r   rJ   r   r   ra   r   rf   r   r   rR   r   r   t  s`    	
[r   a  
    The CTRL Model transformer with a sequence classification head on top (linear layer).
    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
    value in each row of the batch).
    c                       s   e Zd Z fddZe											ddejdB dedB dejdB dejdB dejdB d	ejdB d
ejdB de	dB de	dB de	dB de	dB de
ej eB fddZ  ZS )CTRLForSequenceClassificationc                    s@   t  | |j| _t|| _tj|j| jdd| _| 	  d S )NFr   )
rF   rG   
num_labelsr   r   r   rL   r   
classifierr   r   rR   r   r   rG     s
   
z&CTRLForSequenceClassification.__init__Nr   r   r;   r   r   r   r   r\   r]   r   r   r   c                 K   sH  |dur|n| j j}| j||||||||	|
|d
}|d }| |}|dur0|jdd \}}n	|jdd \}}| j jdu rG|dkrGtd| j jdu rPd}n1|duru|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d	 |t	j||jd
|f }d}|dur| j jdu r| jdkrd| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jdkrt }| jdkr|| | }n,|||}n&| j jdkrt }||d| j|d}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> import torch

        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> labels = torch.tensor(1)
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)
        0.93
        ```

        Example of multi-label classification:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification

        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
        >>> model = CTRLForSequenceClassification.from_pretrained(
        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
        ... )

        >>> # CTRL was trained with control codes as the first token
        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax().item()
        >>> model.config.id2label[predicted_class_id]
        'LABEL_0'
        ```

        ```python
        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
        >>> num_labels = len(model.config.id2label)
        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)

        >>> num_labels = len(model.config.id2label)
        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
        ...     torch.float
        ... )
        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()  # doctest: +IGNORE_RESULT
        ```N)	r   r;   r   r   r   r\   r]   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`)r   
regressionsingle_label_classificationmulti_label_classification)r   r   r   r   )r   r   r   r   r2   pad_token_idr   r$   r   r   int32r"   argmaxloggerwarning_oncerS   rc   problem_typer   r   r   rJ   r   squeezer   r   r   r   r   r   )rQ   r   r   r;   r   r   r   r   r\   r]   r   r   r   r   r   r   rW   sequence_lengthlast_non_pad_tokennon_pad_masktoken_indicespooled_logitsr   loss_fctrB   r   r   r   ra     sv   ]



"


z%CTRLForSequenceClassification.forwardr   )rc   rd   re   rG   r   r   r   r   r   r   r   r   r   ra   rf   r   r   rR   r   r     sN    		
r   )r   r   r   r   rE   )+__doc__numpyr3   r   r   torch.nnr   r   r    r   r   cache_utilsr   r	   
generationr
   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   configuration_ctrlr   
get_loggerrc   r   r   r.   rC   ModulerD   rj   rk   r   r   r   r   __all__r   r   r   r   <module>   sD   

40 0y 3