"""
Adapted from https://github.com/karpathy/minGPT

Full definition of a GPT Language Model, all of it in this single file.
References:
1) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
2) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers
        /models/gpt2/modeling_gpt2.py
"""
import math
from dataclasses import dataclass
from typing import Tuple

import torch
import torch.nn as nn
from torch.nn import functional as F

from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.deprecation import Deprecated


@DeveloperAPI
@dataclass
class GPTConfig:
    # block size must be provided
    block_size: int

    # transformer config
    n_layer: int = 12
    n_head: int = 12
    n_embed: int = 768

    # dropout config
    embed_pdrop: float = 0.1
    resid_pdrop: float = 0.1
    attn_pdrop: float = 0.1


@Deprecated(error=False)
class NewGELU(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT
    repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper:
    https://arxiv.org/abs/1606.08415
    """

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1.0
                + torch.tanh(
                    math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
                )
            )
        )


@Deprecated(error=False)
class CausalSelfAttention(nn.Module):
    """
    Vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention here but I am including an
    explicit implementation here to show that there is nothing too scary here.
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        assert config.n_embed % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed)
        # output projection
        self.c_proj = nn.Linear(config.n_embed, config.n_embed)
        # regularization
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        # causal mask to ensure that attention is only applied to the left
        # in the input sequence
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            ),
        )
        self.n_head = config.n_head
        self.n_embed = config.n_embed

    def forward(self, x, attention_masks=None):
        # batch size, sequence length, embedding dimensionality (n_embed)
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head
        # forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embed, dim=2)
        # (B, nh, T, hs)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # causal self-attention:
        # (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        if attention_masks is not None:
            att = att + attention_masks
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = att @ v
        # re-assemble all head outputs side by side
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y, att


@Deprecated(error=False)
class Block(nn.Module):
    """an unassuming Transformer block"""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embed)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embed)
        self.mlp = nn.ModuleDict(
            dict(
                c_fc=nn.Linear(config.n_embed, 4 * config.n_embed),
                c_proj=nn.Linear(4 * config.n_embed, config.n_embed),
                act=NewGELU(),
                dropout=nn.Dropout(config.resid_pdrop),
            )
        )

    def forward(self, x, attention_masks=None):
        # multi-head self-attention sub-block with residual connection
        x_att, att = self.attn(self.ln_1(x), attention_masks=attention_masks)
        x = x + x_att
        # position-wise MLP sub-block with residual connection
        x_ffn = self.mlp.dropout(
            self.mlp.c_proj(self.mlp.act(self.mlp.c_fc(self.ln_2(x))))
        )
        x = x + x_ffn
        return x, att


@Deprecated(error=False)
def configure_gpt_optimizer(
    model: nn.Module,
    learning_rate: float,
    weight_decay: float,
    betas: Tuple[float, float] = (0.9, 0.95),
    **kwargs,
) -> torch.optim.Optimizer:
    """
    This long function is unfortunately doing something very simple and is
    being very defensive: We are separating out all parameters of the model
    into two buckets: those that will experience weight decay for regularization
    and those that won't (biases, and layernorm/embedding weights). We are then
    returning the PyTorch optimizer object.
    """
    # separate out all parameters to those that will and won't experience
    # regularizing weight decay
    decay = set()
    no_decay = set()
    whitelist_w_modules = (torch.nn.Linear,)
    blacklist_w_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
    for mn, m in model.named_modules():
        for pn, p in m.named_parameters():
            fpn = "%s.%s" % (mn, pn) if mn else pn  # full param name
            if pn.endswith("bias"):
                # all biases will not be decayed
                no_decay.add(fpn)
            elif pn.endswith("weight") and isinstance(m, whitelist_w_modules):
                # weights of whitelisted modules will be weight decayed
                decay.add(fpn)
            elif pn.endswith("weight") and isinstance(m, blacklist_w_modules):
                # weights of blacklisted modules will NOT be weight decayed
                no_decay.add(fpn)

    # validate that we considered every parameter
    param_dict = {pn: p for pn, p in model.named_parameters()}
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert (
        len(inter_params) == 0
    ), f"parameters {str(inter_params)} made it into both decay/no_decay sets!"
    assert len(param_dict.keys() - union_params) == 0, (
        f"parameters {str(param_dict.keys() - union_params)} were not "
        f"separated into either decay/no_decay set!"
    )

    # create the pytorch optimizer object
    optim_groups = [
        {
            "params": [param_dict[pn] for pn in sorted(list(decay))],
            "weight_decay": weight_decay,
        },
        {
            "params": [param_dict[pn] for pn in sorted(list(no_decay))],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(
        optim_groups, lr=learning_rate, betas=betas, **kwargs
    )
    return optimizer


@Deprecated(error=False)
class GPT(nn.Module):
    """GPT Transformer Model"""

    def __init__(self, config: GPTConfig):
        super().__init__()
        assert config.block_size is not None
        self.block_size = config.block_size

        self.transformer = nn.ModuleDict(
            dict(
                drop=nn.Dropout(config.embed_pdrop),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=nn.LayerNorm(config.n_embed),
            )
        )

        # init all weights, and apply a special scaled init to the residual
        # projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith("c_proj.weight"):
                torch.nn.init.normal_(
                    p, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer)
                )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, input_embeds, attention_masks=None, return_attentions=False):
        """
        input_embeds: [batch_size x seq_len x n_embed]
        attention_masks: [batch_size x seq_len], 0 don't attend, 1 attend
        """
        B, T, C = input_embeds.size()
        assert T <= self.block_size, (
            f"Cannot forward sequence of length {T}, "
            f"block size is only {self.block_size}"
        )

        if attention_masks is not None:
            _B, _T = attention_masks.size()
            assert _B == B and _T == T
            # make the mask broadcastable over heads and query positions:
            # [batch_size x 1 x 1 x seq_len]
            attention_masks = attention_masks[:, None, None, :]
            # the incoming mask is 1.0 for positions to attend and 0.0 for
            # masked positions; convert it to an additive mask that is 0.0
            # where we attend and -1e9 where we don't
            attention_masks = attention_masks.to(dtype=input_embeds.dtype)
            attention_masks = (1.0 - attention_masks) * -1e9

        # forward the GPT model itself
        x = self.transformer.drop(input_embeds)

        atts = []
        for block in self.transformer.h:
            x, att = block(x, attention_masks=attention_masks)
            atts.append(att)
        x = self.transformer.ln_f(x)

        if return_attentions:
            return x, atts
        else:
            return x
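

# The snippet below is an illustrative usage sketch, not part of the original
# module: the configuration values, batch size, and dummy random inputs are
# assumptions chosen for demonstration. It wires GPTConfig, GPT, and
# configure_gpt_optimizer together to show the expected data flow
# (pre-computed token embeddings in -> contextual embeddings out).
if __name__ == "__main__":
    cfg = GPTConfig(block_size=16, n_layer=2, n_head=2, n_embed=32)
    model = GPT(cfg)
    optimizer = configure_gpt_optimizer(
        model, learning_rate=1e-4, weight_decay=0.1, betas=(0.9, 0.95)
    )

    # The model consumes embeddings of shape [batch_size, seq_len, n_embed];
    # seq_len must not exceed cfg.block_size.
    input_embeds = torch.randn(4, 16, cfg.n_embed)
    attention_masks = torch.ones(4, 16)  # 1 = attend, 0 = don't attend

    out = model(input_embeds, attention_masks=attention_masks)
    print(out.shape)  # -> torch.Size([4, 16, 32])

    # A dummy optimization step over a scalar loss.
    loss = out.mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()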