o
    eiq                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' e#(e)Z*G dd dej+Z,dd Z-d=ddZ.G dd dej+Z/G dd dej+Z0dej1de2d ej1fd!d"Z3	#d>d$ej+d%ej1d&ej1d'ej1d(ej1dB d)e4d*e4d+ee  fd,d-Z5G d.d/ d/ej+Z6G d0d1 d1eZ7e!G d2d3 d3eZ8e!G d4d5 d5e8Z9G d6d7 d7e8eZ:G d8d9 d9ee8Z;G d:d; d;ee8Z<g d<Z=dS )?zPyTorch StableLM model.    )Callable)OptionalN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocast   )StableLmConfigc                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )StableLmRotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)super__init__max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)selfr   devicerope_init_fnr   	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/stablelm/modeling_stablelm.pyr$   :   s   


z StableLmRotaryEmbedding.__init__r.   ztorch.deviceseq_lenreturnztorch.Tensorc           	      C   st   | j d }| j dd}t| ddp| j| j }t|| }d}d|tjd|dtjdj	|tj
d	|   }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr      dtype)r.   r;   )r(   getgetattrhidden_sizenum_attention_headsinttorcharangeint64tofloat)	r   r.   r4   baser7   r8   dimattention_factorr   r2   r2   r3   r)   J   s   
&z7StableLmRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r   mpscpuF)device_typeenabledr9   rG   r:   )r   rE   expandshaperD   r.   
isinstancetypestrr   	transposerA   catcosr*   sinr;   )
r-   xposition_idsinv_freq_expandedposition_ids_expandedrL   freqsembrV   rW   r2   r2   r3   forwardk   s   0&zStableLmRotaryEmbedding.forwardN)NNN)__name__
__module____qualname__rA   Tensor__annotations__r   r$   staticmethodr   r@   tuplerE   r)   no_gradr   r^   __classcell__r2   r2   r0   r3   r   7   s&   
 

r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrI   r9   rN   )rP   rA   rU   )rX   x1x2r2   r2   r3   rotate_half|   s   rk   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerk   )qkrV   rW   unsqueeze_dimq_embedk_embedr2   r2   r3   apply_rotary_pos_emb   s
   

rr   c                       s$   e Zd Z fddZdd Z  ZS )StableLmMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFbias)r#   r$   r   r>   intermediate_sizer   Linear	gate_projup_proj	down_projr   
hidden_actact_fnr-   r   r0   r2   r3   r$      s   
zStableLmMLP.__init__c                 C   s$   |  | | || | }|S r_   )r{   r}   ry   rz   )r-   rX   r{   r2   r2   r3   r^      s    zStableLmMLP.forward)r`   ra   rb   r$   r^   rh   r2   r2   r0   r3   rs      s    
rs   c                       s.   e Zd Zd fdd	ZdejfddZ  ZS )	StableLmLayerNormPerHeadh㈵>Fc                    s>   t    | _|| _t fddt| jD | _d S )Nc                    s   g | ]
}t j d qS ))epsrv   )r   	LayerNorm).0_rv   rG   r   r2   r3   
<listcomp>   s    z5StableLmLayerNormPerHead.__init__.<locals>.<listcomp>)r#   r$   rG   	num_headsr   
ModuleListrangenorms)r-   rG   r   r   rv   r0   r   r3   r$      s   
(z!StableLmLayerNormPerHead.__init__hidden_statesc                 C   s0   t j|ddd}t jdd t| j|D ddS )Nr   rN   c                 S   s   g | ]\}}||qS r2   r2   )r   normr   r2   r2   r3   r      s    z4StableLmLayerNormPerHead.forward.<locals>.<listcomp>)rA   splitrU   zipr   )r-   r   states_per_headsr2   r2   r3   r^      s    z StableLmLayerNormPerHead.forward)r   F)r`   ra   rb   r$   rA   rc   r^   rh   r2   r2   r0   r3   r      s    r   r   n_repr5   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rP   rO   reshape)r   r   batchnum_key_value_headsslenr8   r2   r2   r3   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr9   r   rI   )rG   r;   )ptrainingr   )r   num_key_value_groupsrA   matmulrT   r   
functionalsoftmaxfloat32rD   r;   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr2   r2   r3   eager_attention_forward   s   
r   c                       s   e Zd ZdZddededB f fddZ							ddejd	ejdB d
ej	dB de
dB dededej	dB deejejf dB deejejdB eej dB f fddZ  ZS )StableLmAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNr   	layer_idxc                    sr  t    || _|| _|d u rtd| jj d |j| _|j	| _
| j| j
 | _|j| _| j
| j | _t| j|jd  | _d| _| jd | _| j| j
 | jkr^td| j d| j
 dtj| j| j
| j |jd	| _tj| j| j| j |jd	| _tj| j| j| j |jd	| _tj| j| jd
d	| _|j| _| jrt| j| j
|jd| _t| j| j|jd| _|j | _ d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r7   Tg      z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).ru   Fr   )!r#   r$   r   r   loggerwarning_oncer1   r`   r>   r?   r   r8   r   r   r@   r(   rotary_ndims	is_causalr   
ValueErrorr   rx   use_qkv_biasq_projk_projv_projo_projqk_layernormr   layer_norm_epsq_layernormk_layernormattention_dropoutr-   r   r   r0   r2   r3   r$      s@   

zStableLmAttention.__init__Fr   r   rY   past_key_valuesoutput_attentions	use_cachecache_positionposition_embeddingsr5   c	                 K   s  |  \}
}}| |}| |}| |}||
|| j| jdd}||
|| j| jdd}||
|| j| jdd}| j	rM| 
|}| |}|\}}|dd | jf |d| jd f }}|dd | jf |d| jd f }}t||||\}}tj||fdd}tj||fdd}|d ur||| j|d}|||| j|\}}t| jjt}|| ||||f| jsdn| j| j|d|	\}}||
|d}| |}||fS )	Nr   r9   .rI   rN   )rW   rV   partial_rotation_sizer   r   )r   r   rY   )sizer   r   r   viewr   r8   rT   r   r   r   r   r   rr   rA   rU   updater   r   get_interfacer   _attn_implementationr   r   r   r   r   r   )r-   r   r   rY   r   r   r   r   r   r   bszq_lenr   query_statesr   r   rV   rW   	query_rot
query_passkey_rotkey_passcache_kwargsattention_interfacer   r   r2   r2   r3   r^     s\   




	

zStableLmAttention.forwardr_   NNNFFNN)r`   ra   rb   __doc__r   r@   r$   rA   rc   
LongTensorr   boolrf   r^   rh   r2   r2   r0   r3   r      s:    +	r   c                       s   e Zd Zdedef fddZ							ddejdejdB d	ejdB d
e	dB de
dB de
dB dejdB deejejf dB deejeejejf dB f fddZ  ZS )StableLmDecoderLayerr   r   c                    sx   t    |j| _|j| _t||d| _t|| _tj	|j|j
d| _d | _| js3tj	|j|j
d| _t|j| _d S )N)r   r   )r#   r$   use_parallel_residualr>   r   	self_attnrs   mlpr   r   r   input_layernormpost_attention_layernormDropouthidden_dropoutr   r   r0   r2   r3   r$   Z  s   

zStableLmDecoderLayer.__init__NFr   r   rY   r   r   r   r   r   r5   c	              
   C   s   |}	|  |}| j||||||||d\}
}| jr*| |}| |}|	|
 | }n|	|
 }	| | |	}| |}|	| }|f}|rI||f7 }|S )ao  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
                `[0, config.n_positions - 1]`.

                [What are position IDs?](../glossary#position-ids)
            past_key_values (`Cache`, *optional*):
                cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        )r   r   rY   r   r   r   r   r   )r   r   r   r   r   r   )r-   r   r   rY   r   r   r   r   r   residualself_attn_outputself_attn_weights
mlp_outputoutputsr2   r2   r3   r^   f  s0   $





zStableLmDecoderLayer.forwardr   )r`   ra   rb   r   r@   r$   rA   rc   r   r   r   rf   FloatTensorr^   rh   r2   r2   r0   r3   r   Y  s8    	
r   c                   @   s4   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdS )StableLmPreTrainedModelr   modelTr   r   N)r`   ra   rb   r   rd   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr2   r2   r2   r3   r     s   
 r   c                       s   e Zd ZdZdef fddZee									ddej	dB dej
dB dej	dB d	edB d
ejdB dedB dedB dedB dej	dB defddZ  ZS )StableLmModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`StableLmDecoderLayer`]

    Args:
        config: StableLmConfig
    r   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _tj j jd| _ j| _d| _t| jd| _|   d S )Nc                    s   g | ]}t  |qS r2   )r   )r   r   r   r2   r3   r     s    z*StableLmModel.__init__.<locals>.<listcomp>r   Fr   )r#   r$   pad_token_idpadding_idx
vocab_sizer   	Embeddingr>   embed_tokensr   r   num_hidden_layerslayersr   r   r   r   gradient_checkpointingr   r   
rotary_emb	post_initr~   r0   r   r3   r$     s   zStableLmModel.__init__N	input_idsr   rY   r   inputs_embedsr   r   output_hidden_statesr   r5   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|rE|d u rEt	| j d}|d u rN| 
|}|	d u rj|d urZ| nd}tj|||jd  |jd}	|d u rs|	d}t| j |||	||d}|}| j||d	}|rd
nd }|rd
nd }| jD ]"}|r||f7 }||||||||	|d}|d }|r||d f7 }q| |}|r||f7 }t||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   )r.   )r   r   r   r   r   rY   )rY   r2   )r   rY   r   r   r   r   r   )last_hidden_stater   r   
attentions)r   r   r   r   r   r   r   r   r   r   r   get_seq_lengthrA   rB   rP   r.   rl   r
   r   r   r   r   )r-   r   r   rY   r   r   r   r   r   r   r   past_seen_tokenscausal_maskr   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr2   r2   r3   r^     sz   

	



zStableLmModel.forward)	NNNNNNNNN)r`   ra   rb   r   r   r$   r   r   rA   r   rc   r   r   r   r   r^   rh   r2   r2   r0   r3   r     sF    	
r   c                       s   e Zd ZddiZ fddZee											ddejdB dej	dB d	ejdB d
e
dB dejdB dejdB dedB dedB dedB dejdB deej	B defddZ  ZS )StableLmForCausalLMzlm_head.weightzmodel.embed_tokens.weightc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rt   )
r#   r$   r   r   r   r   rx   r>   lm_headr   r~   r0   r2   r3   r$   =  s
   
zStableLmForCausalLM.__init__Nr   r   r   rY   r   r   labelsr   r   r   r   logits_to_keepr5   c                 K   s   |dur|n| j j}|	dur|	n| j j}	| j||||||||	|
d	}|j}t|tr0t| dn|}| |dd|ddf }d}|durT| j	||fd| j j
i|}t|||j|j|jdS )ui  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, StableLmForCausalLM

        >>> model = StableLmForCausalLM.from_pretrained("adept/persimmon-8b-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-base")

        >>> prompt = "human: Hey, what should I eat for dinner?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        'human: Hey, what should I eat for dinner?\n\ncat: 🐱\n\nhuman: 😐\n\n'
        ```N)	r   r   rY   r   r   r   r   r   r   r   )losslogitsr   r   r   )r   r   r   r   r   rQ   r@   slicer  loss_functionr   r   r   r   r   )r-   r   r   rY   r   r   r  r   r   r   r   r	  r   r   r   slice_indicesr  r
  r2   r2   r3   r^   F  sD   )zStableLmForCausalLM.forward)NNNNNNNNNNr   )r`   ra   rb   _tied_weights_keysr$   r   r   rA   r   rc   r   r   r   r@   r   r^   rh   r2   r2   r0   r3   r  9  sR    		
r  c                   @      e Zd ZdS )!StableLmForSequenceClassificationNr`   ra   rb   r2   r2   r2   r3   r        r  c                   @   r  )StableLmForTokenClassificationNr  r2   r2   r2   r3   r    r  r  )r  r   r   r  r  )r   )r   )>r   collections.abcr   typingr   rA   r   activationsr   cache_utilsr   r   
generationr	   masking_utilsr
   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   configuration_stablelmr   
get_loggerr`   r   Moduler   rk   rr   rs   r   rc   r@   r   rE   r   r   r   r   r   r  r  r  __all__r2   r2   r2   r3   <module>   sh   
E

uYz_