o
    ¾e¦iÐW  ã                   @   s<  d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z. G dd„ dej/ƒZ0dd„ Z1edƒd>dd„ƒZ2dej3de4dej3fd d!„Z5	"d?d#ej/d$ej3d%ej3d&ej3d'ej3dB d(e6d)e6d*e#e% fd+d,„Z7ee2ƒG d-d.„ d.ej/ƒƒZ8G d/d0„ d0eƒZ9e&G d1d2„ d2e!ƒƒZ:G d3d4„ d4ej/ƒZ;e&G d5d6„ d6e:ƒƒZ<e&G d7d8„ d8e:eƒƒZ=G d9d:„ d:ee:ƒZ>G d;d<„ d<ee:ƒZ?g d=¢Z@dS )@é    )ÚCallable)ÚOptionalN)Únné   )ÚACT2FN)ÚCacheÚDynamicCache)ÚGenerationMixin)Úuse_kernel_func_from_hubÚuse_kernelized_func)Úcreate_causal_maskÚ!create_sliding_window_causal_mask)ÚFlashAttentionKwargs)Ú GenericForSequenceClassificationÚGenericForTokenClassificationÚGradientCheckpointingLayer)ÚBaseModelOutputWithPastÚCausalLMOutputWithPast)ÚROPE_INIT_FUNCTIONSÚdynamic_rope_update)ÚALL_ATTENTION_FUNCTIONSÚPreTrainedModel)ÚUnpack)ÚTransformersKwargsÚauto_docstringÚcan_return_tuple)Úmaybe_autocastÚmerge_with_config_defaults)Úcapture_outputsé   )ÚStarcoder2Configc                       s@   e Zd Zdef‡ fdd„Zdeej dB dejfdd„Z‡  Z	S )	ÚStarcoder2MLPÚconfigc                    sT   t ƒ  ¡  |j}tj||j|jd| _tj|j||jd| _t	|j
 | _|j| _d S )N©Úbias)ÚsuperÚ__init__Úhidden_sizer   ÚLinearÚintermediate_sizeÚuse_biasÚc_fcÚc_projr   Ú
hidden_actÚactÚresidual_dropout)Úselfr"   Ú	embed_dim©Ú	__class__© úp/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/starcoder2/modeling_starcoder2.pyr&   6   s   
zStarcoder2MLP.__init__Úhidden_statesNÚreturnc                 C   s8   |   |¡}|  |¡}|  |¡}tjj|| j| jd}|S )N©ÚpÚtraining)r+   r.   r,   r   Ú
functionalÚdropoutr/   r:   )r0   r6   r4   r4   r5   Úforward>   s
   


zStarcoder2MLP.forward)
Ú__name__Ú
__module__Ú__qualname__r    r&   ÚtupleÚtorchÚFloatTensorr=   Ú__classcell__r4   r4   r2   r5   r!   5   s    &r!   c                 C   sH   | dd| j d d …f }| d| j d d d…f }tj| |fddS )z*Rotates half the hidden dims of the input..Néÿÿÿÿé   ©Údim)ÚshaperB   Úcat)ÚxÚx1Úx2r4   r4   r5   Úrotate_halfF   s   rN   Úrotary_pos_embc                 C   sD   |  |¡}|  |¡}| | t| ƒ|  }|| t|ƒ|  }||fS )a…  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )Ú	unsqueezerN   )ÚqÚkÚcosÚsinÚunsqueeze_dimÚq_embedÚk_embedr4   r4   r5   Úapply_rotary_pos_embM   s
   

rX   r6   Ún_repr7   c                 C   s^   | j \}}}}|dkr| S | dd…dd…ddd…dd…f  |||||¡} |  ||| ||¡S )zÔ
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rI   ÚexpandÚreshape)r6   rY   ÚbatchÚnum_key_value_headsÚslenÚhead_dimr4   r4   r5   Ú	repeat_kvg   s
   0r`   ç        ÚmoduleÚqueryÚkeyÚvalueÚattention_maskÚscalingr<   Úkwargsc                 K   s”   t || jƒ}t || jƒ}	t || dd¡¡| }
|d ur |
| }
tjj|
dtjd 	|j
¡}
tjj|
|| jd}
t |
|	¡}| dd¡ ¡ }||
fS )NrF   r   rE   )rH   Údtyper8   r   )r`   Únum_key_value_groupsrB   ÚmatmulÚ	transposer   r;   ÚsoftmaxÚfloat32Útori   r<   r:   Ú
contiguous)rb   rc   rd   re   rf   rg   r<   rh   Ú
key_statesÚvalue_statesÚattn_weightsÚattn_outputr4   r4   r5   Úeager_attention_forwards   s   
ru   c                       sš   e Zd ZdZddededB f‡ fdd„Z		ddejde	ejejf d	ejdB d
e
dB dejdB dee de	ejejdB e	ej dB f fdd„Z‡  ZS )ÚStarcoder2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr"   Ú	layer_idxc                    sÜ   t ƒ  ¡  || _|| _t|dd ƒp|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _|j| _d S )Nr_   g      à¿Tr#   )r%   r&   r"   rw   Úgetattrr'   Únum_attention_headsr_   r]   rj   rg   Úattention_dropoutÚ	is_causalr   r(   r*   Úq_projÚk_projÚv_projÚo_projr/   ©r0   r"   rw   r2   r4   r5   r&      s   
zStarcoder2Attention.__init__r6   Úposition_embeddingsrf   Úpast_key_valuesÚcache_positionrh   r7   c                 K   s:  |j d d… }g |¢d‘| j‘R }|  |¡ |¡ dd¡}	|  |¡ |¡ dd¡}
|  |¡ |¡ dd¡}|\}}t|	|
||ƒ\}	}
|d urW|||dœ}| |
|| j	|¡\}
}t
 | jjt¡}|| |	|
||f| jskdn| j| jt| jdd ƒdœ|¤Ž\}}|jg |¢d‘R Ž  ¡ }|  |¡}tjj|| j| jd}||fS )	NrE   r   rF   )rT   rS   rƒ   ra   Úsliding_window)r<   rg   r„   r8   )rI   r_   r|   Úviewrl   r}   r~   rX   Úupdaterw   r   Úget_interfacer"   Ú_attn_implementationru   r:   rz   rg   rx   r[   rp   r   r   r;   r<   r/   )r0   r6   r   rf   r‚   rƒ   rh   Úinput_shapeÚhidden_shapeÚquery_statesrq   rr   rS   rT   Úcache_kwargsÚattention_interfacert   rs   r4   r4   r5   r=   Ÿ   s@   	ÿûø	
÷

ÿzStarcoder2Attention.forward©N)NN)r>   r?   r@   Ú__doc__r    Úintr&   rB   ÚTensorrA   r   Ú
LongTensorr   r   r=   rD   r4   r4   r2   r5   rv   Œ   s(    úþýüûúùørv   c                       s”   e Zd Zdedef‡ fdd„Z						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfdd„Z‡  ZS )ÚStarcoder2DecoderLayerr"   rw   c                    sV   t ƒ  ¡  |j| _t||d| _t|ƒ| _tj|j|j	d| _
tj|j|j	d| _d S )N)r"   rw   ©Úeps)r%   r&   r'   rv   Ú	self_attnr!   Úmlpr   Ú	LayerNormÚnorm_epsilonÚinput_layernormÚpost_attention_layernormr€   r2   r4   r5   r&   Ñ   s   

zStarcoder2DecoderLayer.__init__NFr6   rf   Úposition_idsr‚   Ú	use_cacherƒ   r   rh   r7   c              
   K   s^   |}	|   |¡}| jd|||||||dœ|¤Ž\}}
|	| }|}	|  |¡}|  |¡}|	| }|S )N)r6   rf   rœ   r‚   r   rƒ   r   r4   )rš   r–   r›   r—   )r0   r6   rf   rœ   r‚   r   rƒ   r   rh   ÚresidualÚ_r4   r4   r5   r=   Ù   s&   
ù
ø


zStarcoder2DecoderLayer.forward)NNNFNN)r>   r?   r@   r    r   r&   rB   r‘   r’   r   ÚboolrA   r   r   r=   rD   r4   r4   r2   r5   r“   Ð   s6    øþýüûúùø	÷
ör“   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedœZdS )ÚStarcoder2PreTrainedModelr"   ÚmodelTr“   r‚   )r6   Ú
attentionsN)r>   r?   r@   r    Ú__annotations__Úbase_model_prefixÚsupports_gradient_checkpointingÚ_no_split_modulesÚ_skip_keys_device_placementÚ_supports_flash_attnÚ_supports_sdpaÚ_supports_flex_attnÚ_can_compile_fullgraphÚ_supports_attention_backendr“   rv   Ú_can_record_outputsr4   r4   r4   r5   r¡   û   s   
 
þr¡   c                       s~   e Zd ZU ejed< ddef‡ fdd„Ze			ddedB de	d de
dB d	ed
ef fdd„ƒZe ¡ edd„ ƒƒZ‡  ZS )ÚStarcoder2RotaryEmbeddingÚinv_freqNr"   c                    s‚   t ƒ  ¡  |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|ƒ\}| _
| jd|dd | jd| ¡ dd d S )NÚ	rope_typeÚdefaultr°   F)Ú
persistentÚoriginal_inv_freq)r%   r&   Úmax_position_embeddingsÚmax_seq_len_cachedÚoriginal_max_seq_lenr"   Úrope_parametersr±   Úcompute_default_rope_parametersr   Úattention_scalingÚregister_bufferÚclone)r0   r"   ÚdeviceÚrope_init_fnr°   r2   r4   r5   r&     s   


z"Starcoder2RotaryEmbedding.__init__r½   ztorch.deviceÚseq_lenr7   ztorch.Tensorc                 C   sZ   | j d }t| ddƒp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a¨  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        Ú
rope_thetar_   Ng      ð?r   rF   ©ri   )r½   ri   )	r¸   rx   r'   ry   rB   ÚarangeÚint64ro   Úfloat)r"   r½   r¿   ÚbaserH   Úattention_factorr°   r4   r4   r5   r¹   !  s   
&ÿz9Starcoder2RotaryEmbedding.compute_default_rope_parametersc           
      C   sþ   | j d d d …d f  ¡  |jd dd¡ |j¡}|d d …d d d …f  ¡ }t|jjtƒr6|jjdkr6|jjnd}t	|dd+ | ¡ | ¡   
dd¡}tj||fdd	}| ¡ | j }| ¡ | j }	W d   ƒ n1 slw   Y  |j|jd
|	j|jd
fS )Nr   rE   r   ÚmpsÚcpuF)Údevice_typeÚenabledrF   rG   rÁ   )r°   rÄ   rZ   rI   ro   r½   Ú
isinstanceÚtypeÚstrr   rl   rB   rJ   rS   rº   rT   ri   )
r0   rK   rœ   Úinv_freq_expandedÚposition_ids_expandedrÉ   ÚfreqsÚembrS   rT   r4   r4   r5   r=   ?  s   0&üz!Starcoder2RotaryEmbedding.forwardrŽ   )NNN)r>   r?   r@   rB   r‘   r¤   r    r&   Ústaticmethodr   r   rA   rÄ   r¹   Úno_gradr   r=   rD   r4   r4   r2   r5   r¯     s&   
 
ýÿþý
ür¯   c                       s–   e Zd Zdef‡ fdd„Zee							ddejdB dej	dB dejdB de
dB d	ejdB d
edB dejdB dee deeB fdd„ƒƒZ‡  ZS )ÚStarcoder2Modelr"   c                    sŒ   t ƒ  ˆ ¡ ˆ j| _ˆ j| _t ˆ jˆ j| j¡| _t 	‡ fdd„t
ˆ jƒD ƒ¡| _tjˆ jˆ jd| _tˆ d| _d| _ˆ j| _|  ¡  d S )Nc                    s   g | ]}t ˆ |ƒ‘qS r4   )r“   )Ú.0rw   ©r"   r4   r5   Ú
<listcomp>X  s    z,Starcoder2Model.__init__.<locals>.<listcomp>r”   rÖ   F)r%   r&   Úpad_token_idÚpadding_idxÚ
vocab_sizer   Ú	Embeddingr'   Úembed_tokensÚ
ModuleListÚrangeÚnum_hidden_layersÚlayersr˜   r™   Únormr¯   Ú
rotary_embÚgradient_checkpointingÚembedding_dropoutÚ	post_init©r0   r"   r2   rÖ   r5   r&   Q  s   ÿzStarcoder2Model.__init__NÚ	input_idsrf   rœ   r‚   Úinputs_embedsr   rƒ   rh   r7   c              
   K   s6  |d u |d uA rt dƒ‚|d u r|  |¡}|r!|d u r!t| jd}|d u r=|d ur-| ¡ nd}	tj|	|	|jd  |jd}|d u rF| 	d¡}| jj
d u rNtnt}
|
| j|||||d}|}tjj|| j| jd}| j||d}| jd | jj… D ]}||f||||||d	œ|¤Ž}qx|  |¡}t||r—|d
S d d
S )Nz:You must specify exactly one of input_ids or inputs_embedsrÖ   r   r   )r½   )r"   rè   rf   rƒ   r‚   rœ   r8   )rœ   )rf   rœ   r‚   r   rƒ   r   )Úlast_hidden_stater‚   )Ú
ValueErrorrÜ   r   r"   Úget_seq_lengthrB   rÂ   rI   r½   rP   r„   r   r   r   r;   r<   rä   r:   râ   rà   rß   rá   r   )r0   rç   rf   rœ   r‚   rè   r   rƒ   rh   Úpast_seen_tokensÚmask_functionÚcausal_maskr6   r   Údecoder_layerr4   r4   r5   r=   b  s^   
ÿ
ú	
ÿÿùø
þþzStarcoder2Model.forward)NNNNNNN)r>   r?   r@   r    r&   r   r   rB   r’   r‘   r   rC   r    r   r   rA   r   r=   rD   r4   r4   r2   r5   rÔ   O  s<    øþýüûúùø	÷
örÔ   c                       sÄ   e Zd ZddiZddiZddgdgfiZ‡ fdd„Zee																	
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B de	j
d	B dee	jB dee defdd„ƒƒZ‡  ZS )ÚStarcoder2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightÚlm_headÚcolwise_gather_outputr6   Úlogitsc                    s@   t ƒ  |¡ t|ƒ| _|j| _tj|j|jdd| _|  	¡  d S )NFr#   )
r%   r&   rÔ   r¢   rÚ   r   r(   r'   rñ   rå   ræ   r2   r4   r5   r&   «  s
   
zStarcoder2ForCausalLM.__init__Nr   rç   rf   rœ   r‚   rè   Úlabelsr   rƒ   Úlogits_to_keeprh   r7   c
              
   K   sœ   | j d|||||||dœ|
¤Ž}|j}t|	tƒrt|	 dƒn|	}|  |dd…|dd…f ¡}d}|durB| jd||| jjdœ|
¤Ž}t	|||j
|j|jdS )aí  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Starcoder2ForCausalLM

        >>> model = Starcoder2ForCausalLM.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rç   rf   rœ   r‚   rè   r   rƒ   N)ró   rô   rÚ   )Úlossró   r‚   r6   r£   r4   )r¢   ré   rË   r   Úslicerñ   Úloss_functionr"   rÚ   r   r‚   r6   r£   )r0   rç   rf   rœ   r‚   rè   rô   r   rƒ   rõ   rh   Úoutputsr6   Úslice_indicesró   rö   r4   r4   r5   r=   ´  s0    ùøûzStarcoder2ForCausalLM.forward)	NNNNNNNNr   )r>   r?   r@   Ú_tied_weights_keysÚ_tp_planÚ_pp_planr&   r   r   rB   r’   r‘   r   rC   r    r   r   r   r   r=   rD   r4   r4   r2   r5   rð   ¥  sN    	öþýüûúùø	÷
öõôrð   c                   @   ó   e Zd ZdS )Ú#Starcoder2ForSequenceClassificationN©r>   r?   r@   r4   r4   r4   r5   rÿ   ñ  ó    rÿ   c                   @   rþ   )Ú Starcoder2ForTokenClassificationNr   r4   r4   r4   r5   r  õ  r  r  )rð   rÔ   r¡   rÿ   r  )r   )ra   )AÚcollections.abcr   Útypingr   rB   r   Úactivationsr   Úcache_utilsr   r   Ú
generationr	   Úintegrationsr
   r   Úmasking_utilsr   r   Úmodeling_flash_attention_utilsr   Úmodeling_layersr   r   r   Úmodeling_outputsr   r   Úmodeling_rope_utilsr   r   Úmodeling_utilsr   r   Úprocessing_utilsr   Úutilsr   r   r   Úutils.genericr   r   Úutils.output_capturingr   Úconfiguration_starcoder2r    ÚModuler!   rN   rX   r‘   r   r`   rÄ   ru   rv   r“   r¡   r¯   rÔ   rð   rÿ   r  Ú__all__r4   r4   r4   r5   Ú<module>   sn   ùÿþýüûúù
øC+AUK