o
    eiW                     @   s<  d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z. G dd dej/Z0dd Z1edd>ddZ2dej3de4dej3fd d!Z5	"d?d#ej/d$ej3d%ej3d&ej3d'ej3dB d(e6d)e6d*e#e% fd+d,Z7ee2G d-d. d.ej/Z8G d/d0 d0eZ9e&G d1d2 d2e!Z:G d3d4 d4ej/Z;e&G d5d6 d6e:Z<e&G d7d8 d8e:eZ=G d9d: d:ee:Z>G d;d< d<ee:Z?g d=Z@dS )@    )Callable)OptionalN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )Starcoder2Configc                       s@   e Zd Zdef fddZdeej dB dejfddZ  Z	S )	Starcoder2MLPconfigc                    sT   t    |j}tj||j|jd| _tj|j||jd| _t	|j
 | _|j| _d S )Nbias)super__init__hidden_sizer   Linearintermediate_sizeuse_biasc_fcc_projr   
hidden_actactresidual_dropout)selfr"   	embed_dim	__class__ p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/starcoder2/modeling_starcoder2.pyr&   6   s   
zStarcoder2MLP.__init__hidden_statesNreturnc                 C   s8   |  |}| |}| |}tjj|| j| jd}|S )Nptraining)r+   r.   r,   r   
functionaldropoutr/   r:   )r0   r6   r4   r4   r5   forward>   s
   


zStarcoder2MLP.forward)
__name__
__module____qualname__r    r&   tupletorchFloatTensorr=   __classcell__r4   r4   r2   r5   r!   5   s    &r!   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..N   dim)shaperB   cat)xx1x2r4   r4   r5   rotate_halfF   s   rN   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerN   )qkcossinunsqueeze_dimq_embedk_embedr4   r4   r5   apply_rotary_pos_embM   s
   

rX   r6   n_repr7   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rI   expandreshape)r6   rY   batchnum_key_value_headsslenhead_dimr4   r4   r5   	repeat_kvg   s
   0r`           modulequerykeyvalueattention_maskscalingr<   kwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )NrF   r   rE   )rH   dtyper8   r   )r`   num_key_value_groupsrB   matmul	transposer   r;   softmaxfloat32tori   r<   r:   
contiguous)rb   rc   rd   re   rf   rg   r<   rh   
key_statesvalue_statesattn_weightsattn_outputr4   r4   r5   eager_attention_forwards   s   
ru   c                       s   e Zd ZdZddededB f fddZ		ddejde	ejejf d	ejdB d
e
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )Starcoder2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNr"   	layer_idxc                    s   t    || _|| _t|dd p|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _|j| _d S )Nr_   g      Tr#   )r%   r&   r"   rw   getattrr'   num_attention_headsr_   r]   rj   rg   attention_dropout	is_causalr   r(   r*   q_projk_projv_projo_projr/   r0   r"   rw   r2   r4   r5   r&      s   
zStarcoder2Attention.__init__r6   position_embeddingsrf   past_key_valuescache_positionrh   r7   c                 K   s:  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jt| jdd d|\}}|jg |dR   }| |}tjj|| j| jd}||fS )	NrE   r   rF   )rT   rS   r   ra   sliding_window)r<   rg   r   r8   )rI   r_   r|   viewrl   r}   r~   rX   updaterw   r   get_interfacer"   _attn_implementationru   r:   rz   rg   rx   r[   rp   r   r   r;   r<   r/   )r0   r6   r   rf   r   r   rh   input_shapehidden_shapequery_statesrq   rr   rS   rT   cache_kwargsattention_interfacert   rs   r4   r4   r5   r=      s@   		


zStarcoder2Attention.forwardN)NN)r>   r?   r@   __doc__r    intr&   rB   TensorrA   r   
LongTensorr   r   r=   rD   r4   r4   r2   r5   rv      s(    rv   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfddZ  ZS )Starcoder2DecoderLayerr"   rw   c                    sV   t    |j| _t||d| _t|| _tj|j|j	d| _
tj|j|j	d| _d S )N)r"   rw   eps)r%   r&   r'   rv   	self_attnr!   mlpr   	LayerNormnorm_epsiloninput_layernormpost_attention_layernormr   r2   r4   r5   r&      s   

zStarcoder2DecoderLayer.__init__NFr6   rf   position_idsr   	use_cacher   r   rh   r7   c              
   K   s^   |}	|  |}| jd|||||||d|\}}
|	| }|}	| |}| |}|	| }|S )N)r6   rf   r   r   r   r   r   r4   )r   r   r   r   )r0   r6   rf   r   r   r   r   r   rh   residual_r4   r4   r5   r=      s&   




zStarcoder2DecoderLayer.forward)NNNFNN)r>   r?   r@   r    r   r&   rB   r   r   r   boolrA   r   r   r=   rD   r4   r4   r2   r5   r      s6    	
r   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )Starcoder2PreTrainedModelr"   modelTr   r   )r6   
attentionsN)r>   r?   r@   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   rv   _can_record_outputsr4   r4   r4   r5   r      s   
 
r   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Starcoder2RotaryEmbeddinginv_freqNr"   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r%   r&   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr"   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r0   r"   devicerope_init_fnr   r2   r4   r5   r&     s   


z"Starcoder2RotaryEmbedding.__init__r   ztorch.deviceseq_lenr7   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetar_   Ng      ?r   rF   ri   )r   ri   )	r   rx   r'   ry   rB   arangeint64ro   float)r"   r   r   baserH   attention_factorr   r4   r4   r5   r   !  s   
&z9Starcoder2RotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   rE   r   mpscpuF)device_typeenabledrF   rG   r   )r   r   rZ   rI   ro   r   
isinstancetypestrr   rl   rB   rJ   rS   r   rT   ri   )
r0   rK   r   inv_freq_expandedposition_ids_expandedr   freqsembrS   rT   r4   r4   r5   r=   ?  s   0&z!Starcoder2RotaryEmbedding.forwardr   )NNN)r>   r?   r@   rB   r   r   r    r&   staticmethodr   r   rA   r   r   no_gradr   r=   rD   r4   r4   r2   r5   r     s&   
 

r   c                       s   e Zd Zdef fddZee							ddejdB dej	dB dejdB de
dB d	ejdB d
edB dejdB dee deeB fddZ  ZS )Starcoder2Modelr"   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _tj j jd| _t d| _d| _ j| _|   d S )Nc                    s   g | ]}t  |qS r4   )r   ).0rw   r"   r4   r5   
<listcomp>X  s    z,Starcoder2Model.__init__.<locals>.<listcomp>r   r   F)r%   r&   pad_token_idpadding_idx
vocab_sizer   	Embeddingr'   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normr   
rotary_embgradient_checkpointingembedding_dropout	post_initr0   r"   r2   r   r5   r&   Q  s   zStarcoder2Model.__init__N	input_idsrf   r   r   inputs_embedsr   r   rh   r7   c              
   K   s6  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}| jj
d u rNtnt}
|
| j|||||d}|}tjj|| j| jd}| j||d}| jd | jj D ]}||f||||||d	|}qx| |}t||r|d
S d d
S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   )r"   r   rf   r   r   r   r8   )r   )rf   r   r   r   r   r   )last_hidden_stater   )
ValueErrorr   r   r"   get_seq_lengthrB   r   rI   r   rP   r   r   r   r   r;   r<   r   r:   r   r   r   r   r   )r0   r   rf   r   r   r   r   r   rh   past_seen_tokensmask_functioncausal_maskr6   r   decoder_layerr4   r4   r5   r=   b  s^   

	

zStarcoder2Model.forward)NNNNNNN)r>   r?   r@   r    r&   r   r   rB   r   r   r   rC   r   r   r   rA   r   r=   rD   r4   r4   r2   r5   r   O  s<    	
r   c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																	
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B de	j
d	B dee	jB dee defddZ  ZS )Starcoder2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr6   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S )NFr#   )
r%   r&   r   r   r   r   r(   r'   r   r   r   r2   r4   r5   r&     s
   
zStarcoder2ForCausalLM.__init__Nr   r   rf   r   r   r   labelsr   r   logits_to_keeprh   r7   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }d}|durB| jd||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Starcoder2ForCausalLM

        >>> model = Starcoder2ForCausalLM.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r   rf   r   r   r   r   r   N)r   r   r   )lossr   r   r6   r   r4   )r   r   r   r   slicer   loss_functionr"   r   r   r   r6   r   )r0   r   rf   r   r   r   r   r   r   r   rh   outputsr6   slice_indicesr   r   r4   r4   r5   r=     s0    zStarcoder2ForCausalLM.forward)	NNNNNNNNr   )r>   r?   r@   _tied_weights_keys_tp_plan_pp_planr&   r   r   rB   r   r   r   rC   r   r   r   r   r   r=   rD   r4   r4   r2   r5   r     sN    		
r   c                   @      e Zd ZdS )#Starcoder2ForSequenceClassificationNr>   r?   r@   r4   r4   r4   r5   r         r   c                   @   r   ) Starcoder2ForTokenClassificationNr   r4   r4   r4   r5   r    r  r  )r   r   r   r   r  )r   )ra   )Acollections.abcr   typingr   rB   r   activationsr   cache_utilsr   r   
generationr	   integrationsr
   r   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_starcoder2r    Moduler!   rN   rX   r   r   r`   r   ru   rv   r   r   r   r   r   r   r  __all__r4   r4   r4   r5   <module>   sn   
C+AUK