o
    eiZ1                     @   s&  d dl Z ddlmZmZ ddlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZmZ d	d
lmZ d	dlmZmZmZmZ d	dlmZmZ eeZG dd deZG dd deZG dd deZG dd deZ G dd deZ!G dd de
Z"G dd deZ#G dd deZ$g dZ%dS )    N   )CacheDynamicCache)layer_type_validation)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)Unpack)TransformersKwargslogging   )LlamaConfig)LlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel)Qwen2AttentionQwen2RotaryEmbeddingc                ,       s   e Zd ZdZdZdZdddddd	d
ddddddg dddddddddfdededededededededed ed!ed"e	d#edB d$ed%e	d&ed'ed(e	d)e
dB d*ed+ee dB f* fd,d-Z  ZS ).	CwmConfiga  
    Configuration for Code World Model (CWM).
    This is an inherited Llama3-compatible configuration with layer-interleaved
    sliding-window attention. Configures a `CwmModel`. Designed to yield a configuration mirroring the model in the
    [facebook/cwm](https://huggingface.co/facebook/cwm) architecture by default. Other models include:
    - [facebook/cwm-sft](https://huggingface.co/facebook/cwm-sft)
    - [facebook/cwm-pretrain](https://huggingface.co/facebook/cwm-pretrain)

    Args:
        vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the CWM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CwmModel`]
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations
        num_hidden_layers (`int`, *optional*, defaults to 64):
            Number of hidden layers in the Transformer decoder
        num_attention_heads (`int`, *optional*, defaults to 48):
            Number of attention heads for each attention layer in the Transformer decoder
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention (GQA).
            If it is not specified, will default to `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with. CWM's attention allows sequence
            lengths up to 131072 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        eos_token_id (`int` or `list[int]`, *optional*, defaults to `[128001, 128008, 128009]`):
            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
        bos_token_id (`int`, *optional*, defaults to 128000):
            The id of the *beginning-of-sequence* token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Tensor parallelism degree used during pretraining. See [this
            document](https://huggingface.co/docs/transformers/parallelism) and [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        sliding_window (`int`, *optional*, defaults to 8192):
            Sliding window attention window size.
        layer_types (`List[str]`, *optional*):
            List of layer types for each layer. Each element should be either "full_attention" or "sliding_attention".
            If not specified, will default to alternating pattern based on the provided window pattern.
    cwm    .Ai  i   i T  @   0         silui   g{Gz?gh㈵>TN)i i i	 i  Fg               
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dim
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_idtie_word_embeddingsattention_dropoutpretraining_tpmlp_biasrope_parameterssliding_windowlayer_typesc                    s   |d u rddddddd}|d u rd  fd	d
t |D }nt|| |r*t|nd | _t|| _t jd!i d|d|d|d|d|d|d|d|d|	d|
d|d|d|dt|d|d|ddd|d|d|d || | `d S )"Nr   g      0@g      @g      ?r   llama3)
rope_thetafactorhigh_freq_factorlow_freq_factor original_max_position_embeddings	rope_type   c                    s    g | ]}|  d krdndqS )r   full_attentionsliding_attention ).0iwindow_patternr=   a/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/cwm/modular_cwm.py
<listcomp>   s    z&CwmConfig.__init__.<locals>.<listcomp>r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   eos_token_idr+   r,   attention_biasFr-   r0   r.   r/   r=   )	ranger   intr1   listr2   super__init__rE   )selfr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   rD   r+   r,   r-   r.   r/   r0   r1   r2   kwargs	__class__r@   rB   rJ   i   sz   	


	
zCwmConfig.__init__)__name__
__module____qualname____doc__
model_typedefault_thetarG   strfloatbooldictrH   rJ   __classcell__r=   r=   rM   rB   r   %   s    @	

r   c                   @      e Zd ZdS )CwmRotaryEmbeddingNrO   rP   rQ   r=   r=   r=   rB   r[          r[   c                       &   e Zd Zdedef fddZ  ZS )CwmAttentionconfig	layer_idxc                    sn   t  j||d tjj|j|j| j dd| _tjj|j|j	| j dd| _
tjj|j|j	| j dd| _d S )Nr`   ra   F)bias)rI   rJ   torchnnLinearr   r"   r$   q_projr#   k_projv_projrK   r`   ra   rM   r=   rB   rJ      s   "zCwmAttention.__init__rO   rP   rQ   r   rG   rJ   rY   r=   r=   rM   rB   r_          r_   c                       r^   )CwmDecoderLayerr`   ra   c                    s.   t  j||d |j| | _t||d| _d S )Nrb   )rI   rJ   r2   attention_typer_   	self_attnrj   rM   r=   rB   rJ      s   zCwmDecoderLayer.__init__rk   r=   r=   rM   rB   rm      rl   rm   c                   @   rZ   )CwmPreTrainedModelNr\   r=   r=   r=   rB   rp      r]   rp   c                   @   rZ   )CwmModelOutputWithPastNr\   r=   r=   r=   rB   rq      r]   rq   c                       s   e Zd ZeZdef fddZ							ddejdB dejdB dejdB de	dB d	ej
dB d
ejdB dedB dee defddZ  ZS )CwmModelr`   c                    s2   t    tj fddt jD | _d S )Nc                    s   g | ]}t  |qS r=   )rm   )r>   ra   r`   r=   rB   rC      s    z%CwmModel.__init__.<locals>.<listcomp>)rI   rJ   rd   re   
ModuleListrF   r!   layers)rK   r`   rM   rs   rB   rJ      s   
zCwmModel.__init__N	input_idsattention_maskposition_idspast_key_valuesinputs_embedscache_positionr)   rL   returnc              	   K   s2  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r<|d ur-| nd}	tj|jd |jd|	 }|d u rE|	d}t
| }
tsi| j|||||d}| }td
i |td
i |d}
|}| ||}| jd | jj D ]}||f|
|j ||||d|}qz| |}t||d	S )Nz:You must specify exactly one of input_ids or inputs_embedsrs   r   r   )device)r`   rz   rw   r{   ry   rx   )r;   r<   )rw   rx   ry   r{   position_embeddings)last_hidden_statery   r=   )
ValueErrorembed_tokensr   r`   get_seq_lengthrd   arangeshaper}   	unsqueeze
isinstancerX   copyr   r   
rotary_embru   r!   rn   normrq   )rK   rv   rw   rx   ry   rz   r{   r)   rL   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargshidden_statesr~   decoder_layerr=   r=   rB   forward   sV   



zCwmModel.forward)NNNNNNN)rO   rP   rQ   r   config_classrJ   rd   
LongTensorTensorr   FloatTensorrW   r	   r
   rq   r   rY   r=   r=   rM   rB   rr      s:    	
rr   c                   @   rZ   )CwmForCausalLMNr\   r=   r=   r=   rB   r     r]   r   )r   rp   rr   r   )&rd   cache_utilsr   r   configuration_utilsr   masking_utilsr   r   modeling_outputsr   processing_utilsr	   utilsr
   r   llama.configuration_llamar   llama.modeling_llamar   r   r   r   qwen2.modeling_qwen2r   r   
get_loggerrO   loggerr   r[   r_   rm   rp   rq   rr   r   __all__r=   r=   r=   rB   <module>   s*   
 K