import math
from collections.abc import Callable
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub
from ...integrations.flex_attention import compile_friendly_flex_attention
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import AttentionInterface, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available
from ...utils.generic import maybe_autocast, merge_with_config_defaults
from ...utils.output_capturing import OutputRecorder, capture_outputs
from .configuration_doge import DogeConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask
@use_kernel_forward_from_hub("RMSNorm")
class DogeRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
        """
        DogeRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class DogeRotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor

    def __init__(self, config: DogeConfig, device=None):
        super().__init__()
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings
        self.config = config
        self.rope_type = self.config.rope_parameters["rope_type"]

        rope_init_fn = self.compute_default_rope_parameters
        if self.rope_type != "default":
            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
    @staticmethod
    def compute_default_rope_parameters(
        config: Optional[DogeConfig] = None,
        device: Optional["torch.device"] = None,
        seq_len: Optional[int] = None,
    ) -> tuple["torch.Tensor", float]:
        """
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        """
        base = config.rope_parameters["rope_theta"]
        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads

        attention_factor = 1.0  # Unused in this type of RoPE

        # Compute the inverse frequencies
        inv_freq = 1.0 / (
            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
        )
        return inv_freq, attention_factor

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with maybe_autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


@use_kernel_func_from_hub("rotary_pos_emb")
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
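# The two helpers above implement grouped-query attention in its materialized form: key/value heads are tiled with
# `repeat_kv` until they match the query heads, then ordinary scaled-dot-product attention is computed. A rough
# shape walk-through on illustrative sizes (not taken from any released checkpoint):
#
#   query:  (batch, num_attention_heads,  q_len, head_dim)   e.g. (1, 8, 16, 64)
#   key:    (batch, num_key_value_heads, kv_len, head_dim)   e.g. (1, 2, 16, 64)
#   repeat_kv(key, n_rep=4)            -> (1, 8, 16, 64)
#   query @ key_states.transpose(2, 3) -> (1, 8, 16, 16) scores, scaled by `scaling` and offset by the mask
#   softmax(scores) @ value_states     -> (1, 8, 16, 64), transposed to (1, 16, 8, 64) before the output projection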
def flex_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    block_mask = None
    causal_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        causal_mask = attention_mask

    if causal_mask is not None:
        causal_mask = causal_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if causal_mask is not None:
            score = score + causal_mask[batch_idx][head_idx][q_idx][kv_idx]
        return score

    attn_output, attention_weights = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=True,
        scale=scaling,
        return_lse=True,
    )
    attention_weights = attention_weights.to(value.dtype)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attention_weights


ALL_ATTENTION_FUNCTIONS = AttentionInterface()
ALL_ATTENTION_FUNCTIONS["doge_flex_attention"] = flex_attention_forward


class DogeAttention(nn.Module):
    def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.keep_window_size = config.keep_window_size

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        # dynamic mask parameters used to build the sparse attention mask
        self.A = nn.Parameter(torch.zeros(config.num_attention_heads))
        self.dt_proj = nn.Linear(
            config.num_key_value_heads * self.head_dim, config.num_attention_heads, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.q_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # compute per-head dynamic gates from the value states
        dt_states = self.dt_proj(
            value_states.transpose(1, 2).reshape(value_states.shape[0], value_states.shape[-2], -1)
        )
        dt_states = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
        attn_mask = self.prepare_dynamic_mask(
            hidden_states=hidden_states,
            dt_states=dt_states,
            keep_window_size=self.keep_window_size,
            attention_mask=attention_mask,
        )
        attn_mask = repeat_kv(attn_mask, self.num_key_value_groups)

        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attn_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights

    def prepare_dynamic_mask(
        self,
        hidden_states: torch.Tensor,
        dt_states: torch.Tensor,
        keep_window_size: int = 2048,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        """
        The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

        Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

        Args:
            hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
            dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
            keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
            attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
        """
        min_dtype = torch.finfo(hidden_states.dtype).min
        dtype = hidden_states.dtype
        attn_mask = dt_states[:, :, None, :].expand(-1, -1, hidden_states.shape[1], -1)
        if attention_mask is not None and not isinstance(attention_mask, BlockMask):
            if attention_mask.dtype == torch.bool:
                dtype = hidden_states.dtype
                attention_mask = torch.where(
                    attention_mask, torch.tensor(0.0, device=attention_mask.device, dtype=dtype), min_dtype
                )
            attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : attn_mask.shape[-1]] != 0, min_dtype)
        if attn_mask.shape[-1] > keep_window_size:
            active_mask = torch.zeros_like(attn_mask, dtype=dtype, device=attn_mask.device)
            topk_indices = torch.topk(attn_mask, keep_window_size, dim=-1, largest=True, sorted=False).indices
            active_mask = active_mask.scatter(-1, topk_indices, 1.0)
            attn_mask = attn_mask.masked_fill(active_mask == 0.0, min_dtype)
        return attn_mask
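# Dynamic Mask Attention (DMA) in `prepare_dynamic_mask` above, sketched on illustrative numbers (not from any
# released config): with keep_window_size=4 and a key length of 6, every key position gets a per-head score
# exp(A * softplus(dt_proj(V))); the top-4 scores per row are kept and the remaining positions are filled with the
# dtype minimum, so softmax assigns them zero weight. For sequences no longer than `keep_window_size`, the mask
# reduces to the ordinary causal/padding mask and nothing is dropped.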
class DogeMLP(nn.Module):
    def __init__(self, config: DogeConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class DogeCDMoE(nn.Module):
    def __init__(self, config: DogeConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.act_fn = ACT2FN[config.hidden_act]

        self.num_experts = config.num_experts
        self.num_keys = math.floor(math.sqrt(self.num_experts))
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # shared expert
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)

        # routed experts addressed with product keys
        self.router_gate = nn.Linear(self.hidden_size, self.num_keys * 2, bias=False)
        self.down_embed = nn.Embedding(self.num_experts, self.hidden_size)
        self.up_embed = nn.Embedding(self.num_experts, self.hidden_size)

    def forward(self, hidden_states: torch.Tensor, **kwargs):
        bsz, seq_len, _ = hidden_states.shape

        # get routing logits with router gate
        router_logits = self.router_gate(hidden_states).view(2, bsz * seq_len, -1)

        # get experts with the highest routing logits
        (scores_x, scores_y), (indices_x, indices_y) = router_logits.topk(self.num_keys, dim=-1)
        all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
        all_indices = indices_x.unsqueeze(-1) * self.num_keys + indices_y.unsqueeze(-2)
        all_scores = all_scores.view(*all_scores.shape[:-2], -1)
        all_indices = all_indices.view(*all_indices.shape[:-2], -1)
        scores, position_indices = all_scores.topk(self.top_k, dim=-1)
        indices = all_indices.gather(-1, position_indices)
        routing_weights = F.softmax(scores, dim=-1)
        if self.norm_topk_prob:
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

        # mix routed experts states with routing weights
        down_embed = self.down_embed(indices)
        up_embed = self.up_embed(indices)
        experts_weights = torch.matmul(down_embed, hidden_states.view(bsz * seq_len, -1, 1)).view(bsz * seq_len, -1)
        experts_weights = self.act_fn(experts_weights) * routing_weights
        experts_states = torch.matmul(experts_weights.view(bsz * seq_len, 1, -1), up_embed).view(bsz, seq_len, -1)

        # mix shared expert states with routed experts states
        hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
        hidden_states = hidden_states + experts_states
        return hidden_states, router_logits


class DogeDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_dropout = config.hidden_dropout

        self.input_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.self_attn = DogeAttention(config=config, layer_idx=layer_idx)
        self.input_residual = nn.Parameter(torch.ones(config.hidden_size))

        self.post_attention_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = DogeMLP(config) if not config.is_moe else DogeCDMoE(config)
        self.post_attention_residual = nn.Parameter(torch.ones(config.hidden_size))

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        # sequence transformation
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
        hidden_states = self.input_residual * residual + hidden_states

        # state transformation
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        if isinstance(hidden_states, tuple):
            # DogeCDMoE also returns its router logits; they are collected separately via output recording
            hidden_states = hidden_states[0]
        hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
        hidden_states = self.post_attention_residual * residual + hidden_states

        return hidden_states


@auto_docstring
class DogePreTrainedModel(PreTrainedModel):
    config: DogeConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DogeDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = False
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = False
    _can_record_outputs = {
        "router_logits": OutputRecorder(DogeCDMoE, index=1),
        "hidden_states": DogeDecoderLayer,
        "attentions": DogeAttention,
    }

    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        if isinstance(module, DogeAttention):
            if hasattr(module, "A"):
                init.zeros_(module.A)
        elif isinstance(module, DogeDecoderLayer):
            if hasattr(module, "input_residual"):
                init.ones_(module.input_residual)
            if hasattr(module, "post_attention_residual"):
                init.ones_(module.post_attention_residual)


@auto_docstring
class DogeModel(DogePreTrainedModel):
    def __init__(self, config: DogeConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [DogeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = DogeRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
        causal_mask = mask_function(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
def load_balancing_loss_func(
    gate_logits: Optional[Union[torch.Tensor, tuple[torch.Tensor]]] = None,
    num_experts: Optional[int] = None,
    num_keys: Optional[int] = None,
    top_k: int = 2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    r"""
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [2, batch_size * sequence_length, num_keys].
        num_experts:
            Number of experts
        num_keys:
            Number of keys
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    """
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    compute_dtype = gate_logits[0].dtype
    compute_device = gate_logits[0].device

    all_expert_indices = []
    all_routing_weights = []
    for layer_gate_logits in gate_logits:
        layer_gate_logits = layer_gate_logits.to(compute_device)

        # recover the per-token expert choices from the two product-key score halves
        (scores_x, scores_y), (indices_x, indices_y) = layer_gate_logits.topk(num_keys, dim=-1)
        all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
        all_indices = indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)
        all_scores = all_scores.view(*all_scores.shape[:-2], -1)
        all_indices = all_indices.view(*all_indices.shape[:-2], -1)
        scores, position_indices = all_scores.topk(top_k, dim=-1)
        expert_indices = all_indices.gather(-1, position_indices)
        routing_weights = F.softmax(all_scores, dim=-1)

        all_expert_indices.append(expert_indices)
        all_routing_weights.append(routing_weights)

    all_expert_indices = torch.cat(all_expert_indices, dim=0)
    all_routing_weights = torch.cat(all_routing_weights, dim=0)

    if attention_mask is None:
        # Compute the fraction of tokens routed to each expert
        all_expert_indices = all_expert_indices.reshape(-1)
        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
        pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device)
        tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / all_expert_indices.shape[0]

        # Compute the average probability of routing to these experts
        router_prob_per_expert = torch.mean(all_routing_weights, dim=0)
    else:
        batch_size, sequence_length = attention_mask.shape
        num_hidden_layers = len(gate_logits)

        # Compute the fraction of tokens routed to each expert, ignoring padded positions
        expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, top_k))
            .reshape(-1, top_k)
            .to(compute_device)
        )
        all_expert_indices = all_expert_indices.reshape(-1)
        tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device)
        pad = expert_attention_mask.reshape(-1).to(compute_dtype)
        tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / torch.sum(
            expert_attention_mask
        )

        # Compute the average probability of routing to these experts, ignoring padded positions
        router_per_expert_attention_mask = (
            attention_mask[None, :, :, None]
            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
            .reshape(-1, num_experts)
            .to(compute_device)
        )
        router_prob_per_expert = torch.sum(all_routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
            router_per_expert_attention_mask, dim=0
        )

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert)
    return overall_loss * num_experts
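# In the notation of the Switch Transformer paper cited in the docstring, the value returned above is
#   loss = num_experts * sum_e f_e * P_e
# where f_e is the fraction of routed tokens dispatched to expert e (`tokens_per_expert`) and P_e is the mean router
# probability assigned to expert e (`router_prob_per_expert`). The loss is smallest when routing is uniform, so adding
# it to the language-modeling loss (scaled by `router_aux_loss_coef`) nudges the CDMoE router toward balanced usage.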
@auto_docstring
class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
    _tp_plan = {"lm_head": "colwise_gather_output"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = DogeModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_experts
        self.num_experts_per_tok = config.num_experts_per_tok

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        output_router_logits: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, DogeForCausalLM

        >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
        >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                self.num_experts,
                math.floor(math.sqrt(self.num_experts)),
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                # make sure the auxiliary loss resides on the same device as the main loss
                loss += self.router_aux_loss_coef * aux_loss.to(loss.device)

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )


class DogeForSequenceClassification(GenericForSequenceClassification, DogePreTrainedModel):
    pass


__all__ = ["DogeForCausalLM", "DogeModel", "DogePreTrainedModel", "DogeForSequenceClassification"]