o
    i                     @   sh  d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& e 'e(Z)G dd de
j*Z+G dd de
j*Z,	dAde
j*de	j-de	j-de	j-dee	j- de.de.fddZ/G dd  d e
j*Z0G d!d" d"e
j*Z1G d#d$ d$e
j*Z2G d%d& d&e
j*Z3G d'd( d(e
j*Z4G d)d* d*eZ5G d+d, d,e
j*Z6eG d-d. d.eZ7eG d/d0 d0e7Z8G d1d2 d2e
j*Z9ed3d4G d5d6 d6e7Z:ed7d4G d8d9 d9e7Z;eed:d4G d;d< d<eZ<ed=d4G d>d? d?e7Z=g d@Z>dS )BzPyTorch DeiT model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplecheck_model_inputs   )
DeiTConfigc                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdeej dedejfddZ  ZS )DeiTEmbeddingszv
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenreturnNc                    s   t    ttdd|j| _ttdd|j| _|r*ttdd|jnd | _	t
|| _| jj}ttd|d |j| _t|j| _|j| _d S )Nr      )super__init__r   	Parametertorchzeroshidden_size	cls_tokendistillation_token
mask_tokenDeiTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_size)selfr   r   r,   	__class__ c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/deit/modeling_deit.pyr"   0   s   
 
zDeiTEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r    N      ?r   r   bicubicF)sizemodealign_cornersdim)shaper-   r$   jit
is_tracingr1   r   reshapepermuter   
functionalinterpolateviewcat)r2   r7   r8   r9   r,   num_positionsclass_and_dist_pos_embedpatch_pos_embedrA   
new_height	new_widthsqrt_num_positionsr5   r5   r6   interpolate_pos_encoding<   s(   



z'DeiTEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posrQ   c                 C   s   |j \}}}}| |}| \}}	}|d ur1| j||	d}
|d|
}|d|  |
|  }| j|dd}| j|dd}t	j
|||fdd}| j}|rW| |||}|| }| |}|S )Nr:         ?r   r@   )rB   r+   r=   r)   expand	unsqueezetype_asr'   r(   r$   rJ   r-   rQ   r0   )r2   rR   rS   rQ   _r8   r9   r7   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokensposition_embeddingr5   r5   r6   forwardd   s    

zDeiTEmbeddings.forward)F)NF)__name__
__module____qualname____doc__r   boolr"   r$   TensorintrQ   r   
BoolTensorr`   __classcell__r5   r5   r3   r6   r   +   s    +r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )r*   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r!   r"   
image_sizer1   num_channelsr&   
isinstancecollectionsabcIterabler,   r   Conv2d
projection)r2   r   rl   r1   rm   r&   r,   r3   r5   r6   r"      s   
 zDeiTPatchEmbeddings.__init__rR   r   c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r    r   )rB   rm   
ValueErrorrs   flatten	transpose)r2   rR   rY   rm   r8   r9   xr5   r5   r6   r`      s   
zDeiTPatchEmbeddings.forward)	ra   rb   rc   rd   r"   r$   rf   r`   ri   r5   r5   r3   r6   r*      s    r*           modulequerykeyvalueattention_maskscalingr0   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )Nr:   )rA   dtype)ptrainingr   r    )r$   matmulrv   r   rG   softmaxfloat32tor   r0   r   
contiguous)
ry   rz   r{   r|   r}   r~   r0   kwargsattn_weightsattn_outputr5   r5   r6   eager_attention_forward   s   r   c                	       sP   e Zd Zdef fddZ	d
dejdeej deejejf fdd	Z	  Z
S )DeiTSelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r!   r"   r&   num_attention_headshasattrrt   r   rg   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr~   	is_causalr   Linearqkv_biasrz   r{   r|   r2   r   r3   r5   r6   r"      s"   

zDeiTSelfAttention.__init__Nhidden_states	head_maskr   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t}| j	j
dkr?t| j	j
 }|| ||||| j| j| jsNdn| jd\}	}
|	 d d | jf }|	|}	|	|
fS )	Nr   r:   r   r    eagerrx   )r   r~   r0   r   )rB   r   r   r{   rI   rv   r|   rz   r   r   _attn_implementationr   r   r~   r   r   r=   r   rE   )r2   r   r   rY   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper5   r5   r6   r`      s*   


zDeiTSelfAttention.forwardN)ra   rb   rc   r   r"   r$   rf   r   tupler`   ri   r5   r5   r3   r6   r      s    r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
DeiTSelfOutputz
    The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S r   )	r!   r"   r   r   r&   denser.   r/   r0   r   r3   r5   r6   r"         
zDeiTSelfOutput.__init__r   input_tensorr   c                 C      |  |}| |}|S r   r   r0   r2   r   r   r5   r5   r6   r`        

zDeiTSelfOutput.forward)
ra   rb   rc   rd   r   r"   r$   rf   r`   ri   r5   r5   r3   r6   r      s    $r   c                       sV   e Zd Zdef fddZdee fddZddej	d	e
ej	 d
ej	fddZ  ZS )DeiTAttentionr   c                    s*   t    t|| _t|| _t | _d S r   )r!   r"   r   	attentionr   outputsetpruned_headsr   r3   r5   r6   r"     s   


zDeiTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r@   )lenr   r   r   r   r   r   rz   r{   r|   r   r   r   union)r2   r   indexr5   r5   r6   prune_heads  s   zDeiTAttention.prune_headsNr   r   r   c                 C   s    |  ||\}}| ||}|S r   )r   r   )r2   r   r   self_attn_outputrX   r   r5   r5   r6   r`   $  s   zDeiTAttention.forwardr   )ra   rb   rc   r   r"   r   rg   r   r$   rf   r   r`   ri   r5   r5   r3   r6   r     s    *r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )DeiTIntermediater   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r!   r"   r   r   r&   intermediate_sizer   rn   
hidden_actstrr   intermediate_act_fnr   r3   r5   r6   r"   ,  s
   
zDeiTIntermediate.__init__r   r   c                 C   r   r   )r   r   )r2   r   r5   r5   r6   r`   4  r   zDeiTIntermediate.forward	ra   rb   rc   r   r"   r$   rf   r`   ri   r5   r5   r3   r6   r   +  s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	
DeiTOutputr   c                    s.   t    t|j|j| _t|j| _	d S r   )
r!   r"   r   r   r   r&   r   r.   r/   r0   r   r3   r5   r6   r"   <  r   zDeiTOutput.__init__r   r   r   c                 C   s    |  |}| |}|| }|S r   r   r   r5   r5   r6   r`   A  s   

zDeiTOutput.forwardr   r5   r5   r3   r6   r   ;  s    $r   c                       sH   e Zd ZdZdef fddZddejdeej dejfd	d
Z	  Z
S )	DeiTLayerz?This corresponds to the Block class in the timm implementation.r   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r!   r"   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr&   layer_norm_epslayernorm_beforelayernorm_afterr   r3   r5   r6   r"   L  s   



zDeiTLayer.__init__Nr   r   r   c                 C   sB   |  |}| ||}|| }| |}| |}| ||}|S r   )r   r   r   r   r   )r2   r   r   hidden_states_normattention_outputlayer_outputr5   r5   r6   r`   V  s   


zDeiTLayer.forwardr   )ra   rb   rc   rd   r   r"   r$   rf   r   r`   ri   r5   r5   r3   r6   r   I  s    *
r   c                       sB   e Zd Zdef fddZd
dejdeej defdd	Z	  Z
S )DeiTEncoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r5   )r   ).0rX   r   r5   r6   
<listcomp>l  s    z(DeiTEncoder.__init__.<locals>.<listcomp>F)	r!   r"   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r3   r   r6   r"   i  s   
 
zDeiTEncoder.__init__Nr   r   r   c                 C   s<   t | jD ]\}}|d ur|| nd }|||}qt|dS )N)last_hidden_state)	enumerater   r
   )r2   r   r   ilayer_modulelayer_head_maskr5   r5   r6   r`   o  s   
zDeiTEncoder.forwardr   )ra   rb   rc   r   r"   r$   rf   r   r
   r`   ri   r5   r5   r3   r6   r   h  s    (r   c                   @   sd   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdZeedZdeejejejf dd	fd
dZd	S )DeiTPreTrainedModelr   deitrR   Tr   )r   
attentionsry   r   Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |tri|jj  |jj  |jj  |jdurk|jj  dS dS dS )zInitialize the weightsrx   )meanstdNrT   )rn   r   r   rr   inittrunc_normal_weightdatar   r$   r   r   initializer_ranger   r   zero_r   fill_r   r'   r-   r(   r)   )r2   ry   r5   r5   r6   _init_weights  s(   



z!DeiTPreTrainedModel._init_weights)ra   rb   rc   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   r   r   rr   r   r   r5   r5   r5   r6   r   w  s   
 &r   c                       s   e Zd Zddedededdf fdd	Zdefd
dZdd Ze	dde
				ddeej deej deej dedee defddZ  ZS )	DeiTModelTFr   add_pooling_layerr   r   Nc                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   r   N)r!   r"   r   r   r7   r   encoderr   r   r&   r   	layernorm
DeiTPoolerpooler	post_init)r2   r   r   r   r3   r5   r6   r"     s   
zDeiTModel.__init__c                 C   s   | j jS r   )r7   r+   )r2   r5   r5   r6   get_input_embeddings  s   zDeiTModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r2   heads_to_pruner   r   r5   r5   r6   _prune_heads  s   zDeiTModel._prune_heads)tie_last_hidden_statesrR   rS   r   rQ   r   c                 K   s   |du rt d| || jj}| jjjjj}|j|kr!|	|}| j|||d}| j
||d}|j}	| |	}	| jdurB| |	nd}
t|	|
dS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rS   rQ   )r   )r   pooler_output)rt   get_head_maskr   r   r7   r+   rs   r   r   r   r   r   r   r   r   )r2   rR   rS   r   rQ   r   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputr5   r5   r6   r`     s"   


zDeiTModel.forward)TFNNNF)ra   rb   rc   r   re   r"   r*   r   r   r   r   r   r$   rf   rh   r   r   r   r`   ri   r5   r5   r3   r6   r     s.     r   c                       r   )r   r   c                    s,   t    t|j|j| _t|j | _	d S r   )
r!   r"   r   r   r&   pooler_output_sizer   r   
pooler_act
activationr   r3   r5   r6   r"     s   
zDeiTPooler.__init__r   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )r2   r   first_token_tensorr  r5   r5   r6   r`     s   

zDeiTPooler.forwardr   r5   r5   r3   r6   r     s    r   ad  
    DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                       sp   e Zd Zdeddf fddZee				ddeej	 deej
 d	eej	 d
edee defddZ  ZS )DeiTForMaskedImageModelingr   r   Nc                    sX   t  | t|ddd| _ttj|j|jd |j	 ddt
|j| _|   d S )NFT)r   r   r    r   )in_channelsout_channelsrj   )r!   r"   r   r   r   
Sequentialrr   r&   encoder_striderm   PixelShuffledecoderr   r   r3   r5   r6   r"     s   

z#DeiTForMaskedImageModeling.__init__FrR   rS   r   rQ   r   c                 K   s
  | j |f|||d|}|j}|ddddf }|j\}}	}
t|	d  }}|ddd||
||}| |}d}|dur{| jj| jj	 }|d||}|
| jj	d
| jj	dd }tjj||dd	}||  | d
  | jj }t|||j|jdS )a;  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```)rS   r   rQ   Nr   r:   r;   r   r    none)	reductiongh㈵>)lossreconstructionr   r   )r   r   rB   rg   rF   rE   r  r   rl   r1   repeat_interleaverV   r   r   rG   l1_losssumrm   r   r   r   )r2   rR   rS   r   rQ   r   outputsr  rY   sequence_lengthrm   r8   r9   reconstructed_pixel_valuesmasked_im_lossr=   r\   reconstruction_lossr5   r5   r6   r`     s@   &
 z"DeiTForMaskedImageModeling.forwardr  )ra   rb   rc   r   r"   r   r   r   r$   rf   rh   re   r   r   r   r`   ri   r5   r5   r3   r6   r    s*    r  z
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       sp   e Zd Zdeddf fddZee				ddeej	 deej	 d	eej	 d
e
dee defddZ  ZS )DeiTForImageClassificationr   r   Nc                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S NF)r   r   )r!   r"   
num_labelsr   r   r   r   r&   Identity
classifierr   r   r3   r5   r6   r"   l  s
   $z#DeiTForImageClassification.__init__FrR   r   labelsrQ   r   c           
      K   sr   | j |f||d|}|j}| |dddddf }d}	|dur/| j||| jfi |}	t|	||j|jdS )aZ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DeiTForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: Polaroid camera, Polaroid Land camera
        ```r   rQ   Nr   )r  logitsr   r   )r   r   r%  loss_functionr   r   r   r   )
r2   rR   r   r&  rQ   r   r  r  r(  r  r5   r5   r6   r`   x  s&   *z"DeiTForImageClassification.forwardr  )ra   rb   rc   r   r"   r   r   r   r$   rf   re   r   r   r   r`   ri   r5   r5   r3   r6   r!  e  s*    r!  zC
    Output type of [`DeiTForImageClassificationWithTeacher`].
    c                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )+DeiTForImageClassificationWithTeacherOutputaj  
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the cls_logits and distillation logits.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    Nr(  
cls_logitsdistillation_logitsr   r   )ra   rb   rc   rd   r(  r   r$   FloatTensorr   r+  r,  r   r   r   r5   r5   r5   r6   r*    s   
 r*  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                       sd   e Zd Zdeddf fddZee			ddeej	 deej	 d	e
d
ee def
ddZ  ZS )%DeiTForImageClassificationWithTeacherr   r   Nc                    sv   t  | |j| _t|dd| _|jdkrt|j|jnt | _	|jdkr0t|j|jnt | _
|   d S r"  )r!   r"   r#  r   r   r   r   r&   r$  cls_classifierdistillation_classifierr   r   r3   r5   r6   r"     s     z.DeiTForImageClassificationWithTeacher.__init__FrR   r   rQ   r   c           
      K   sx   | j |f||d|}|j}| |d d dd d f }| |d d dd d f }|| d }	t|	|||j|jdS )Nr'  r   r   r    )r(  r+  r,  r   r   )r   r   r/  r0  r*  r   r   )
r2   rR   r   rQ   r   r  r  r+  r,  r(  r5   r5   r6   r`     s&   	z-DeiTForImageClassificationWithTeacher.forward)NNF)ra   rb   rc   r   r"   r   r   r   r$   rf   re   r   r   r*  r`   ri   r5   r5   r3   r6   r.    s$    r.  )r!  r.  r  r   r   )rx   )?rd   collections.abcro   dataclassesr   typingr   r   r   r$   r   activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   utils.genericr   r   configuration_deitr   
get_loggerra   loggerModuler   r*   rf   floatr   r   r   r   r   r   r   r   r   r   r   r  r!  r*  r.  __all__r5   r5   r5   r6   <module>   s   
Y(
5 $M`O3