"""PyTorch TextNet model."""

from typing import Any, Optional, Union

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers import PreTrainedModel
from transformers.activations import ACT2CLS
from transformers.modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from transformers.models.textnet.configuration_textnet import TextNetConfig
from transformers.utils import logging
from transformers.utils.backbone_utils import BackboneMixin

from ...utils import auto_docstring


logger = logging.get_logger(__name__)


class TextNetConvLayer(nn.Module):
    def __init__(self, config: TextNetConfig):
        super().__init__()

        self.kernel_size = config.stem_kernel_size
        self.stride = config.stem_stride
        self.activation_function = config.stem_act_func

        padding = (
            (config.stem_kernel_size[0] // 2, config.stem_kernel_size[1] // 2)
            if isinstance(config.stem_kernel_size, tuple)
            else config.stem_kernel_size // 2
        )

        self.conv = nn.Conv2d(
            config.stem_num_channels,
            config.stem_out_channels,
            kernel_size=config.stem_kernel_size,
            stride=config.stem_stride,
            padding=padding,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm2d(config.stem_out_channels, config.batch_norm_eps)

        self.activation = nn.Identity()
        if self.activation_function is not None:
            self.activation = ACT2CLS[self.activation_function]()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.conv(hidden_states)
        hidden_states = self.batch_norm(hidden_states)
        return self.activation(hidden_states)


class TextNetRepConvLayer(nn.Module):
    r"""
    This layer supports re-parameterization by combining multiple convolutional branches
    (e.g., main convolution, vertical, horizontal, and identity branches) during training.
    At inference time, these branches can be collapsed into a single convolution for
    efficiency, as per the re-parameterization paradigm.

    The "Rep" in the name stands for "re-parameterization" (introduced by RepVGG).
    """

    def __init__(self, config: TextNetConfig, in_channels: int, out_channels: int, kernel_size: int, stride: int):
        super().__init__()

        self.num_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride

        padding = ((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2)

        self.activation_function = nn.ReLU()

        self.main_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.main_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps)

        vertical_padding = ((kernel_size[0] - 1) // 2, 0)
        horizontal_padding = (0, (kernel_size[1] - 1) // 2)

        if kernel_size[1] != 1:
            self.vertical_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(kernel_size[0], 1),
                stride=stride,
                padding=vertical_padding,
                bias=False,
            )
            self.vertical_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps)
        else:
            self.vertical_conv, self.vertical_batch_norm = None, None

        if kernel_size[0] != 1:
            self.horizontal_conv = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(1, kernel_size[1]),
                stride=stride,
                padding=horizontal_padding,
                bias=False,
            )
            self.horizontal_batch_norm = nn.BatchNorm2d(num_features=out_channels, eps=config.batch_norm_eps)
        else:
            self.horizontal_conv, self.horizontal_batch_norm = None, None

        self.rbr_identity = (
            nn.BatchNorm2d(num_features=in_channels, eps=config.batch_norm_eps)
            if out_channels == in_channels and stride == 1
            else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        main_outputs = self.main_conv(hidden_states)
        main_outputs = self.main_batch_norm(main_outputs)

        # applies a convolution with a vertical kernel
        if self.vertical_conv is not None:
            vertical_outputs = self.vertical_conv(hidden_states)
            vertical_outputs = self.vertical_batch_norm(vertical_outputs)
            main_outputs = main_outputs + vertical_outputs

        # applies a convolution with a horizontal kernel
        if self.horizontal_conv is not None:
            horizontal_outputs = self.horizontal_conv(hidden_states)
            horizontal_outputs = self.horizontal_batch_norm(horizontal_outputs)
            main_outputs = main_outputs + horizontal_outputs

        # the identity branch is a plain batch norm applied to the input
        if self.rbr_identity is not None:
            id_out = self.rbr_identity(hidden_states)
            main_outputs = main_outputs + id_out

        return self.activation_function(main_outputs)


class TextNetStage(nn.Module):
    def __init__(self, config: TextNetConfig, depth: int):
        super().__init__()
        kernel_size = config.conv_layer_kernel_sizes[depth]
        stride = config.conv_layer_strides[depth]

        num_layers = len(kernel_size)
        stage_in_channel_size = config.hidden_sizes[depth]
        stage_out_channel_size = config.hidden_sizes[depth + 1]

        in_channels = [stage_in_channel_size] + [stage_out_channel_size] * (num_layers - 1)
        out_channels = [stage_out_channel_size] * num_layers

        stage = []
        for stage_config in zip(in_channels, out_channels, kernel_size, stride):
            stage.append(TextNetRepConvLayer(config, *stage_config))
        self.stage = nn.ModuleList(stage)

    def forward(self, hidden_state):
        for block in self.stage:
            hidden_state = block(hidden_state)
        return hidden_state


class TextNetEncoder(nn.Module):
    def __init__(self, config: TextNetConfig):
        super().__init__()

        stages = []
        num_stages = len(config.conv_layer_kernel_sizes)
        for stage_ix in range(num_stages):
            stages.append(TextNetStage(config, stage_ix))
        self.stages = nn.ModuleList(stages)

    def forward(
        self,
        hidden_state: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BaseModelOutputWithNoAttention:
        hidden_states = [hidden_state]
        for stage in self.stages:
            hidden_state = stage(hidden_state)
            hidden_states.append(hidden_state)

        if not return_dict:
            output = (hidden_state,)
            return output + (hidden_states,) if output_hidden_states else output

        return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=hidden_states)


@auto_docstring
class TextNetPreTrainedModel(PreTrainedModel):
    config_class = TextNetConfig
    base_model_prefix = "textnet"
    main_input_name = "pixel_values"

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.BatchNorm2d):
            module.weight.data.fill_(1.0)
            if module.bias is not None:
                module.bias.data.zero_()


@auto_docstring
class TextNetModel(TextNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.stem = TextNetConvLayer(config)
        self.encoder = TextNetEncoder(config)
        self.pooler = nn.AdaptiveAvgPool2d((2, 2))
        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
    ) -> Union[tuple[Any, list[Any]], tuple[Any], BaseModelOutputWithPoolingAndNoAttention]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_state = self.stem(pixel_values)

        encoder_outputs = self.encoder(
            hidden_state, output_hidden_states=output_hidden_states, return_dict=return_dict
        )
        last_hidden_state = encoder_outputs[0]
        pooled_output = self.pooler(last_hidden_state)

        if not return_dict:
            output = (last_hidden_state, pooled_output)
            return output + (encoder_outputs[1],) if output_hidden_states else output

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states if output_hidden_states else None,
        )


@auto_docstring(
    custom_intro="""
    TextNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """
)
class TextNetForImageClassification(TextNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.textnet = TextNetModel(config)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()

        self.classifier = nn.ModuleList([self.avg_pool, self.flatten])
        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> ImageClassifierOutputWithNoAttention:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> import torch
        >>> import requests
        >>> from transformers import TextNetForImageClassification, TextNetImageProcessor
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = TextNetImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = TextNetForImageClassification.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> outputs.logits.shape
        torch.Size([1, 2])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.textnet(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
        last_hidden_state = outputs[0]
        for layer in self.classifier:
            last_hidden_state = layer(last_hidden_state)

        logits = self.fc(last_hidden_state)
        loss = None

        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return (loss,) + output if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)


@auto_docstring(
    custom_intro="""
    TextNet backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class TextNetBackbone(TextNetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.textnet = TextNetModel(config)
        self.num_features = config.hidden_sizes

        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
    ) -> Union[tuple[tuple], BackboneOutput]:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, AutoBackbone

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = AutoBackbone.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(image, return_tensors="pt")
        >>> with torch.no_grad():
        >>>     outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.textnet(pixel_values, output_hidden_states=True, return_dict=return_dict)

        hidden_states = outputs.hidden_states if return_dict else outputs[2]

        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (hidden_states[idx],)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                hidden_states = outputs.hidden_states if return_dict else outputs[2]
                output += (hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )


__all__ = ["TextNetBackbone", "TextNetModel", "TextNetPreTrainedModel", "TextNetForImageClassification"]