o
    ei0;                     @   sD  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ eeZG dd dejZG dd dejZG dd dejZG dd dejZeG dd deZeG dd deZ eddG dd deZ!eddG dd de
eZ"g d Z#dS )!zPyTorch TextNet model.    )AnyN)Tensor   )ACT2CLS)BackboneMixin)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging   )TextNetConfigc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )TextNetConvLayerconfigc                    s   t    |j| _|j| _|j| _t|jt	r%|jd d |jd d fn|jd }t
j|j|j|j|j|dd| _t
|j|j| _t
 | _| jd urVt| j  | _d S d S )Nr      r   F)kernel_sizestridepaddingbias)super__init__stem_kernel_sizer   stem_strider   stem_act_funcactivation_function
isinstancetuplennConv2dstem_num_channelsstem_out_channelsconvBatchNorm2dbatch_norm_eps
batch_normIdentity
activationr   )selfr   r   	__class__ j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/textnet/modeling_textnet.pyr   '   s*   



zTextNetConvLayer.__init__hidden_statesreturnc                 C   s   |  |}| |}| |S N)r#   r&   r(   )r)   r.   r,   r,   r-   forwardB   s   


zTextNetConvLayer.forward)	__name__
__module____qualname__r   r   torchr   r1   __classcell__r,   r,   r*   r-   r   &   s    r   c                
       sL   e Zd ZdZdededededef
 fddZd	ejd
ejfddZ	  Z
S )TextNetRepConvLayera  
    This layer supports re-parameterization by combining multiple convolutional branches
    (e.g., main convolution, vertical, horizontal, and identity branches) during training.
    At inference time, these branches can be collapsed into a single convolution for
    efficiency, as per the re-parameterization paradigm.

    The "Rep" in the name stands for "re-parameterization" (introduced by RepVGG).
    r   in_channelsout_channelsr   r   c           	         sf  t    || _|| _|| _|| _|d d d |d d d f}t | _tj	|||||dd| _
tj||jd| _|d d d df}d|d d d f}|d dkrotj	|||d df||dd| _tj||jd| _nd\| _| _|d dkrtj	||d|d f||dd| _tj||jd| _nd\| _| _||kr|dkrtj||jd| _d S d | _d S )Nr   r   r   F)r8   r9   r   r   r   r   )num_featuresepsNN)r   r   num_channelsr9   r   r   r   ReLUr   r    	main_convr$   r%   main_batch_normvertical_convvertical_batch_normhorizontal_convhorizontal_batch_normrbr_identity)	r)   r   r8   r9   r   r   r   vertical_paddinghorizontal_paddingr*   r,   r-   r   R   sZ   
 


zTextNetRepConvLayer.__init__r.   r/   c                 C   s   |  |}| |}| jd ur| |}| |}|| }| jd ur0| |}| |}|| }| jd ur>| |}|| }| |S r0   )r?   r@   rA   rB   rC   rD   rE   r   )r)   r.   main_outputsvertical_outputshorizontal_outputsid_outr,   r,   r-   r1      s   










zTextNetRepConvLayer.forward)r2   r3   r4   __doc__r   intr   r5   r   r1   r6   r,   r,   r*   r-   r7   H   s    "	9r7   c                       s.   e Zd Zdedef fddZdd Z  ZS )TextNetStager   depthc                    s   t    |j| }|j| }t|}|j| }|j|d  }|g|g|d   }|g| }	g }
t||	||D ]}|
t|g|R   q7t	
|
| _d S )Nr   )r   r   conv_layer_kernel_sizesconv_layer_strideslenhidden_sizeszipappendr7   r   
ModuleListstage)r)   r   rO   r   r   
num_layersstage_in_channel_sizestage_out_channel_sizer8   r9   rW   stage_configr*   r,   r-   r      s   




zTextNetStage.__init__c                 C   s   | j D ]}||}q|S r0   )rW   )r)   hidden_stateblockr,   r,   r-   r1      s   

zTextNetStage.forward)r2   r3   r4   r   rM   r   r1   r6   r,   r,   r*   r-   rN      s    rN   c                	       sL   e Zd Zdef fddZ		ddejdedB dedB defd	d
Z	  Z
S )TextNetEncoderr   c                    sF   t    g }t|j}t|D ]
}|t|| qt|| _	d S r0   )
r   r   rR   rP   rangerU   rN   r   rV   stages)r)   r   r`   
num_stagesstage_ixr*   r,   r-   r      s   

zTextNetEncoder.__init__Nr\   output_hidden_statesreturn_dictr/   c                 C   sL   |g}| j D ]}||}|| q|s |f}|r||f S |S t||dS )N)last_hidden_stater.   )r`   rU   r   )r)   r\   rc   rd   r.   rW   outputr,   r,   r-   r1      s   
zTextNetEncoder.forwardr<   )r2   r3   r4   r   r   r5   r   boolr   r1   r6   r,   r,   r*   r-   r^      s    r^   c                   @   s   e Zd ZU eed< dZdZdS )TextNetPreTrainedModelr   textnetpixel_valuesN)r2   r3   r4   r   __annotations__base_model_prefixmain_input_namer,   r,   r,   r-   rh      s   
 rh   c                       s`   e Zd Z fddZe		d
dededB dedB deee	e f ee B e
B fdd	Z  ZS )TextNetModelc                    s8   t  | t|| _t|| _td| _| 	  d S )N)r   r   )
r   r   r   stemr^   encoderr   AdaptiveAvgPool2dpooler	post_initr)   r   r*   r,   r-   r      s
   

zTextNetModel.__init__Nrj   rc   rd   r/   c           
      K   s   |d ur|n| j j}|d ur|n| j j}| |}| j|||d}|d }| |}|s;||f}	|r9|	|d f S |	S t|||rF|d dS d dS )Nrc   rd   r   r   )re   pooler_outputr.   )r   use_return_dictrc   ro   rp   rr   r	   )
r)   rj   rc   rd   kwargsr\   encoder_outputsre   pooled_outputrf   r,   r,   r-   r1      s&   


zTextNetModel.forwardr<   )r2   r3   r4   r   r   r   rg   r   r   listr	   r1   r6   r,   r,   r*   r-   rn      s    rn   z
    TextNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                       s\   e Zd Z fddZe				ddejdB dejdB dedB dedB de	f
d	d
Z
  ZS )TextNetForImageClassificationc                    s|   t  | |j| _t|| _td| _t | _	|jdkr)t
|jd |jnt | _t| j| j	g| _|   d S )N)r   r   r   )r   r   
num_labelsrn   ri   r   rq   avg_poolFlattenflattenLinearrS   r'   fcrV   
classifierrs   rt   r*   r,   r-   r     s   

(z&TextNetForImageClassification.__init__Nrj   labelsrc   rd   r/   c                 K   s   |dur|n| j j}| j|||d}|d }| jD ]}||}q| |}	d}
|dur3| ||	| j }
|sI|	f|dd  }|
durG|
f| S |S t|
|	|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import TextNetForImageClassification, TextNetImageProcessor
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = TextNetImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = TextNetForImageClassification.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> outputs.logits.shape
        torch.Size([1, 2])
        ```Nru   r   r   )losslogitsr.   )r   rw   ri   r   r   loss_functionr
   r.   )r)   rj   r   rc   rd   rx   outputsre   layerr   r   rf   r,   r,   r-   r1     s   $


z%TextNetForImageClassification.forward)NNNN)r2   r3   r4   r   r   r5   FloatTensor
LongTensorrg   r
   r1   r6   r,   r,   r*   r-   r}     s$    r}   zP
    TextNet backbone, to be used with frameworks like DETR and MaskFormer.
    c                       sT   e Zd ZdZ fddZe		ddededB dedB dee e	B fd	d
Z
  ZS )TextNetBackboneFc                    s*   t  | t|| _|j| _|   d S r0   )r   r   rn   ri   rS   r:   rs   rt   r*   r,   r-   r   \  s   
zTextNetBackbone.__init__Nrj   rc   rd   r/   c                 K   s   |dur|n| j j}|dur|n| j j}| j|d|d}|r!|jn|d }d}t| jD ]\}}	|	| jv r<||| f7 }q,|sT|f}
|rR|rI|jn|d }|
|f7 }
|
S t||r^|jddS dddS )a  
        Examples:

        ```python
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, AutoBackbone

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("czczup/textnet-base")
        >>> model = AutoBackbone.from_pretrained("czczup/textnet-base")

        >>> inputs = processor(image, return_tensors="pt")
        >>> with torch.no_grad():
        >>>     outputs = model(**inputs)
        ```NTru   r   r,   )feature_mapsr.   
attentions)	r   rw   rc   ri   r.   	enumeratestage_namesout_featuresr   )r)   rj   rc   rd   rx   r   r.   r   idxrW   rf   r,   r,   r-   r1   e  s0   

zTextNetBackbone.forwardr<   )r2   r3   r4   has_attentionsr   r   r   rg   r   r   r1   r6   r,   r,   r*   r-   r   T  s    	
r   )r   rn   rh   r}   )$rL   typingr   r5   torch.nnr   r   activationsr   backbone_utilsr   modeling_outputsr   r   r	   r
   modeling_utilsr   utilsr   r   configuration_textnetr   
get_loggerr2   loggerModuler   r7   rN   r^   rh   rn   r}   r   __all__r,   r,   r,   r-   <module>   s:   
"Z)FE