o
    ei                     @   s  d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZ ddlmZ ee Z!dZ"G dd dej#Z$G dd dej#Z%dej&de'de'dej&fddZ(G dd dej#Z)G dd dej#Z*G dd dej#Z+G dd  d ej#Z,	"dJd#ej&d$e'd%e'd&e-d'e-dej&fd(d)Z.G d*d+ d+ej#Z/G d,d- d-ej#Z0eG d.d/ d/eZ1G d0d1 d1ej#Z2eed2d3G d4d5 d5eZ3ed6d3G d7d8 d8e1Z4eG d9d: d:e1Z5ed;d3G d<d= d=e1Z6eG d>d? d?e1Z7ed@d3G dAdB dBe1Z8eG dCdD dDe1Z9eG dEdF dFe1Z:eG dGdH dHe1Z;g dIZ<dS )Kz!PyTorch Funnel Transformer model.    )	dataclassN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )FunnelConfigg    .Ac                       sN   e Zd Zdeddf fddZ	d
dejdB dejdB dejfdd	Z  ZS )FunnelEmbeddingsconfigreturnNc                    sH   t    tj|j|j|jd| _tj|j	|j
d| _t|j| _d S )N)padding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddings	LayerNormd_modellayer_norm_eps
layer_normDropouthidden_dropoutdropoutselfr   	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.pyr   -   s   
zFunnelEmbeddings.__init__	input_idsinputs_embedsc                 C   s*   |d u r	|  |}| |}| |}|S N)r"   r&   r)   )r+   r0   r1   
embeddingsr.   r.   r/   forward3   s
   


zFunnelEmbeddings.forwardNN	__name__
__module____qualname__r   r   torchTensorr4   __classcell__r.   r.   r,   r/   r   ,   s    r   c                       s  e Zd ZU dZdZeed< deddf fddZ		d,d	e	j
d
e	j
dB de	j
dB dee	j
 fddZde	j
de	j
fddZdede	jde	jdee	j
 eee	j
  B fddZde	j
defddZd-de	j
dedede	j
fddZde	j
ee	j
 B ee	j
 B d eee B ee B de	j
fd!d"Z	d.de	j
ee	j
 B ee	j
 B d$edede	j
fd%d&Zd'ee	j
 dee	j
ee	j
 f fd(d)Zd'ee	j
 dee	j
 fd*d+Z  ZS )/FunnelAttentionStructurez>
    Contains helpers for `FunnelRelMultiheadAttention `.
       cls_token_type_idr   r   Nc                    s6   t    || _t|j| _t|j| _d | _d S r2   )	r   r   r   r   r'   r(   sin_dropoutcos_dropoutpooling_multr*   r,   r.   r/   r   D   s
   

z!FunnelAttentionStructure.__init__r1   attention_masktoken_type_idsc                 C   sv   d| _ |d | _}| ||j|j}|dur| |nd}| jjr3t	j
||d |d gdnd}||||fS )zCReturns the attention inputs associated to the inputs of the model.r   N)r   r   r   r   )rB   sizeseq_lenget_position_embedsdtypedevicetoken_type_ids_to_matr   separate_clsr   
functionalpadnew_ones)r+   r1   rC   rD   rF   position_embedstoken_type_matcls_maskr.   r.   r/   init_attention_inputsM   s   	"z.FunnelAttentionStructure.init_attention_inputsc                 C   s^   |dddddf |dddf k}|| j k}|dddddf |dddf B }||B S )z-Convert `token_type_ids` to `token_type_mat`.N)r?   )r+   rD   rP   cls_idscls_matr.   r.   r/   rJ   a   s   &
&z.FunnelAttentionStructure.token_type_ids_to_matrF   rH   rI   c                 C   s  | j j}| j jdkr}tjd|dtj|d|}tjd|d dtj|d|}dd||d    }|dddf |d  }t|}	| |	}
t	|}| 
|}tj|
|
gd	d
}tj||	gd	d
}tj||gd	d
}tj|	 |gd	d
}||||fS tjd|d dtj|d|}dd||d    }tj| d |d dtj|d|}|d }|dddf |d  }| t|}	| 
t	|}tj|	|gd	d
}tjd|tj|d|}|}g }td| j jD ]e}|dkrd}n/| ||}d|d  }| j|||dd}|dddf | }||d|}t|d|}|}d| }| ||}|dddf | }||d|}t|d|}|||g q|S )a  
        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
        are using the factorized or the relative shift attention:

        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
        final formula.

        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
        formula.

        Paper link: https://huggingface.co/papers/2006.03236
        
factorizedr         ?rH   rI   r>   r   i'  Ndim)shift)r   r$   attention_typer:   arangeint64tosinr@   cosrA   catrange
num_blocksstride_pool_posrelative_posexpandrE   gatherappend)r+   rF   rH   rI   r$   pos_seqfreq_seqinv_freqsinusoid	sin_embedsin_embed_d	cos_embedcos_embed_dphipsipiomega
rel_pos_idzero_offset	pos_embedpos
pooled_posposition_embeds_listblock_indexposition_embeds_poolingstriderel_posposition_embeds_no_poolingr.   r.   r/   rG   i   sV    



 &z,FunnelAttentionStructure.get_position_embedspos_idr|   c                 C   sf   | j jr,|d|  d g}| j jr|dd n|dd }t||ddd gdS |ddd S )ze
        Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
        r>   r   rX   Nr   )r   rK   
new_tensortruncate_seqr:   rb   )r+   r   r|   cls_pospooled_pos_idr.   r.   r/   re      s
    z(FunnelAttentionStructure.stride_pool_posr   ry   r~   r[   c           	      C   sb   |du r|}|d |d  }|t | }|||  }|d |d  }tj||d | tj|jdS )zV
        Build the relative positional vector between `pos` and `pooled_pos`.
        Nr   rX   r   rW   )lenr:   r]   longrI   )	r+   ry   r~   rz   r[   	ref_point
num_removemax_distmin_distr.   r.   r/   rf      s   z%FunnelAttentionStructure.relative_postensoraxisc                    s   |du rdS t  ttfr D ]}||}q|S t |ttfr/t| fdd|D S  |j;  jjrBjjrBt	dddnt	ddd}tt	dg  |g }jjrrtt	dg  t	ddg }t
j|| |g d}|| S )zT
        Perform pooling by stride slicing the tensor along the given axis.
        Nc                 3   s    | ]	} | V  qd S r2   )stride_pool.0xr   r+   r.   r/   	<genexpr>       z7FunnelAttentionStructure.stride_pool.<locals>.<genexpr>rX   r>   r   )r   )
isinstancelisttupler   typendimr   rK   r   slicer:   rb   )r+   r   r   ax
axis_slice	enc_slice	cls_slicer.   r   r/   r      s    
&z$FunnelAttentionStructure.stride_poolmeanmodec                    sz  du rdS t ttfrt fddD S jjrBjjr/ddddf n}tjddddf |gddj	}|dkrVddddddf n|dkrhdddddddf df d	kr{t
jjd
dn$ dkrt
jjd
dn dkrt
jj d
d ntd|dkrddddddf S |dkrdddf S S )z3Apply 1D pooling to a tensor of size [B x T (x H)].Nc                 3   s     | ]}j  d V  qdS ))r   r~   N)pool_tensorr   r   r+   r~   r   r.   r/   r      s    z7FunnelAttentionStructure.pool_tensor.<locals>.<genexpr>rX   r   rY   r>   r   r   T)r~   	ceil_modemaxminz0The supported modes are 'mean', 'max' and 'min'.r   )r   r   r   r   r   rK   r   r:   rb   r   r   rL   
avg_pool2d
max_pool2dNotImplementedError)r+   r   r   r~   suffixr   r.   r   r/   r      s2     "z$FunnelAttentionStructure.pool_tensorattention_inputsc                 C   s   |\}}}}| j jr6| j jdkr | |dd d|dd  }| |d}| |d}| j|| j jd}n3|  jd9  _| j jdkrI| |d}| |ddg}| |ddg}| j|dd}| j|| j jd}||||f}||fS )zTPool `output` and the proper parts of `attention_inputs` before the attention layer.rU   Nr>   r   r   r   r   )r   pool_q_onlyr\   r   r   pooling_typerB   )r+   outputr   rO   rP   rC   rQ   r.   r.   r/   pre_attention_pooling  s     z.FunnelAttentionStructure.pre_attention_poolingc                 C   s   |\}}}}| j jr:|  jd9  _| j jdkr'|dd | |dd d }| |d}| |d}| j|dd}||||f}|S )zFPool the proper parts of `attention_inputs` after the attention layer.r>   rU   Nr   r   r   r   )r   r   rB   r\   r   r   )r+   r   rO   rP   rC   rQ   r.   r.   r/   post_attention_pooling3  s    z/FunnelAttentionStructure.post_attention_poolingr5   Nr   )r   r>   )r7   r8   r9   __doc__r?   int__annotations__r   r   r:   r;   r   rR   rJ   rH   rI   r   rG   re   rf   r   strr   r   r   r<   r.   r.   r,   r/   r=   =   sd   
 

P 
"
&
&r=   positional_attncontext_lenr[   r   c                 C   sn   | j \}}}}t| ||||g} | d d d d |d d d f } t| ||||| g} | dd |f } | S )N.)shaper:   reshape)r   r   r[   
batch_sizen_headrF   max_rel_lenr.   r.   r/   _relative_shift_gatherA  s    r   c                       sz   e Zd Zdededdf fddZdddZdd	d
Z	ddej	dej	dej	de
ej	 dede
ej	df fddZ  ZS )FunnelRelMultiheadAttentionr   r|   r   Nc                    s*  t    || _|| _|j|j|j}}}t|j	| _	t|j
| _
tj||| dd| _t||| | _t||| | _tt||g| _tt||g| _tt|||g| _tt||g| _ttd||g| _t|| || _tj||jd| _d|d  | _d S )NF)biasr>   r   rV   g      ?)r   r   r   r|   r$   r   d_headr   r'   r(   attention_dropoutLinearq_headk_headv_head	Parameterr:   zerosr_w_biasr_r_biasr_kernelr_s_bias	seg_embed	post_projr#   r%   r&   scale)r+   r   r|   r$   r   r   r,   r.   r/   r   R  s"   
z$FunnelRelMultiheadAttention.__init__c                 C   s   | j jdkrA|\}}}}| j| j }	| j}
td||	 |
}||dddf  }||dddf  }td||td|| }n3|jd |krJdnd}|| j |d  }| j| j }| j}
td||
}td|| |}t	|||}|dur|||9 }|S )	z5Relative attention score for the positional encodingsrU   zbinh,dnh->bindNzbind,jd->bnijr   r>   ztd,dnh->tnhzbinh,tnh->bnit)
r   r\   r   r   r   r:   einsumr   r|   r   )r+   rO   r   r   rQ   rr   rt   rs   ru   uw_rq_r_attentionq_r_attention_1q_r_attention_2r   r[   rvr_headr.   r.   r/   relative_positional_attentioni  s(   z9FunnelRelMultiheadAttention.relative_positional_attentionc                 C   s   |du rdS |j \}}}| j| j }td|| | j}|dddf ||j d ||g}tj|ddd\}	}
t||
|j |	|j }|durO||9 }|S )z/Relative attention score for the token_type_idsNr   zbind,snd->bnisr>   r   rX   rY   )	r   r   r   r:   r   r   rg   splitwhere)r+   rP   r   rQ   r   rF   r   r   token_type_biasdiff_token_typesame_token_typetoken_type_attnr.   r.   r/   relative_token_type_attention  s   $z9FunnelRelMultiheadAttention.relative_token_type_attentionFquerykeyvaluer   output_attentions.c                 C   sj  |\}}}}	|j \}
}}|j d }| jj| jj}}| ||
|||}| ||
|||}| ||
|||}|| j }| j	| j }t
d|| |}| ||||	}| |||	}|| | }|j}| }|d ur|td|d d d d f     }t
j|d|d}| |}t
d||}| ||
||| }| |}| || }|r||fS |fS )Nr   zbind,bjnd->bnijrX   )rZ   rH   zbnij,bjnd->bind)r   r   r   r   r   viewr   r   r   r   r:   r   r   r   rH   floatINFsoftmaxr   r   r   r(   r&   )r+   r   r   r   r   r   rO   rP   rC   rQ   r   rF   _r   r   r   r   r   r   r   content_scorer   r   
attn_scorerH   	attn_probattn_vecattn_outr   r.   r.   r/   r4     s0   


"

z#FunnelRelMultiheadAttention.forwardr2   F)r7   r8   r9   r   r   r   r   r   r:   r;   r   boolr4   r<   r.   r.   r,   r/   r   Q  s$    

*r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	FunnelPositionwiseFFNr   r   Nc                    sl   t    t|j|j| _t|j | _	t
|j| _t|j|j| _t
|j| _t|j|j| _d S r2   )r   r   r   r   r$   d_innerlinear_1r	   
hidden_actactivation_functionr'   activation_dropoutlinear_2r(   r)   r#   r%   r&   r*   r,   r.   r/   r     s   
zFunnelPositionwiseFFN.__init__hiddenc                 C   s@   |  |}| |}| |}| |}| |}| || S r2   )r   r   r   r   r)   r&   )r+   r   hr.   r.   r/   r4     s   




zFunnelPositionwiseFFN.forwardr6   r.   r.   r,   r/   r     s    	r   c                       sR   e Zd Zdededdf fddZ	ddejd	ejd
ejdede	f
ddZ
  ZS )FunnelLayerr   r|   r   Nc                    s$   t    t||| _t|| _d S r2   )r   r   r   	attentionr   ffn)r+   r   r|   r,   r.   r/   r     s   
zFunnelLayer.__init__Fr   r   r   r   c                 C   s8   | j |||||d}| |d }|r||d fS |fS )Nr   r   r   )r   r   )r+   r   r   r   r   r   attnr   r.   r.   r/   r4     s   zFunnelLayer.forwardr   )r7   r8   r9   r   r   r   r:   r;   r   r   r4   r<   r.   r.   r,   r/   r     s    r   c                       sj   e Zd Zdeddf fddZ					ddejd	ejdB d
ejdB dedededee	B fddZ
  ZS )FunnelEncoderr   r   Nc                    >   t     | _t | _t fddt jD | _	d S )Nc                    s.   g | ]\ }t  fd dt|D qS )c                    s   g | ]}t  qS r.   r   r   r   )r|   r   r.   r/   
<listcomp>      z5FunnelEncoder.__init__.<locals>.<listcomp>.<listcomp>)r   
ModuleListrc   )r   
block_sizer   )r|   r/   r    s    z*FunnelEncoder.__init__.<locals>.<listcomp>)
r   r   r   r=   attention_structurer   r  	enumerateblock_sizesblocksr*   r,   r  r/   r     s   



zFunnelEncoder.__init__FTr1   rC   rD   r   output_hidden_statesreturn_dictc              
   C   sf  | |}| jj|||d}|}|r|fnd }	|rdnd }
t| jD ]{\}}|d| jjr0dndk}|o8|dk}|rD| j||\}}t|D ]T\}}t	| jj
| D ]G}|dko_|dko_|}|ro|}| jjrj|n| }}n| } }}||||||d}|d }|r| j|}|r|
|dd   }
|r|	|f }	qTqHq"|stdd ||	|
fD S t||	|
d	S )
NrC   rD   r.   r   r>   r   r   c                 s       | ]	}|d ur|V  qd S r2   r.   r   r   r.   r.   r/   r   B  r   z(FunnelEncoder.forward.<locals>.<genexpr>last_hidden_statehidden_states
attentions)type_asr  rR   r  r
  rE   r   rK   r   rc   block_repeatsr   r   r   r
   )r+   r1   rC   rD   r   r  r  r   r   all_hidden_statesall_attentionsr|   blockpooling_flagpooled_hiddenlayer_indexlayerrepeat_index
do_poolingr   r   r   layer_outputr.   r.   r/   r4     sJ   


zFunnelEncoder.forwardNNFFTr7   r8   r9   r   r   r:   r;   r   r   r
   r4   r<   r.   r.   r,   r/   r     s,    r   TFr   r~   
target_lenrK   r   c              	   C   s   |dkr| S |r| ddddf }| ddddf } t j| |dd}|rN|r7tj|ddd|d ddf}|ddd|d f }t j||gdd}|S |ddd|f }|S )z{
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
    r   N)repeatsrZ   r   rY   )r:   repeat_interleaver   rL   rM   rb   )r   r~   r"  rK   r   clsr   r.   r.   r/   upsampleF  s   r&  c                       sp   e Zd Zdeddf fddZ					ddejd	ejd
ejdB dejdB dedededee	B fddZ
  ZS )FunnelDecoderr   r   Nc                    r   )Nc                    s   g | ]}t  d qS )r   r   r  r  r.   r/   r  a  r  z*FunnelDecoder.__init__.<locals>.<listcomp>)
r   r   r   r=   r  r   r  rc   num_decoder_layerslayersr*   r,   r  r/   r   ]  s   

$zFunnelDecoder.__init__FTfinal_hiddenfirst_block_hiddenrC   rD   r   r  r  c                 C   s   t |dt| jjd  |jd | jj| jjd}|| }	|r!|	fnd }
|r'dnd }| jj|	||d}| j	D ] }||	|	|	||d}|d }	|rN||dd   }|rU|
|	f }
q5|sdt
dd	 |	|
|fD S t|	|
|d
S )Nr>   r   )r~   r"  rK   r   r.   r  r   r   c                 s   r  r2   r.   r  r.   r.   r/   r     r   z(FunnelDecoder.forward.<locals>.<genexpr>r  )r&  r   r   r	  r   rK   r   r  rR   r)  r   r
   )r+   r*  r+  rC   rD   r   r  r  upsampled_hiddenr   r  r  r   r  r  r.   r.   r/   r4   c  s4   


zFunnelDecoder.forwardr   r!  r.   r.   r,   r/   r'  \  s0    
	r'  c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
FunnelDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.r   r   Nc                    s6   t    || _t|j|j| _t|jd| _d S r   )r   r   r   r   r   r$   densedense_predictionr*   r,   r.   r/   r     s   
z'FunnelDiscriminatorPredictions.__init__discriminator_hidden_statesc                 C   s.   |  |}t| jj |}| |d}|S )NrX   )r.  r	   r   r   r/  squeeze)r+   r0  r  logitsr.   r.   r/   r4     s   
z&FunnelDiscriminatorPredictions.forward)
r7   r8   r9   r   r   r   r:   r;   r4   r<   r.   r.   r,   r/   r-    s    r-  c                   @   s*   e Zd ZU eed< dZe dd ZdS )FunnelPreTrainedModelr   funnelc                 C   sd  |j j}|ddkrJt|dd d ur7| jjd u r+|jj\}}t	dt
||  }n| jj}tj|j|d t|dd d urHt|jd d S d S |dkrtj|j| jjd	 tj|j| jjd	 tj|j| jjd	 tj|j| jjd	 tj|j| jjd	 d S |d
kr| jjd u rdn| jj}tj|jj|d |jjd urt|jj|jj  d S d S d S )Nr   rX   weightrV   )stdr   g        r   )br   )r-   r7   findgetattrr   initializer_stdr5  r   npsqrtr   initnormal_	constant_r   uniform_r   initializer_ranger   r   r   r   r"   r   zeros_)r+   module	classnamefan_outfan_inr6  r.   r.   r/   _init_weights  s0   z#FunnelPreTrainedModel._init_weightsN)	r7   r8   r9   r   r   base_model_prefixr:   no_gradrG  r.   r.   r.   r/   r3    s
   
 r3  c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  Z	S )
FunnelClassificationHeadr   n_labelsr   Nc                    s>   t    t|j|j| _t|j| _t|j|| _	d S r2   )
r   r   r   r   r$   linear_hiddenr'   r(   r)   
linear_out)r+   r   rK  r,   r.   r/   r     s   
z!FunnelClassificationHead.__init__r   c                 C   s(   |  |}t|}| |}| |S r2   )rL  r:   tanhr)   rM  )r+   r   r.   r.   r/   r4     s   



z FunnelClassificationHead.forward)
r7   r8   r9   r   r   r   r:   r;   r4   r<   r.   r.   r,   r/   rJ    s    rJ  z2
    Output type of [`FunnelForPreTraining`].
    )custom_introc                   @   sb   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dS )FunnelForPreTrainingOutputa1  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss of the ELECTRA-style objective.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    Nlossr2  r  r  )r7   r8   r9   r   rQ  r:   FloatTensorr   r2  r  r   r  r.   r.   r.   r/   rP    s   
 rP  z
    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
    decoder) or any task-specific head on top.
    c                       s   e Zd Zdeddf fddZdejfddZdejddfd	d
Ze									dde
jdB de
jdB de
jdB de
jdB de
jdB dedB dedB dedB deeB fddZ  ZS )FunnelBaseModelr   r   Nc                    ,   t  | t|| _t|| _|   d S r2   )r   r   r   r3   r   encoder	post_initr*   r,   r.   r/   r     s   

zFunnelBaseModel.__init__c                 C      | j jS r2   r3   r"   r+   r.   r.   r/   get_input_embeddings     z$FunnelBaseModel.get_input_embeddingsnew_embeddingsc                 C      || j _d S r2   rX  r+   r\  r.   r.   r/   set_input_embeddings     z$FunnelBaseModel.set_input_embeddingsr0   rC   rD   position_idsr1   r   r  r  c	                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}|d u rltj
|
tj|d}| j||d}| j||||||d}|S )NDYou cannot specify both input_ids and inputs_embeds at the same timerX   5You have to specify either input_ids or inputs_embedsrI   rW   r1   rC   rD   r   r  r  )r   r   r  use_return_dict
ValueError%warn_if_padding_and_no_attention_maskrE   rI   r:   onesr   r   r3   rU  )r+   r0   rC   rD   ra  r1   r   r  r  kwargsinput_shaperI   encoder_outputsr.   r.   r/   r4     s6   
	zFunnelBaseModel.forwardNNNNNNNNr7   r8   r9   r   r   r   r   rZ  r_  r   r:   r;   r   r   r
   r4   r<   r.   r.   r,   r/   rS    s@    		rS  c                       s   e Zd Zdeddf fddZdejfddZdejddfd	d
Ze								dde
jdB de
jdB de
jdB de
jdB dedB dedB dedB deeB fddZ  ZS )FunnelModelr   r   Nc                    s<   t  | || _t|| _t|| _t|| _| 	  d S r2   )
r   r   r   r   r3   r   rU  r'  decoderrV  r*   r,   r.   r/   r   (  s   


zFunnelModel.__init__c                 C   rW  r2   rX  rY  r.   r.   r/   rZ  2  r[  z FunnelModel.get_input_embeddingsr\  c                 C   r]  r2   rX  r^  r.   r.   r/   r_  5  r`  z FunnelModel.set_input_embeddingsr0   rC   rD   r1   r   r  r  c              	   K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }	n|d urF| d d }	ntd|d urQ|jn|j}
|d u r_tj	|	|
d}|d u rltj
|	tj|
d}| j||d}| j||||d|d}| j|d	 |d
 | j jd	  |||||d}|sd	}|d	 f}|r|d
7 }||d
 ||  f }|r|d
7 }||d ||  f }|S t|d	 |r|j|j nd |r|j|j dS d dS )Nrb  rX   rc  rd  rW   re  Trf  r   r   )r*  r+  rC   rD   r   r  r  r>   r  )r   r   r  rg  rh  ri  rE   rI   r:   rj  r   r   r3   rU  rq  r	  r
   r  r  )r+   r0   rC   rD   r1   r   r  r  rk  rl  rI   rm  decoder_outputsidxoutputsr.   r.   r/   r4   8  sl   
	

zFunnelModel.forward)NNNNNNNro  r.   r.   r,   r/   rp  &  s:    

rp  z
    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
    generated tokens.
    c                          e Zd Zdeddf fddZe								ddejdB dejdB dejdB d	ejdB d
ejdB dedB dedB dedB de	e
B fddZ  ZS )FunnelForPreTrainingr   r   Nc                    rT  r2   )r   r   rp  r4  r-  discriminator_predictionsrV  r*   r,   r.   r/   r     s   

zFunnelForPreTraining.__init__r0   rC   rD   r1   labelsr   r  r  c	              	   K   s   |dur|n| j j}| j|||||||d}
|
d }| |}d}|dur]t }|durO|d|jd dk}|d|jd | }|| }||| }n||d|jd | }|ss|f|
dd  }|durq|f| S |S t	|||
j
|
jdS )a"  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
            docstring) Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, FunnelForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> logits = model(**inputs).logits
        ```NrC   rD   r1   r   r  r  r   rX   r   rQ  r2  r  r  )r   rg  r4  rw  r   r   r   r   r   rP  r  r  )r+   r0   rC   rD   r1   rx  r   r  r  rk  r0  discriminator_sequence_outputr2  rQ  loss_fctactive_lossactive_logitsactive_labelsr   r.   r.   r/   r4     s<   !	
zFunnelForPreTraining.forwardrn  )r7   r8   r9   r   r   r   r:   r;   r   r   rP  r4   r<   r.   r.   r,   r/   rv    s<    	rv  c                       s   e Zd ZddiZdeddf fddZdejfdd	Zd
ej	ddfddZ
e								ddejdB dejdB dejdB dejdB dejdB dedB dedB dedB deeB fddZ  ZS )FunnelForMaskedLMzlm_head.weightz(funnel.embeddings.word_embeddings.weightr   r   Nc                    s4   t  | t|| _t|j|j| _| 	  d S r2   )
r   r   rp  r4  r   r   r$   r   lm_headrV  r*   r,   r.   r/   r     s   
zFunnelForMaskedLM.__init__c                 C   s   | j S r2   r  rY  r.   r.   r/   get_output_embeddings  s   z'FunnelForMaskedLM.get_output_embeddingsr\  c                 C   s
   || _ d S r2   r  r^  r.   r.   r/   set_output_embeddings  s   
z'FunnelForMaskedLM.set_output_embeddingsr0   rC   rD   r1   rx  r   r  r  c	              	   K   s   |dur|n| j j}| j|||||||d}
|
d }| |}d}|dur6t }||d| j j|d}|sL|f|
dd  }|durJ|f| S |S t|||
j|
j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nry  r   rX   r   rz  )
r   rg  r4  r  r   r   r   r   r  r  )r+   r0   rC   rD   r1   rx  r   r  r  rk  rt  r  prediction_logitsmasked_lm_lossr|  r   r.   r.   r/   r4     s2   

zFunnelForMaskedLM.forwardrn  )r7   r8   r9   _tied_weights_keysr   r   r   r   r  r   r  r   r:   r;   r   r   r   r4   r<   r.   r.   r,   r/   r    sB    		r  z
    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
    first timestep of the last hidden state) e.g. for GLUE tasks.
    c                       ru  )FunnelForSequenceClassificationr   r   Nc                    s>   t  | |j| _|| _t|| _t||j| _|   d S r2   )	r   r   
num_labelsr   rS  r4  rJ  
classifierrV  r*   r,   r.   r/   r   '  s   
z(FunnelForSequenceClassification.__init__r0   rC   rD   r1   rx  r   r  r  c	              	   K   st  |dur|n| j j}| j|||||||d}
|
d }|dddf }| |}d}|dur| j jdu rW| jdkr=d| j _n| jdkrS|jtjksN|jtj	krSd| j _nd| j _| j jdkrut
 }| jdkro|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|s|f|
dd  }|dur|f| S |S t|||
j|
jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nry  r   r   
regressionsingle_label_classificationmulti_label_classificationrX   rz  )r   rg  r4  r  problem_typer  rH   r:   r   r   r   r1  r   r   r   r   r  r  )r+   r0   rC   rD   r1   rx  r   r  r  rk  rt  r  pooled_outputr2  rQ  r|  r   r.   r.   r/   r4   1  sR   



"


z'FunnelForSequenceClassification.forwardrn  )r7   r8   r9   r   r   r   r:   r;   r   r   r   r4   r<   r.   r.   r,   r/   r     s<    
	r  c                       ru  )FunnelForMultipleChoicer   r   Nc                    s.   t  | t|| _t|d| _|   d S r   )r   r   rS  r4  rJ  r  rV  r*   r,   r.   r/   r   y  s   
z FunnelForMultipleChoice.__init__r0   rC   rD   r1   rx  r   r  r  c	              	   K   sR  |dur|n| j j}|dur|jd n|jd }
|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durV|d|d|dnd}| j|||||||d}|d }|dddf }| |}|d|
}d}|durt }|||}|s|f|dd  }|dur|f| S |S t|||j	|j
dS )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   rX   ry  r   rz  )r   rg  r   r   rE   r4  r  r   r   r  r  )r+   r0   rC   rD   r1   rx  r   r  r  rk  num_choicesrt  r  r  r2  reshaped_logitsrQ  r|  r   r.   r.   r/   r4     sF   


zFunnelForMultipleChoice.forwardrn  )r7   r8   r9   r   r   r   r:   r;   r   r   r   r4   r<   r.   r.   r,   r/   r  w  s<    	r  c                       ru  )FunnelForTokenClassificationr   r   Nc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r2   )r   r   r  rp  r4  r   r'   r(   r)   r   r    r  rV  r*   r,   r.   r/   r     s   
z%FunnelForTokenClassification.__init__r0   rC   rD   r1   rx  r   r  r  c	              	   K   s   |dur|n| j j}| j|||||||d}
|
d }| |}| |}d}|dur:t }||d| j|d}|sP|f|
dd  }|durN|f| S |S t|||
j	|
j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nry  r   rX   r   rz  )r   rg  r4  r)   r  r   r   r  r   r  r  )r+   r0   rC   rD   r1   rx  r   r  r  rk  rt  r  r2  rQ  r|  r   r.   r.   r/   r4     s4   


z$FunnelForTokenClassification.forwardrn  )r7   r8   r9   r   r   r   r:   r;   r   r   r   r4   r<   r.   r.   r,   r/   r    s<    	r  c                       s   e Zd Zdeddf fddZe									ddejdB dejdB dejdB d	ejdB d
ejdB dejdB dedB dedB dedB de	e
B fddZ  ZS )FunnelForQuestionAnsweringr   r   Nc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r2   )
r   r   r  rp  r4  r   r   r    
qa_outputsrV  r*   r,   r.   r/   r     s
   
z#FunnelForQuestionAnswering.__init__r0   rC   rD   r1   start_positionsend_positionsr   r  r  c
              	   K   sD  |	d ur|	n| j j}	| j|||||||	d}|d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrM|	d}t| dkrZ|d}|d}|
d|}|
d|}t|d}|||}|||}|| d }|	s||f|dd   }|d ur|f| S |S t||||j|jdS )	Nry  r   r   rX   rY   )ignore_indexr>   )rQ  start_logits
end_logitsr  r  )r   rg  r4  r  r   r1  
contiguousr   rE   squezeclampr   r   r  r  )r+   r0   rC   rD   r1   r  r  r   r  r  rk  rt  r  r2  r  r  
total_lossignored_indexr|  
start_lossend_lossr   r.   r.   r/   r4     sL   







z"FunnelForQuestionAnswering.forward)	NNNNNNNNN)r7   r8   r9   r   r   r   r:   r;   r   r   r   r4   r<   r.   r.   r,   r/   r    sB    
	
r  )	rS  r  r  rv  r  r  r  rp  r3  )TF)=r   dataclassesr   numpyr;  r:   r   torch.nnr   r   r    r   r=  activationsr	   modeling_outputsr
   r   r   r   r   r   modeling_utilsr   utilsr   r   r   configuration_funnelr   
get_loggerr7   loggerr   Moduler   r=   r;   r   r   r   r   r   r   r   r&  r'  r-  r3  rJ  rP  rS  rp  rv  r  r  r  r  r  __all__r.   r.   r.   r/   <module>   s    
   @
1B]PEQH>J