o
    	۷i                     @   s  d Z ddlZddlmZ ddlmZmZ ddlZddl	Z	ddl	m
Z
 ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZmZ ddlmZ e e!Z"dZ#dd Z$G dd de
j%Z&G dd de
j%Z'de	j(de)de)de	j(fddZ*G dd de
j%Z+G dd de
j%Z,G dd  d e
j%Z-G d!d" d"e
j%Z.	$dLd%e	j(d&e)d'e)d(e/d)e/de	j(fd*d+Z0G d,d- d-e
j%Z1G d.d/ d/e
j%Z2eG d0d1 d1eZ3G d2d3 d3e
j%Z4eed4d5G d6d7 d7eZ5ed8d5G d9d: d:e3Z6eG d;d< d<e3Z7ed=d5G d>d? d?e3Z8eG d@dA dAe3Z9edBd5G dCdD dDe3Z:eG dEdF dFe3Z;eG dGdH dHe3Z<eG dIdJ dJe3Z=g dKZ>dS )Mz!PyTorch Funnel Transformer model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )FunnelConfigg    .Ac                 C   s  zddl }ddl}ddl}W n ty   td  w tj|}t	d|  |j
|}g }g }	|D ] \}
}t	d|
 d|  |j
||
}||
 |	| q6ddd	d
ddddddddddd}t||	D ]\}
}|
d}
tdd |
D rt	dd|
  qm|
d dkrqm| }d}|
dd D ]}t|ts|d|rt|d| d }||jk rd}||j| kr||j| 8 }|d7 }||j| ks|j| | }q||j8 }|j| }q|dkrt|tr|j} n.||v rt||| }qzt||}W q ty#   t dd|
 |j! d}Y  nw |sHt"|j!t"|j!kr8|#|j!}|dkrB|$|}t%&||_'qm| S ) z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape k_headq_headv_head	post_projlinear_1linear_2	attentionffnweightbiasword_embeddings
embeddings)kqvolayer_1layer_2rel_attnffkernelgammabetalookup_tableword_embeddinginput/c                 s   s    | ]}|d v V  qdS ))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0nr7   r7   `/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/funnel/modeling_funnel.py	<genexpr>\   s
    
z,load_tf_weights_in_funnel.<locals>.<genexpr>z	Skipping 	generatorFr   z	layer_\d+zlayer_(\d+)rTr+   )(renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipsplitanyjoin
isinstanceFunnelPositionwiseFFN	fullmatchintsearchgroupsnum_hidden_layersblock_sizesblockslayersFunnelRelMultiheadAttentionr_kernelgetattrAttributeErrorprintshapelenreshape	transposetorch
from_numpydata)modelconfigtf_checkpoint_pathr>   nptftf_path	init_varsnamesarraysnamer_   array
_layer_mappointerskippedm_namelayer_index	block_idxr7   r7   r:   load_tf_weights_in_funnel.   s   






rw   c                       sN   e Zd Zdeddf fddZ	d
deej deej dejfdd	Z  Z	S )FunnelEmbeddingsrg   returnNc                    sH   t    tj|j|j|jd| _tj|j	|j
d| _t|j| _d S )N)padding_idxeps)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idr!   	LayerNormd_modellayer_norm_eps
layer_normDropouthidden_dropoutdropoutselfrg   	__class__r7   r:   r~      s   
zFunnelEmbeddings.__init__	input_idsinputs_embedsc                 C   s*   |d u r	|  |}| |}| |}|S N)r!   r   r   )r   r   r   r"   r7   r7   r:   forward   s
   


zFunnelEmbeddings.forwardNN)
__name__
__module____qualname__r   r~   r   rc   Tensorr   __classcell__r7   r7   r   r:   rx      s    rx   c                       s  e Zd ZU dZdZeed< deddf fddZ		d,d	e	j
d
ee	j
 dee	j
 dee	j
 fddZde	j
de	j
fddZdede	jde	jdeee	j
 eee	j
  f fddZde	j
defddZd-de	j
dedede	j
fddZdee	j
ee	j
 ee	j
 f d eeee ee f de	j
fd!d"Z	d.dee	j
ee	j
 ee	j
 f d$edede	j
fd%d&Zd'ee	j
 dee	j
ee	j
 f fd(d)Zd'ee	j
 dee	j
 fd*d+Z  ZS )/FunnelAttentionStructurez>
    Contains helpers for `FunnelRelMultiheadAttention `.
       cls_token_type_idrg   ry   Nc                    s6   t    || _t|j| _t|j| _d | _d S r   )	r}   r~   rg   r   r   r   sin_dropoutcos_dropoutpooling_multr   r   r7   r:   r~      s
   

z!FunnelAttentionStructure.__init__r   attention_masktoken_type_idsc                 C   sv   d| _ |d | _}| ||j|j}|dur| |nd}| jjr3t	j
||d |d gdnd}||||fS )zCReturns the attention inputs associated to the inputs of the model.r   N)r   r   r   r   )r   sizeseq_lenget_position_embedsdtypedevicetoken_type_ids_to_matrg   separate_clsr   
functionalpadnew_ones)r   r   r   r   r   position_embedstoken_type_matcls_maskr7   r7   r:   init_attention_inputs   s   	"z.FunnelAttentionStructure.init_attention_inputsc                 C   s^   |dddddf |dddf k}|| j k}|dddddf |dddf B }||B S )z-Convert `token_type_ids` to `token_type_mat`.N)r   )r   r   r   cls_idscls_matr7   r7   r:   r      s   &
&z.FunnelAttentionStructure.token_type_ids_to_matr   r   r   c                 C   s  | j j}| j jdkr}tjd|dtj|d|}tjd|d dtj|d|}dd||d    }|dddf |d  }t|}	| |	}
t	|}| 
|}tj|
|
gd	d
}tj||	gd	d
}tj||gd	d
}tj|	 |gd	d
}||||fS tjd|d dtj|d|}dd||d    }tj| d |d dtj|d|}|d }|dddf |d  }| t|}	| 
t	|}tj|	|gd	d
}tjd|tj|d|}|}g }td| j jD ]e}|dkrd}n/| ||}d|d  }| j|||dd}|dddf | }||d|}t|d|}|}d| }| ||}|dddf | }||d|}t|d|}|||g q|S )a  
        Create and cache inputs related to relative position encoding. Those are very different depending on whether we
        are using the factorized or the relative shift attention:

        For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
        final formula.

        For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
        formula.

        Paper link: https://huggingface.co/papers/2006.03236
        
factorizedr         ?r   r   r   r   i'  Ndim)shift)rg   r   attention_typerc   arangeint64tosinr   cosr   catrange
num_blocksstride_pool_posrelative_posexpandr   gatherrK   )r   r   r   r   r   pos_seqfreq_seqinv_freqsinusoid	sin_embedsin_embed_d	cos_embedcos_embed_dphipsipiomega
rel_pos_idzero_offset	pos_embedpos
pooled_posposition_embeds_listblock_indexposition_embeds_poolingstriderel_posposition_embeds_no_poolingr7   r7   r:   r      sV    



 &z,FunnelAttentionStructure.get_position_embedspos_idr   c                 C   sf   | j jr,|d|  d g}| j jr|dd n|dd }t||ddd gdS |ddd S )ze
        Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`).
        r   r   r   Nr   )rg   r   
new_tensortruncate_seqrc   r   )r   r   r   cls_pospooled_pos_idr7   r7   r:   r     s
    z(FunnelAttentionStructure.stride_pool_posr   r   r   r   c           	      C   sb   |du r|}|d |d  }|t | }|||  }|d |d  }tj||d | tj|jdS )zV
        Build the relative positional vector between `pos` and `pooled_pos`.
        Nr   r   r   r   )r`   rc   r   longr   )	r   r   r   r   r   	ref_point
num_removemax_distmin_distr7   r7   r:   r   $  s   z%FunnelAttentionStructure.relative_postensoraxisc                    s   |du rdS t  ttfr D ]}||}q|S t |ttfr/t| fdd|D S  |j;  jjrBjjrBt	dddnt	ddd}t	dg  |g }jjrnt	dg  t	ddg }t
j|| |g d}|| S )zT
        Perform pooling by stride slicing the tensor along the given axis.
        Nc                 3   s    | ]	} | V  qd S r   )stride_poolr8   xr   r   r7   r:   r;   E      z7FunnelAttentionStructure.stride_pool.<locals>.<genexpr>r   r   r   )r   )rP   listtupler   typendimrg   r   r   slicerc   r   )r   r   r   ax
axis_slice	enc_slice	cls_slicer7   r   r:   r   2  s    
&z$FunnelAttentionStructure.stride_poolmeanmodec                    sz  du rdS t ttfrt fddD S jjrBjjr/ddddf n}tjddddf |gddj	}|dkrVddddddf n|dkrhdddddddf df d	kr{t
jjd
dn$ dkrt
jjd
dn dkrt
jj d
d ntd|dkrddddddf S |dkrdddf S S )z3Apply 1D pooling to a tensor of size [B x T (x H)].Nc                 3   s     | ]}j  d V  qdS ))r   r   N)pool_tensorr   r   r   r   r   r7   r:   r;   \  s    z7FunnelAttentionStructure.pool_tensor.<locals>.<genexpr>r   r   r   r   r	   r   T)r   	ceil_modemaxminz0The supported modes are 'mean', 'max' and 'min'.r   )rP   r   r   r   rg   r   r   rc   r   r   r   r   
avg_pool2d
max_pool2dNotImplementedError)r   r   r   r   suffixr   r7   r   r:   r   S  s2     "z$FunnelAttentionStructure.pool_tensorattention_inputsc                 C   s   |\}}}}| j jr6| j jdkr | |dd d|dd  }| |d}| |d}| j|| j jd}n3|  jd9  _| j jdkrI| |d}| |ddg}| |ddg}| j|dd}| j|| j jd}||||f}||fS )zTPool `output` and the proper parts of `attention_inputs` before the attention layer.r   Nr   r   r   r   r   )rg   pool_q_onlyr   r   r   pooling_typer   )r   outputr   r   r   r   r   r7   r7   r:   pre_attention_poolingy  s     z.FunnelAttentionStructure.pre_attention_poolingc                 C   s   |\}}}}| j jr:|  jd9  _| j jdkr'|dd | |dd d }| |d}| |d}| j|dd}||||f}|S )zFPool the proper parts of `attention_inputs` after the attention layer.r   r   Nr   r   r   r  )rg   r  r   r   r   r   )r   r   r   r   r   r   r7   r7   r:   post_attention_pooling  s    z/FunnelAttentionStructure.post_attention_poolingr   Nr   )r   r   )r   r   r   __doc__r   rS   __annotations__r   r~   rc   r   r   r   r   r   r   r   r   r   r   r   r   r   strr   r  r  r   r7   r7   r   r:   r      sd   
 

P 
"
&
&r   positional_attncontext_lenr   ry   c                 C   sn   | j \}}}}t| ||||g} | d d d d |d d d f } t| ||||| g} | dd |f } | S )N.)r_   rc   ra   )r  r  r   
batch_sizen_headr   max_rel_lenr7   r7   r:   _relative_shift_gather  s    r  c                       sz   e Zd Zdededdf fddZdddZdd	d
Z	ddej	dej	dej	de
ej	 dede
ej	df fddZ  ZS )rZ   rg   r   ry   Nc                    s*  t    || _|| _|j|j|j}}}t|j	| _	t|j
| _
tj||| dd| _t||| | _t||| | _tt||g| _tt||g| _tt|||g| _tt||g| _ttd||g| _t|| || _tj||jd| _d|d  | _d S )NF)r    r   r{   r   g      ?)r}   r~   rg   r   r   r  d_headr   r   r   attention_dropoutLinearr   r   r   	Parameterrc   zerosr_w_biasr_r_biasr[   r_s_bias	seg_embedr   r   r   r   scale)r   rg   r   r   r  r  r   r7   r:   r~     s"   
z$FunnelRelMultiheadAttention.__init__c                 C   s   | j jdkrA|\}}}}| j| j }	| j}
td||	 |
}||dddf  }||dddf  }td||td|| }n3|jd |krJdnd}|| j |d  }| j| j }| j}
td||
}td|| |}t	|||}|dur|||9 }|S )	z5Relative attention score for the positional encodingsr   zbinh,dnh->bindNzbind,jd->bnijr   r   ztd,dnh->tnhzbinh,tnh->bnit)
rg   r   r  r  r[   rc   einsumr_   r   r  )r   r   r   r  r   r   r   r   r   uw_rq_r_attentionq_r_attention_1q_r_attention_2r  r   r=   r%   r_headr7   r7   r:   relative_positional_attention  s(   z9FunnelRelMultiheadAttention.relative_positional_attentionc                 C   s   |du rdS |j \}}}| j| j }td|| | j}|dddf ||j d ||g}tj|ddd\}	}
t||
|j |	|j }|durO||9 }|S )z/Relative attention score for the token_type_idsNr   zbind,snd->bnisr   r   r   r   )	r_   r  r  rc   r  r  r   rM   where)r   r   r   r   r  r   r  r  token_type_biasdiff_token_typesame_token_typetoken_type_attnr7   r7   r:   relative_token_type_attention  s   $z9FunnelRelMultiheadAttention.relative_token_type_attentionFquerykeyvaluer   output_attentions.c                 C   sj  |\}}}}	|j \}
}}|j d }| jj| jj}}| ||
|||}| ||
|||}| ||
|||}|| j }| j	| j }t
d|| |}| ||||	}| |||	}|| | }|j}| }|d ur|td|d d d d f     }t
j|d|d}| |}t
d||}| ||
||| }| |}| || }|r||fS |fS )Nr   zbind,bjnd->bnijr   )r   r   zbnij,bjnd->bind)r_   rg   r  r  r   viewr   r   r  r  rc   r  r"  r(  r   floatINFsoftmaxr  r   ra   r   r   )r   r)  r*  r+  r   r,  r   r   r   r   r  r   _r  r  r  r   r   r   r  content_scorer  r'  
attn_scorer   	attn_probattn_vecattn_outr  r7   r7   r:   r     s0   


"

z#FunnelRelMultiheadAttention.forwardr   F)r   r   r   r   rS   r~   r"  r(  rc   r   r   boolr   r   r7   r7   r   r:   rZ     s$    

*rZ   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	rQ   rg   ry   Nc                    sl   t    t|j|j| _t|j | _	t
|j| _t|j|j| _t
|j| _t|j|j| _d S r   )r}   r~   r   r  r   d_innerr   r
   
hidden_actactivation_functionr   activation_dropoutr   r   r   r   r   r   r   r   r7   r:   r~   >  s   
zFunnelPositionwiseFFN.__init__hiddenc                 C   s@   |  |}| |}| |}| |}| |}| || S r   )r   r;  r<  r   r   r   )r   r=  hr7   r7   r:   r   G  s   




zFunnelPositionwiseFFN.forward)	r   r   r   r   r~   rc   r   r   r   r7   r7   r   r:   rQ   =  s    	rQ   c                       sR   e Zd Zdededdf fddZ	ddejd	ejd
ejdede	f
ddZ
  ZS )FunnelLayerrg   r   ry   Nc                    s$   t    t||| _t|| _d S r   )r}   r~   rZ   r   rQ   r   )r   rg   r   r   r7   r:   r~   Q  s   
zFunnelLayer.__init__Fr)  r*  r+  r,  c                 C   s8   | j |||||d}| |d }|r||d fS |fS )Nr,  r   r   )r   r   )r   r)  r*  r+  r   r,  attnr  r7   r7   r:   r   V  s   zFunnelLayer.forwardr7  )r   r   r   r   rS   r~   rc   r   r8  r   r   r   r7   r7   r   r:   r?  P  s    r?  c                       sn   e Zd Zdeddf fddZ					ddejd	eej d
eej dededede	e
ef fddZ  ZS )FunnelEncoderrg   ry   Nc                    >   t     | _t | _t fddt jD | _	d S )Nc                    s.   g | ]\ }t  fd dt|D qS )c                    s   g | ]}t  qS r7   r?  r8   r1  )r   rg   r7   r:   
<listcomp>j      z5FunnelEncoder.__init__.<locals>.<listcomp>.<listcomp>)r   
ModuleListr   )r8   
block_sizerg   )r   r:   rF  i  s    z*FunnelEncoder.__init__.<locals>.<listcomp>)
r}   r~   rg   r   attention_structurer   rH  	enumeraterW   rX   r   r   rJ  r:   r~   d  s   



zFunnelEncoder.__init__FTr   r   r   r,  output_hidden_statesreturn_dictc              
   C   sf  | |}| jj|||d}|}|r|fnd }	|rdnd }
t| jD ]{\}}|d| jjr0dndk}|o8|dk}|rD| j||\}}t|D ]T\}}t	| jj
| D ]G}|dko_|dko_|}|ro|}| jjrj|n| }}n| } }}||||||d}|d }|r| j|}|r|
|dd   }
|r|	|f }	qTqHq"|stdd ||	|
fD S t||	|
d	S )
Nr   r   r7   r   r   r   r@  c                 s       | ]	}|d ur|V  qd S r   r7   r8   r%   r7   r7   r:   r;     r   z(FunnelEncoder.forward.<locals>.<genexpr>last_hidden_statehidden_states
attentions)type_asrK  r   rL  rX   r   rg   r   r  r   block_repeatsr  r  r   r   )r   r   r   r   r,  rM  rN  r   r=  all_hidden_statesall_attentionsr   blockpooling_flagpooled_hiddenru   layerrepeat_index
do_poolingr)  r*  r+  layer_outputr7   r7   r:   r   o  sJ   


zFunnelEncoder.forwardNNFFTr   r   r   r   r~   rc   r   r   r8  r   r   r   r   r   r7   r7   r   r:   rB  c  s,    
rB  TFr   r   
target_lenr   r   c              	   C   s   |dkr| S |r| ddddf }| ddddf } t j| |dd}|rN|r7tj|ddd|d ddf}|ddd|d f }t j||gdd}|S |ddd|f }|S )z{
    Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension.
    r   N)repeatsr   r   r   )rc   repeat_interleaver   r   r   r   )r   r   rc  r   r   clsr  r7   r7   r:   upsample  s   rg  c                       st   e Zd Zdeddf fddZ					ddejd	ejd
eej deej dededede	e
ef fddZ  ZS )FunnelDecoderrg   ry   Nc                    rC  )Nc                    s   g | ]}t  d qS )r   rD  rE  rJ  r7   r:   rF    rG  z*FunnelDecoder.__init__.<locals>.<listcomp>)
r}   r~   rg   r   rK  r   rH  r   num_decoder_layersrY   r   r   rJ  r:   r~     s   

$zFunnelDecoder.__init__FTfinal_hiddenfirst_block_hiddenr   r   r,  rM  rN  c                 C   s   t |dt| jjd  |jd | jj| jjd}|| }	|r!|	fnd }
|r'dnd }| jj|	||d}| j	D ] }||	|	|	||d}|d }	|rN||dd   }|rU|
|	f }
q5|sdt
dd	 |	|
|fD S t|	|
|d
S )Nr   r   )r   rc  r   r   r7   rO  r@  r   c                 s   rP  r   r7   rQ  r7   r7   r:   r;     r   z(FunnelDecoder.forward.<locals>.<genexpr>rR  )rg  r`   rg   rW   r_   r   r   rK  r   rY   r   r   )r   rj  rk  r   r   r,  rM  rN  upsampled_hiddenr=  rX  rY  r   r]  r`  r7   r7   r:   r     s4   


zFunnelDecoder.forwardra  rb  r7   r7   r   r:   rh    s0    

	rh  c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
FunnelDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.rg   ry   Nc                    s6   t    || _t|j|j| _t|jd| _d S r  )r}   r~   rg   r   r  r   densedense_predictionr   r   r7   r:   r~     s   
z'FunnelDiscriminatorPredictions.__init__discriminator_hidden_statesc                 C   s.   |  |}t| jj |}| |d}|S )Nr   )rn  r
   rg   r:  ro  squeeze)r   rp  rT  logitsr7   r7   r:   r     s   
z&FunnelDiscriminatorPredictions.forward)
r   r   r   r  r   r~   rc   r   r   r   r7   r7   r   r:   rm    s    rm  c                   @   s&   e Zd ZU eed< eZdZdd ZdS )FunnelPreTrainedModelrg   funnelc                 C   st  |j j}|ddkrLt|dd d ur8| jjd u r+|jj\}}t	dt
||  }n| jj}tjj|j|d t|dd d urJtj|jd d S d S |dkrtjj|j| jjd	 tjj|j| jjd	 tjj|j| jjd	 tjj|j| jjd	 tjj|j| jjd	 d S |d
kr| jjd u rdn| jj}tjj|jj|d |jjd ur|jjj|jj   d S d S d S )Nr  r   r   r   )stdr    g        rZ   )brx   )r   r   findr\   rg   initializer_stdr   r_   ri   sqrtr.  r   initnormal_	constant_r    uniform_r  initializer_ranger  r[   r  r  r!   rz   re   zero_)r   module	classnamefan_outfan_inru  r7   r7   r:   _init_weights  s0   z#FunnelPreTrainedModel._init_weightsN)	r   r   r   r   r	  rw   load_tf_weightsbase_model_prefixr  r7   r7   r7   r:   rs    s
   
 rs  c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  Z	S )
FunnelClassificationHeadrg   n_labelsry   Nc                    s>   t    t|j|j| _t|j| _t|j|| _	d S r   )
r}   r~   r   r  r   linear_hiddenr   r   r   
linear_out)r   rg   r  r   r7   r:   r~     s   
z!FunnelClassificationHead.__init__r=  c                 C   s(   |  |}t|}| |}| |S r   )r  rc   tanhr   r  )r   r=  r7   r7   r:   r     s   



z FunnelClassificationHead.forward)
r   r   r   r   rS   r~   rc   r   r   r   r7   r7   r   r:   r    s    r  z2
    Output type of [`FunnelForPreTraining`].
    )custom_introc                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )FunnelForPreTrainingOutputa1  
    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Total loss of the ELECTRA-style objective.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Prediction scores of the head (scores for each token before SoftMax).
    Nlossrr  rT  rU  )r   r   r   r  r  r   rc   FloatTensorr	  rr  rT  r   rU  r7   r7   r7   r:   r  &  s   
 r  z
    The base Funnel Transformer Model transformer outputting raw hidden-states without upsampling head (also called
    decoder) or any task-specific head on top.
    c                       s   e Zd Zdeddf fddZdejfddZdejddfd	d
Ze										dde
ej de
ej de
ej de
ej de
ej de
ej de
e de
e de
e deeef fddZ  ZS )FunnelBaseModelrg   ry   Nc                    ,   t  | t|| _t|| _|   d S r   )r}   r~   rx   r"   rB  encoder	post_initr   r   r7   r:   r~   A  s   

zFunnelBaseModel.__init__c                 C      | j jS r   r"   r!   r   r7   r7   r:   get_input_embeddingsJ     z$FunnelBaseModel.get_input_embeddingsnew_embeddingsc                 C      || j _d S r   r  r   r  r7   r7   r:   set_input_embeddingsM     z$FunnelBaseModel.set_input_embeddingsr   r   r   position_ids	head_maskr   r,  rM  rN  c
                 C   s   |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|d ur*|d ur*td|d ur9| || | }
n|d urF| d d }
ntd|d urQ|jn|j}|d u r_tj	|
|d}|d u rltj
|
tj|d}| j||d}| j||||||	d}|S )NDYou cannot specify both input_ids and inputs_embeds at the same timer   5You have to specify either input_ids or inputs_embedsr   r   r   r   r   r,  rM  rN  )rg   r,  rM  use_return_dict
ValueError%warn_if_padding_and_no_attention_maskr   r   rc   onesr  r   r"   r  )r   r   r   r   r  r  r   r,  rM  rN  input_shaper   encoder_outputsr7   r7   r:   r   P  s6   
	zFunnelBaseModel.forward	NNNNNNNNNr   r   r   r   r~   r   r   r  r  r   r   rc   r   r8  r   r   r   r   r   r7   r7   r   r:   r  :  sF    		

r  c                       s   e Zd Zdeddf fddZdejfddZdejddfd	d
Ze								dde
ej de
ej de
ej de
ej de
e de
e de
e deeef fddZ  ZS )FunnelModelrg   ry   Nc                    s<   t  | || _t|| _t|| _t|| _| 	  d S r   )
r}   r~   rg   rx   r"   rB  r  rh  decoderr  r   r   r7   r:   r~     s   


zFunnelModel.__init__c                 C   r  r   r  r  r7   r7   r:   r    r  z FunnelModel.get_input_embeddingsr  c                 C   r  r   r  r  r7   r7   r:   r    r  z FunnelModel.set_input_embeddingsr   r   r   r   r,  rM  rN  c              	   C   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|d ur*td|d ur9| || | }n|d urF| d d }ntd|d urQ|jn|j}	|d u r_tj	||	d}|d u rltj
|tj|	d}| j||d}| j||||d|d}
| j|
d	 |
d
 | j jd	  |||||d}|sd	}|d	 f}|r|d
7 }||
d
 ||  f }|r|d
7 }||
d ||  f }|S t|d	 |r|
j|j nd |r|
j|j dS d dS )Nr  r   r  r  r   r  Tr  r   r   )rj  rk  r   r   r,  rM  rN  r   rR  )rg   r,  rM  r  r  r  r   r   rc   r  r  r   r"   r  r  rW   r   rT  rU  )r   r   r   r   r   r,  rM  rN  r  r   r  decoder_outputsidxoutputsr7   r7   r:   r     sl   
	

zFunnelModel.forward)NNNNNNNr  r7   r7   r   r:   r    s:    

	r  z
    Funnel Transformer model with a binary classification head on top as used during pretraining for identifying
    generated tokens.
    c                          e Zd Zdeddf fddZe								ddeej deej deej d	eej d
eej dee	 dee	 dee	 de
eef fddZ  ZS )FunnelForPreTrainingrg   ry   Nc                    r  r   )r}   r~   r  rt  rm  discriminator_predictionsr  r   r   r7   r:   r~     s   

zFunnelForPreTraining.__init__r   r   r   r   labelsr,  rM  rN  c	              	   C   s   |dur|n| j j}| j|||||||d}	|	d }
| |
}d}|dur]t }|durO|d|
jd dk}|d|
jd | }|| }||| }n||d|
jd | }|ss|f|	dd  }|durq|f| S |S t	|||	j
|	jdS )a"  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see `input_ids`
            docstring) Indices should be in `[0, 1]`:

            - 0 indicates the token is an original token,
            - 1 indicates the token was replaced.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, FunnelForPreTraining
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("funnel-transformer/small")
        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> logits = model(**inputs).logits
        ```Nr   r   r   r,  rM  rN  r   r   r   r  rr  rT  rU  )rg   r  rt  r  r   r   r-  r_   r.  r  rT  rU  )r   r   r   r   r   r  r,  rM  rN  rp  discriminator_sequence_outputrr  r  loss_fctactive_lossactive_logitsactive_labelsr  r7   r7   r:   r     s<    	
zFunnelForPreTraining.forwardNNNNNNNN)r   r   r   r   r~   r   r   rc   r   r8  r   r   r  r   r   r7   r7   r   r:   r    s<    	

r  c                       s   e Zd ZdgZdeddf fddZdejfddZd	ej	ddfd
dZ
e								ddeej deej deej deej deej dee dee dee deeef fddZ  ZS )FunnelForMaskedLMzlm_head.weightrg   ry   Nc                    s4   t  | t|| _t|j|j| _| 	  d S r   )
r}   r~   r  rt  r   r  r   r   lm_headr  r   r   r7   r:   r~   :  s   
zFunnelForMaskedLM.__init__c                 C   s   | j S r   r  r  r7   r7   r:   get_output_embeddingsC  s   z'FunnelForMaskedLM.get_output_embeddingsr  c                 C   s
   || _ d S r   r  r  r7   r7   r:   set_output_embeddingsF  s   
z'FunnelForMaskedLM.set_output_embeddingsr   r   r   r   r  r,  rM  rN  c	              	   C   s   |dur|n| j j}| j|||||||d}	|	d }
| |
}d}|dur6t }||d| j j|d}|sL|f|	dd  }|durJ|f| S |S t|||	j|	j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr  r   r   r   r  )
rg   r  rt  r  r   r-  r   r   rT  rU  )r   r   r   r   r   r  r,  rM  rN  r  rS  prediction_logitsmasked_lm_lossr  r  r7   r7   r:   r   I  s2   

zFunnelForMaskedLM.forwardr  )r   r   r   _tied_weights_keysr   r~   r   r  r  r   r  r   r   rc   r   r8  r   r   r   r   r   r7   r7   r   r:   r  6  sB    		

r  z
    Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
    first timestep of the last hidden state) e.g. for GLUE tasks.
    c                       r  )FunnelForSequenceClassificationrg   ry   Nc                    s>   t  | |j| _|| _t|| _t||j| _|   d S r   )	r}   r~   
num_labelsrg   r  rt  r  
classifierr  r   r   r7   r:   r~     s   
z(FunnelForSequenceClassification.__init__r   r   r   r   r  r,  rM  rN  c	              	   C   st  |dur|n| j j}| j|||||||d}	|	d }
|
dddf }| |}d}|dur| j jdu rW| jdkr=d| j _n| jdkrS|jtjksN|jtj	krSd| j _nd| j _| j jdkrut
 }| jdkro|| | }n+|||}n%| j jdkrt }||d| j|d}n| j jdkrt }|||}|s|f|	dd  }|dur|f| S |S t|||	j|	jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr   r  )rg   r  rt  r  problem_typer  r   rc   r   rS   r   rq  r   r-  r   r   rT  rU  )r   r   r   r   r   r  r,  rM  rN  r  rS  pooled_outputrr  r  r  r  r7   r7   r:   r     sR   



"


z'FunnelForSequenceClassification.forwardr  )r   r   r   r   r~   r   r   rc   r   r8  r   r   r   r   r   r7   r7   r   r:   r  {  s<    
	

r  c                       r  )FunnelForMultipleChoicerg   ry   Nc                    s.   t  | t|| _t|d| _|   d S r  )r}   r~   r  rt  r  r  r  r   r   r7   r:   r~     s   
z FunnelForMultipleChoice.__init__r   r   r   r   r  r,  rM  rN  c	              	   C   sR  |dur|n| j j}|dur|jd n|jd }	|dur%|d|dnd}|dur4|d|dnd}|durC|d|dnd}|durV|d|d|dnd}| j|||||||d}
|
d }|dddf }| |}|d|	}d}|durt }|||}|s|f|
dd  }|dur|f| S |S t|||
j	|
j
dS )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r   r  r   r  )rg   r  r_   r-  r   rt  r  r   r   rT  rU  )r   r   r   r   r   r  r,  rM  rN  num_choicesr  rS  r  rr  reshaped_logitsr  r  r  r7   r7   r:   r     sF   


zFunnelForMultipleChoice.forwardr  )r   r   r   r   r~   r   r   rc   r   r8  r   r   r   r   r   r7   r7   r   r:   r    s<    	

r  c                       r  )FunnelForTokenClassificationrg   ry   Nc                    sJ   t  | |j| _t|| _t|j| _t	|j
|j| _|   d S r   )r}   r~   r  r  rt  r   r   r   r   r  r   r  r  r   r   r7   r:   r~     s   
z%FunnelForTokenClassification.__init__r   r   r   r   r  r,  rM  rN  c	              	   C   s   |dur|n| j j}| j|||||||d}	|	d }
| |
}
| |
}d}|dur:t }||d| j|d}|sP|f|	dd  }|durN|f| S |S t|||	j	|	j
dS )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r   r   r  )rg   r  rt  r   r  r   r-  r  r   rT  rU  )r   r   r   r   r   r  r,  rM  rN  r  rS  rr  r  r  r  r7   r7   r:   r   &  s4   


z$FunnelForTokenClassification.forwardr  )r   r   r   r   r~   r   r   rc   r   r8  r   r   r   r   r   r7   r7   r   r:   r    s<    	

r  c                       s   e Zd Zdeddf fddZe									ddeej deej deej d	eej d
eej deej dee	 dee	 dee	 de
eef fddZ  ZS )FunnelForQuestionAnsweringrg   ry   Nc                    s<   t  | |j| _t|| _t|j|j| _| 	  d S r   )
r}   r~   r  r  rt  r   r  r   
qa_outputsr  r   r   r7   r:   r~   Y  s
   
z#FunnelForQuestionAnswering.__init__r   r   r   r   start_positionsend_positionsr,  rM  rN  c
              	   C   sD  |	d ur|	n| j j}	| j|||||||	d}
|
d }| |}|jddd\}}|d }|d }d }|d ur|d urt| dkrM|	d}t| dkrZ|d}|d}|
d|}|
d|}t|d}|||}|||}|| d }|	s||f|
dd   }|d ur|f| S |S t||||
j|
jdS )	Nr  r   r   r   r   )ignore_indexr   )r  start_logits
end_logitsrT  rU  )rg   r  rt  r  rM   rq  
contiguousr`   r   squezeclampr   r   rT  rU  )r   r   r   r   r   r  r  r,  rM  rN  r  rS  rr  r  r  
total_lossignored_indexr  
start_lossend_lossr  r7   r7   r:   r   c  sL   







z"FunnelForQuestionAnswering.forwardr  )r   r   r   r   r~   r   r   rc   r   r8  r   r   r   r   r   r7   r7   r   r:   r  W  sB    
	

r  )
r  r  r  r  r  r  r  r  rs  rw   )TF)?r  rD   dataclassesr   typingr   r   r?   ri   rc   r   torch.nnr   r   r   activationsr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   utilsr   r   r   configuration_funnelr   
get_loggerr   rB   r/  rw   Modulerx   r   r   rS   r  rZ   rQ   r?  rB  r8  rg  rh  rm  rs  r  r  r  r  r  r  r  r  r  r  __all__r7   r7   r7   r:   <module>   s    
Z   @
1C]ODPG=I