o
    ߥi                     @   s  d Z ddlmZmZmZ ddlZddlZddlZddlZ	ddl
Z
ddlZddlmZ ddlm  mZ ddlZddlmZ dd ZG dd deZG d	d
 d
ejZG dd dejZG dd dejZd1ddZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd  d ejZ!G d!d" d"ejZ"G d#d$ d$ejZ#G d%d& d&ejZ$G d'd( d(ejZ%G d)d* d*ejZ&G d+d, d,ejZ'G d-d. d.ejZ(G d/d0 d0ejZ)dS )2zPyTorch BERT model.    )absolute_importdivisionprint_functionN)CrossEntropyLossc                 C   s    | d dt | td   S )zImplementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    g      ?      ?g       @)torcherfmathsqrt)x r   f/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/diffusion/structbert.pygelu   s    r   c                   @   sv   e Zd ZdZ											
																dddZedd Zedd Zdd Zdd Z	dS )
BertConfigzEConfiguration class to store the configuration of a `BertModel`.
          originallinearr         r   皙?      {Gz?selfFc                 C   s   || _ || _|| _|| _|| _|| _|| _|| _|	| _|| _	|
| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _dS )a_  Constructs BertConfig.

        Args:
            vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The stdev of the truncated_normal_initializer for
                initializing all weight matrices.
        N)
vocab_sizehidden_sizeemb_sizenum_hidden_layerstransformer_typetransition_functionweighted_transformernum_rolled_layersnum_attention_heads
hidden_actintermediate_sizehidden_dropout_probattention_probs_dropout_probmax_position_embeddingstype_vocab_sizeinitializer_rangeattention_typerezeropre_lnsqueeze_excitationtransfer_matrixdim_dropoutset_mask_zeroroberta_style
init_scale
safer_fp16grad_checkpoint)r   r   r   r   r   r    r!   r"   r#   r$   r&   r%   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r3   r2   r4   r5   r6   r   r   r   __init__*   s6   4
zBertConfig.__init__c                 C   s,   t dd}t|D ]	\}}||j|< q
|S )zAConstructs a `BertConfig` from a Python dictionary of parameters.N)r   )r   six	iteritems__dict__)clsjson_objectconfigkeyvaluer   r   r   	from_dictz   s   
zBertConfig.from_dictc                 C   sF   t |ddd}| }W d   n1 sw   Y  | t|S )z9Constructs a `BertConfig` from a json file of parameters.rzutf-8)encodingN)openreadr@   jsonloads)r;   	json_filereadertextr   r   r   from_json_file   s   
zBertConfig.from_json_filec                 C   s   t | j}|S )z0Serializes this instance to a Python dictionary.)copydeepcopyr:   r   outputr   r   r   to_dict   s   zBertConfig.to_dictc                 C   s   t j|  dddd S )z*Serializes this instance to a JSON string.   T)indent	sort_keys
)rE   dumpsrO   )r   r   r   r   to_json_string   s   zBertConfig.to_json_stringN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   FFFFFFFFFF)
__name__
__module____qualname____doc__r7   classmethodr@   rJ   rO   rU   r   r   r   r   r   &   sF    
P

r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )BERTLayerNorm-q=Nc                    sd   t t|   || _|dur|n|j}tt|| _	tt
|| _|js-|| _dS d| _dS )zWConstruct a layernorm module in the TF style (epsilon inside the square root).
        Ngh㈵>)superr[   r7   r=   r   nn	Parameterr   onesgammazerosbetar3   variance_epsilon)r   r=   rd   special_sizer   	__class__r   r   r7      s   zBERTLayerNorm.__init__c                 C   s   |  }| jjr| }|jddd}|| djddd}|| t|| j  }| jjr:| j	| | j
  |S | j	| | j
 S )Nr   TkeepdimrP   )typer=   r5   floatmeanpowr   r
   rd   ra   rc   )r   r   previous_typeusr   r   r   forward   s   zBERTLayerNorm.forward)r\   NrV   rW   rX   r7   rq   __classcell__r   r   rf   r   r[      s    
r[   c                       &   e Zd Z fddZdddZ  ZS )BERTEmbeddingsc                    s   t t|   	 |jdk r|jn|j}tj|j||jrdnd d| _	tj|j
||jr,dnd d| _t|j|| _|| _|jdk rCd nt|j|j| _t||d| _t|j| _d S )Nr      )padding_idx)re   )r]   ru   r7   r   r   r^   	Embeddingr   r3   word_embeddingsr)   position_embeddingsr*   token_type_embeddingsr=   Linearprojr[   	LayerNormDropoutr'   dropout)r   r=   r   rf   r   r   r7      s,   zBERTEmbeddings.__init__Nc                 C   s  | d}| jjstj|tj|jd}|d|}n|	d
 }tj|dd||  d }|d u r<t|}|d u rE| |n|}| jjrQd||dk< | |}| |}	| jjsf|| |	 }
n|| }
| |
}
| |
}
| jd ur| |
}
| |
}
d S |
|fS )Nrv   )dtypedevicer   dim        g   )sizer=   r3   r   arangelongr   	unsqueeze	expand_asneintcumsumtype_as
zeros_likery   r2   rz   r{   r~   r   r}   )r   	input_idstoken_type_idsadv_embedding
seq_lengthposition_idsmaskwords_embeddingsrz   r{   
embeddingsr   r   r   rq      sB   








zBERTEmbeddings.forward)NNrr   r   r   rf   r   ru      s    ru   c                       ,   e Zd Z fddZdd Zdd Z  ZS )BERTFactorizedAttentionc                    s   t t|   |j|j dkrtd|j|jf |j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _d S Nr   zLThe hidden size (%d) is not a multiple of the number of attention heads (%d))r]   r   r7   r   r$   
ValueErrorr   attention_head_sizeall_head_sizer^   r|   queryr>   r?   r   r(   r   r   r=   rf   r   r   r7      s    
z BERTFactorizedAttention.__init__c                 G   s0   |  d d | j| jf }|j| }||S )Nr   r   r$   r   viewpermute)r   r   r   new_x_shaper   r   r   transpose_for_scores   s
   

z,BERTFactorizedAttention.transpose_for_scoresc                 C   s   |  |}| |}| |}| |dddd}| |dddd}| |dddd}|| }	tjdd|	}
| |
}
tjdd|}t|
|}t||}|	dddd
 }| d d | jf }|j| }|S )Nr   rP   r   rv   r   r   )r   r>   r?   r   r^   Softmaxr   r   matmulr   
contiguousr   r   r   )r   hidden_statesattention_maskmixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layers_attention_scoress_attention_probsc_attention_probss_context_layercontext_layernew_context_layer_shaper   r   r   rq      s$   




zBERTFactorizedAttention.forwardrV   rW   rX   r7   r   rq   rs   r   r   rf   r   r      s    r   r   Fc                 C   s`   |r|dkr| S d| }| j |   d }t|| }|||tj||dd  |  S )Nr   rv   T)r   ri   )datanewr   zero_r   	bernoullisum)r   pr   trainingabdropout_maskr   r   r   r1     s   r1   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	BERTSelfAttentionc                    s   t t|   |j|j dkrtd|j|jf |j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _|| _|jr\t|| _d S d S r   )r]   r   r7   r   r$   r   r   r   r   r^   r|   r   r>   r?   r   r(   r   r=   r.   r[   r~   r   rf   r   r   r7   $  s(   
zBERTSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r   rP   rv   r   r   )r   r   r   r   r   r   r   9  s
   
z&BERTSelfAttention.transpose_for_scoresNc                 C   sB  | j jr	| |}| |}| |}| |}| |}| |}| |}	t||	dd}
|
t
| j }
|d ur]| js]t|D ]\}}|| dkr\d|
d d |d d d d f< qD|
| }
tjdd|
}| j jss| |}nt|| j jd| jd}t||	}|dddd	 }| d d | jf }|j| }|S )
Nr   r   rv   r   r   )r   r   r   r   rP   r   )r=   r.   r~   r   r>   r?   r   r   r   	transposer	   r
   r   r   	enumerater^   r   r1   r   r(   r   r   r   r   r   )r   r   r   	head_maskr   r   r   r   r   r   attention_scoresir   attention_probsr   r   r   r   r   rq   ?  sH   








zBERTSelfAttention.forwardNr   r   r   rf   r   r   "  s    r   c                       $   e Zd Z fddZdd Z  ZS )BERTSelfOutputc                    s   t t|   || _t|j|j| _|js|j	st
|| _t|j| _|j	rQttddjt|  jd| _ttdjt|  jd| _d S d S )Nrv   Gz?r   )r]   r   r7   r=   r^   r|   r   denser.   r-   r[   r~   r   r'   r   r_   r   Tensorfill_tonext
parametersr   
res_factorr`   factorr   rf   r   r   r7   n  s    

zBERTSelfOutput.__init__c                 C   sV   |  |}| |}| jjs| jjs| || }|S | jjr(|| j|  }|S 	 |S r   )r   r   r=   r-   r.   r~   r   r   r   input_tensorr   r   r   rq   |  s   

zBERTSelfOutput.forwardrr   r   r   rf   r   r   l  s    r   c                       rt   )BERTAttentionc                    s`   t t|   |j dkrt|| _n|j dkr!t|| _ntd	|jt
|| _d S )Nr   
factorizedz5Attention type must in [self, factorized], but got {})r]   r   r7   r,   lowerr   r   r   r   formatr   rN   r   rf   r   r   r7     s   zBERTAttention.__init__Nc                 C   s   |  |||}| ||}|S r   rM   )r   r   r   r   self_outputattention_outputr   r   r   rq     s   zBERTAttention.forwardr   rr   r   r   rf   r   r     s    r   c                       s0   e Zd Z					d fdd	Zdd Z  ZS )	DepthwiseSeparableConv1drv   r   Fc              
      sV   t t|   |d d }tj||||||||d| _tj||ddddd|d| _d S )Nrv   rP   )groupsbiasr   )r   )r]   r   r7   r^   Conv1d	depthwise	pointwise)r   in_channelsout_channelskernel_sizestridepaddingdilationr   rf   r   r   r7     s   	z!DepthwiseSeparableConv1d.__init__c                 C   s   |  |}| |}|S r   )r   r   )r   r   r   r   r   rq     s   

z DepthwiseSeparableConv1d.forward)rv   rv   r   rv   Frr   r   r   rf   r   r     s    r   c                       r   )BERTIntermediatec                    s   t t|   || _| jjrt|| _t| _|j	
 dkr(t|j|j| _d S |j	
 dkr=t|jd|j dd| _d S |jj
 dkrItdtd)	Nr   cnn      r   rnn.rnn transition function is not implemented yetOnly support linear/cnn/rnn)r]   r   r7   r=   r.   r[   r~   r   intermediate_act_fnr!   r   r^   r|   r   r&   r   r   r   NotImplementedErrorr   r   rf   r   r   r7     s$   

zBERTIntermediate.__init__c                 C   sj   | j jr	| |}| j j dkr| |}n| j j dkr-| |dddd}n	 | |}|S Nr   r   r   r   )	r=   r.   r~   r!   r   r   r   r   r   )r   r   r   r   r   rq     s   


zBERTIntermediate.forwardrr   r   r   rf   r   r     s    r   c                       r   )SqueezeExcitationBlockc                    s>   t t|   t|j|jd | _t|jd |j| _d S )Nr   )r]   r   r7   r^   r|   r   down_samplingup_samplingr   rf   r   r   r7     s   
zSqueezeExcitationBlock.__init__c                 C   s2   t j|ddd}t | t| |}|| S )Nrv   Trh   )r   rl   sigmoidr   r   r   )r   r   squeeze
excitationr   r   r   rq     s
   zSqueezeExcitationBlock.forwardrr   r   r   rf   r   r     s    r   c                       r   )
BERTOutputc                    s  t t|   || _|j dkrt|j|j	| _
n$|j dkr/td|j	 |j	dd| _n|jj	 dkr;tdtd|jsJ|jsJt|| _t|j| _|jrYt|| _|jrttd	d
jt|  j d| _!tt"d	jt|  j d| _#d S d S )Nr   r   r   r   r   r   r   r   rv   r   r   )$r]   r   r7   r=   r!   r   r^   r|   r&   r   r   r   r   r   r   r.   r-   r[   r~   r   r'   r   r/   r   SEblockr_   r   r   r   r   r   r   r   r   r`   r   r   rf   r   r   r7     s<   



zBERTOutput.__init__c                 C   s   | j j dkr| |}n| j j dkr$| |dddd}n	 | |}| j jr3| |}| j j	sD| j j
sD| || }|S | j j	rQ|| j|  }|S 	 |S r   )r=   r!   r   r   r   r   r   r/   r   r-   r.   r~   r   r   r   r   r   rq   	  s,   


zBERTOutput.forwardrr   r   r   rf   r   r     s    r   c                       rt   )	BERTLayerc                    s0   t t|   t|| _t|| _t|| _d S r   )	r]   r   r7   r   	attentionr   intermediater   rN   r   rf   r   r   r7      s   

zBERTLayer.__init__Nc                 C   s,   |  |||}| |}| ||}||fS r   )r   r   rN   )r   r   r   r   r   intermediate_outputlayer_outputr   r   r   rq   &  s   
zBERTLayer.forwardr   rr   r   r   rf   r   r     s    r   c                       r   )BERTWeightedLayerc                    s   t t   _t _jj_t fddt	 j
D _t j
_tjj  _t j
_tjj  _t _t j j_t _t j_d S )Nc                    s   g | ]
}t j jqS r   )r^   r|   r   r   .0_r=   r   r   r   
<listcomp>6  s    z.BERTWeightedLayer.__init__.<locals>.<listcomp>)r]   r  r7   r=   r   r   r   r^   
ModuleListranger$   w_or   randw_kpr_   r   w_ar   r   r|   r&   r   rN   r[   r~   r   r'   r   r   rf   r  r   r7   0  s   




zBERTWeightedLayer.__init__c                    s     ||}|j j jdd fddttD  fddttD dd t jD  fddttD  fddttD  fd	dttD d
d t jD t} 	|| S )Nr   r   c                    s   g | ]} j | | qS r   )r  r  r   r   self_outputsr   r   r  G  s    z-BERTWeightedLayer.forward.<locals>.<listcomp>c                       g | ]	}  | qS r   r   r  r  r   r   r  J      c                 S      g | ]\}}|| qS r   r   )r  kapparN   r   r   r   r  M      c                    r  r   )r   r  r  r   r   r  P  s    c                    r  r   )rN   r  r  r   r   r  T  r  c                    r  r   r  r  r  r   r   r  W  r  c                 S   r  r   r   )r  alpharN   r   r   r   r  Z  r  )
r   splitr   r
  lenzipr  r  r   r~   )r   r   r   r   rN   r   r  r   rq   D  s2   






zBERTWeightedLayer.forwardrr   r   r   rf   r   r  .  s    r  c                       *   e Zd Z fddZ		dddZ  ZS )BERTEncoderc                    s   t t|   t | _t|jD ]}|jr| j	t
| q| j	t| q|jrot| jD ]>\}}ttddjt|  jd|j_ttddjt|  jd|j_|jj|jj_|jj|jj_q0|| _d S )Nrv   r   r   )r]   r  r7   r^   r	  layerr
  r   r"   appendr  r   r-   r   r_   r   r   r   r   r   r   r   rN   r   r   r   r=   )r   r=   r  indexr  rf   r   r   r7   c  s*   

zBERTEncoder.__init__r   Nc           
      C   s   |g}|dkrt t| jd d|  d }nd}t| jD ]>\}}|d u r@| jjs3|||d \}	}ntjj|||d \}	}n
||||| \}	}||krR|	  |
|	 |
| q|S )Nr   r   rP   rv   )r   r  r  r   r=   r6   r   utils
checkpointdetach_r  )
r   r   r   epoch_id
head_masksall_encoder_layersdetach_indexr   layer_moduleself_outr   r   r   rq   w  s,    


zBERTEncoder.forwardr   Nrr   r   r   rf   r   r  a  s
    r  c                       r  )BERTEncoderRolledc                    s@   t t|   t| || _t fddt|jD | _	d S )Nc                       g | ]}t  qS r   rK   rL   r  r  r   r   r        z.BERTEncoderRolled.__init__.<locals>.<listcomp>)
r]   r+  r7   r   r=   r^   r	  r
  r#   r  r   rf   r.  r   r7     s   
zBERTEncoderRolled.__init__r   Nc                 C   s   |g}t | jjD ]6}| jj dkr | j|| jj  ||}n| jj dkr:|| jj| jj  }| j| ||}|| q	|S )N	universalalbert)r
  r=   r   r    r   r  r#   r  )r   r   r   r$  r%  r&  r   r   r   r   r   rq     s   zBERTEncoderRolled.forwardr*  rr   r   r   rf   r   r+    s
    
r+  c                       r   )BERTEncoderACTc                    sx   t t|   t|| _t|jd t fddt	|j
D | _| jD ]	}|jjd q&|| _|j
| _d| _d S )Nrv   c                    r,  r   r-  r  r   r   r   r    r/  z+BERTEncoderACT.__init__.<locals>.<listcomp>r   r   )r]   r2  r7   r   r  r^   r|   r   r	  r
  r   r   r   r   r   r=   act_max_steps	threshold)r   r=   modulerf   r3  r   r7     s   


zBERTEncoderACT.__init__c                 C   s   | | j| | j S r   )ltr5  __and__r4  any)r   halting_probability	n_updatesr   r   r   should_continue  s
   
zBERTEncoderACT.should_continuec                 C   sJ  |g}|  \}}}t|| }t|| }t|| }	t| jD ]t}
t| j|
 |d}|	d
 }|||  | j
 | }|||  | j
 | }|||  }||d|   }|||  }|	| | }	|| ||  d}| ||}|| |d|   }|| | ||	s nq'|t|	| fS )NrP   r   rv   )r   r   rb   cudar
  r4  r   r   r   r7  rk   gtr5  ler   r  r  r<  rl   )r   r   r   r&  
batch_sizeseq_lenhdimr:  
remaindersr;  r   r   still_running
new_haltedupdate_weightstransformed_statesr   r   r   rq     sH   
zBERTEncoderACT.forward)rV   rW   rX   r7   r<  rq   rs   r   r   rf   r   r2    s    r2  c                       r   )
BERTPoolerc                    s.   t t|   t|j|j| _t | _d S r   )	r]   rH  r7   r^   r|   r   r   Tanh
activationr   rf   r   r   r7     s   zBERTPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   rJ  )r   r   first_token_tensorpooled_outputr   r   r   rq     s   

zBERTPooler.forwardrr   r   r   rf   r   rH    s    rH  c                       s:   e Zd ZdZdef fddZ					d	ddZ  ZS )
	BertModela  BERT model ("Bidirectional Embedding Representations from a Transformer").

    Example:
        >>> # Already been converted into WordPiece token ids
        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

        >>> config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
        >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

        >>> model = modeling.BertModel(config=config)
        >>> all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    r=   c                    s   t t|   || _t|| _|j dkrt|| _	nK|j dkr)t
|| _	n>|j dkr6t
|| _	n1|j dkrCt|| _	n$|j dkr]ddlm}m}m} t||||| _	n
td|j t|| _d	S )
z]Constructor for BertModel.

        Args:
            config: `BertConfig` instance.
        r   r0  r1  acttextnasr   )
input_dictop_dict	skip_dictz Not support transformer type: {}N)r]   rM  r7   r=   ru   r   r    r   r  encoderr+  r2  textnas_finalrP  rQ  rR  TextNASEncoderr   r   rH  pooler)r   r=   rP  rQ  rR  rf   r   r   r7     s(   
zBertModel.__init__Nr   c                 C   s  |d u r	t |}|d u rt |}|dd}|jt|  jd}d| d }| |||\}}	| j	j
 dkrE| ||\}
}n| j	j
 dkrW| |}||g}
n| ||||}
|
d|	 |
d	 }| j	jsv| |}|
|fS |d d df }|
|fS )
Nrv   rP   r   r   g     rN  reformerr   r   )r   	ones_liker   r   r   r   r   r   r   r=   r    r   rS  insertr5   rV  )r   r   r   r   r$  r%  r   extended_attention_maskembedding_outputry   r&  act_losssequence_outputrL  r   r   r   rq     s<   





zBertModel.forward)NNr   NN)rV   rW   rX   rY   r   r7   rq   rs   r   r   rf   r   rM    s    rM  c                       s8   e Zd ZdZ fddZ							d	ddZ  ZS )
&BertForSequenceClassificationMultiTaska  BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.

    Example:
        >>> # Already been converted into WordPiece token ids
        >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
        >>> input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
        >>> token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]])

        >>> config = BertConfig(vocab_size=32000, hidden_size=512,
        >>>     num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

        >>> num_labels = 2

        >>> model = BertForSequenceClassification(config, num_labels)
        >>> logits = model(input_ids, token_type_ids, input_mask)
    c                    s   t t|   | dkrt | _n| dkrt | _ntd|t	
 j| _t	 | _|D ]}| jt	 jt| q4|| _ fdd}| | d S )Nbertlstmz%Only support lstm or bert, but got {}c                    sz   t | tjtjfr| jjjd jd nt | tr-| j	jjd jd | j
jjd jd t | tjr;| jj  d S d S )Nr   )rl   std)
isinstancer^   r|   rx   weightr   normal_r+   r[   rc   ra   r   r   )r6  r=   r   r   init_weightsm  s   
zEBertForSequenceClassificationMultiTask.__init__.<locals>.init_weights)r]   r^  r7   r   rM  r_  	LSTMModelr   r   r^   r   r'   r   r	  
classifierr  r|   r   r  
label_listapply)r   r=   ri  core_encoderlabelrf  rf   re  r   r7   ^  s   
z/BertForSequenceClassificationMultiTask.__init__Nr   Fc                    s  |  ||||||\} |    fdd| jD }|d urtdd}tjdd}t|d}g }tt	||D ]B\}\}}t
| j| dkrP||| }n||d|}||kjt|  jd}|
d uro||
|  }t|| }|| q9|	st||fS t|||d fS |S )	Nc                    s   g | ]}| qS r   r   )r  rh  rL  r   r   r    s    zBBertForSequenceClassificationMultiTask.forward.<locals>.<listcomp>none)	reductionrv   r   r   r   )r_  r   rh  r   r^   MSELossr   unbindr   r  r  ri  r   r   r   r   r   r   rl   r  r   )r   r   r   r   labelslabels_indexr$  r%  r   return_embeddingloss_weightr&  logitsloss_fctregression_loss_fct
labels_lstloss_lstr   rl  logitlosslabels_maskr   rm  r   rq   }  s8   

z.BertForSequenceClassificationMultiTask.forward)NNr   NNFN)rV   rW   rX   rY   r7   rq   rs   r   r   rf   r   r^  J  s    #r^  )r   r   F)*rY   
__future__r   r   r   rK   r	   rE   numpynpr8   r   torch.nnr^   torch.nn.functional
functionalFtorch.utils.checkpointr   r   objectr   Moduler[   ru   r   r1   r   r   r   r   r   r   r   r   r  r  r+  r2  rH  rM  r^  r   r   r   r   <module>   sB   m:
2
J#133/[