o
     iЁ                     @   sD  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlm  mZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZmZmZmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( zd dl)m*Z* W n e+y   dZ*Y nw zd dl,m-Z- W n e+y   dZ-Y nw zd dl.m/Z/ W n e+y   dZ/Y nw e 0e1Z2d1ddZ3d2ddZ4d3ddZ5d4ddZ6G dd dej7Z8G dd dej7Z9G d d! d!ej7Z:G d"d# d#ej7Z;G d$d% d%ej7Z<G d&d' d'ej7Z=G d(d) d)e=Z>G d*d+ d+e=Z?d,efd-d.Z@d,efd/d0ZAdS )5    N)OrderedDict)Sequence)partial)AnyMapping)	rearrange)
BertConfigPretrainedConfig),BaseModelOutputWithPoolingAndCrossAttentionsBertForPreTrainingOutput)index_first_axisindex_first_axis_residual	pad_inputunpad_input)Block)BertEmbeddings)MHA)FusedMLPMlp)state_dict_from_pretrained)
FusedDense)layer_norm_fn)CrossEntropyLossFc              
   C   s   t | dd}t | dd}i }| jdkr4t | d| j|d< t | dd|d< t | dd |d< t | d	d|d	< ttf| j|| jd|||d
|}|S )Nuse_flash_attnFfused_bias_fcrotaryrotary_emb_dimrotary_emb_baseg     @rotary_emb_scale_baserotary_emb_interleaved)	num_heads
cross_attndropoutcausalr   r   return_residual)getattrposition_embedding_typehidden_sizer   r   num_attention_headsattention_probs_dropout_prob)configr!   r$   r   r   rotary_kwargs	mixer_cls r-   T/home/ubuntu/.local/lib/python3.10/site-packages/xformers/_flash_attn/models/bert.pycreate_mixer_cls9   s,   
	r/   c                 C   s   | j }t| dd}|r| jdv sJ d|s.| jdv rdnd}tt|ttj|d|d}|S td u r6td	t| d
d}t	|t
rK|d usGJ || }tt|||d}|S )N	fused_mlpFgelu_new	gelu_fastgelu_pytorch_tanhz(fused_mlp only supports approximate gelutanhnoneapproximate)hidden_features
activationr$   fused_dense is not installedmlp_checkpoint_lvlr   )r9   checkpoint_lvlr$   )intermediate_sizer%   
hidden_actr   r   Fgelur   ImportError
isinstancer   )r*   	layer_idxr$   	inner_dimr0   r8   mlp_clsr<   r-   r-   r.   create_mlp_clsP   s>   

rG   c           	      C   s|   t | dd}|o|| jd k}| }t| ||d}t| ||d}ttj| jd}t| j	|||d| j
| j
t | dd|d	}|S )Nlast_layer_subsetF   )r$   epsfused_dropout_add_ln)norm_clsprenormresid_dropout1resid_dropout2rL   r$   )r%   num_hidden_layersr/   rG   r   nn	LayerNormlayer_norm_epsr   r'   hidden_dropout_prob)	r*   rD   rH   r!   r$   r,   rF   rM   blockr-   r-   r.   create_blockt   s$   
rW   {Gz?c                 C   s   t | tjrtjj| j|d | jd urtj| j d S d S t | tjr?tjj| j|d | j	d urAtj| j| j	  d S d S d S )N)std)
rC   rR   Linearinitnormal_weightbiaszeros_	Embeddingpadding_idx)moduleinitializer_ranger-   r-   r.   _init_weights   s   

rd   c                       s,   e Zd Zdef fddZdddZ  ZS )BertEncoderr*   c                    s<   t    t dd| _t fddt jD | _d S )Nr   Fc                    s   g | ]}t  |d qS ))rD   )rW   ).0ir*   r-   r.   
<listcomp>   s    z(BertEncoder.__init__.<locals>.<listcomp>)	super__init__r%   r   rR   
ModuleListrangerQ   layersselfr*   	__class__rh   r.   rk      s
   

zBertEncoder.__init__Nc                 C   s  |du s| j s'|durd|ind}| jD ]}|||d}q|dur%|| }|S |jdd \}}t||\}}}	}
}|	|
d}|du rX| jD ]}|||d}qFt||||}|S | jdd D ]}|||d}q_|durtj|| dd }||@ jdtj	d	}t
tj|d
tj	d	d}ntj|dd }|jdtj	d	}t
tj|d
tj	d	d}t||\}}|||
|	|
d}| jd ||d}|S )zIf subset_mask is not None, we only want output for the subset of the sequence.
        This means that we only compute the last layer output for these tokens.
        subset_mask: (batch, seqlen), dtype=torch.bool
        Nkey_padding_mask)mixer_kwargs   )
cu_seqlens
max_seqlenFas_tuple)dimdtyper   )rI   r   )x_kvrv   rw   cu_seqlens_kmax_seqlen_k)r   rn   shaper   r   torchnonzeroflattensumint32r@   padcumsumr   )rp   hidden_statesrs   subset_maskrt   layerbatchseqlenindicesrv   max_seqlen_in_batch_
subset_idxsubset_seqlenssubset_cu_seqlenshidden_states_subsetr-   r-   r.   forward   s\   
(

zBertEncoder.forward)NN__name__
__module____qualname__r   rk   r   __classcell__r-   r-   rq   r.   re      s    re   c                       s&   e Zd Z fddZdddZ  ZS )
BertPoolerc                    sV   t    t|dd}|rtd u rtd|stjnt}||j|j| _t	 | _
d S )Nr   Fr;   )rj   rk   r%   r   rB   rR   rZ   r'   denseTanhr:   rp   r*   r   
linear_clsrq   r-   r.   rk      s   
zBertPooler.__init__Tc                 C   s0   |r
|d d df n|}|  |}| |}|S )Nr   )r   r:   )rp   r   poolfirst_token_tensorpooled_outputr-   r-   r.   r      s   

zBertPooler.forwardTr   r   r   rk   r   r   r-   r-   rq   r.   r      s    	r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )BertPredictionHeadTransformc                    s   t    t|dd}|rtd u rtdt|dd| _| jr'td u r'td|s,tjnt}||j	|j	| _
|jdv r=dnd}tj|d	| _tj|j	|jd
| _d S )Nr   Fr;   rL   Triton is not installedr1   r5   r6   r7   rJ   )rj   rk   r%   r   rB   rL   r   rR   rZ   r'   r   r?   GELUtransform_act_fnrS   rT   
layer_norm)rp   r*   r   r   r8   rq   r-   r.   rk      s   

z$BertPredictionHeadTransform.__init__r   returnc                 C   sH   |  |}| |}| js| |}|S t|| jj| jj| jjd}|S )NrJ   )r   r   rL   r   r   r]   r^   rK   rp   r   r-   r-   r.   r      s   


z#BertPredictionHeadTransform.forward)r   r   r   rk   r   Tensorr   r   r-   r-   rq   r.   r      s    r   c                       $   e Zd Z fddZdd Z  ZS )BertLMPredictionHeadc                    sZ   t    t|dd}|rtd u rtd|stjnt}t|| _||j	|j
dd| _d S )Nr   Fr;   T)r^   )rj   rk   r%   r   rB   rR   rZ   r   	transformr'   
vocab_sizedecoderr   rq   r-   r.   rk   
  s   

zBertLMPredictionHead.__init__c                 C   s   |  |}| |}|S N)r   r   r   r-   r-   r.   r     s   

zBertLMPredictionHead.forwardr   r-   r-   rq   r.   r   	  s    r   c                       r   )BertPreTrainingHeadsc                    s(   t    t|| _t|jd| _d S )Nru   )rj   rk   r   predictionsrR   rZ   r'   seq_relationshipro   rq   r-   r.   rk     s   

zBertPreTrainingHeads.__init__c                 C   s   |  |}| |}||fS r   )r   r   )rp   sequence_outputr   prediction_scoresseq_relationship_scorer-   r-   r.   r   #  s   

zBertPreTrainingHeads.forwardr   r-   r-   rq   r.   r     s    r   c                       s,   e Zd ZdZ fddZedd Z  ZS )BertPreTrainedModelzAn abstract class to handle weights initialization and
    a simple interface for dowloading and loading pretrained models.
    c                    s6   t    t|tstd| jj| jj|| _d S )NzParameter config in `{}(config)` should be an instance of class `BertConfig`. To create a model from a Google pretrained model use `model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`)	rj   rk   rC   r   
ValueErrorformatrr   r   r*   )rp   r*   inputskwargsrq   r-   r.   rk   .  s   


zBertPreTrainedModel.__init__c                 O   s<   | |g|R i |}|j tt||dd}t| |S )a@  
        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.

        Params:
            pretrained_model_name_or_path: either:
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `model.chkpt` a TensorFlow checkpoint
            *inputs, **kwargs: additional input for the specific Bert class
                (ex: num_labels for BertForSequenceClassification)
        F)strict)load_state_dictremap_state_dictr   loggerinfo)cls
model_namer*   r   r   modelload_returnr-   r-   r.   from_pretrained:  s   
z#BertPreTrainedModel.from_pretrained)r   r   r   __doc__rk   classmethodr   r   r-   r-   rq   r.   r   )  s
    r   c                       s6   e Zd Zddef fddZ				d	ddZ  ZS )
	BertModelTr*   c                    s   t  | t|dd| _|j| j dkr#| j| j|j| j  7  _t|dd| _| jr5td u r5td|jdv s<J t	|j
|j|j|j|jd| _t|j| _tj|j
|jd	| _t|| _|rgt|nd | _| tt|jd
 d S )Npad_vocab_size_multiplerI   r   rL   Fr   )rA   r2   r3   r4   )ra   rJ   rc   )rj   rk   r%   r   r   rL   r   rB   r?   r   r'   max_position_embeddingstype_vocab_sizepad_token_id
embeddingsrR   DropoutrU   emb_droprS   rT   emb_lnre   encoderr   poolerapplyr   rd   rc   )rp   r*   add_pooling_layerrq   r-   r.   rk   U  s,   


zBertModel.__init__Nc                 C   s:  | j |||d}| js| |}nt|| jj| jj| jjd}| |}|durI|jdd \}}t	j
||t	j|jd}	d|	dddf< ||	B }
nd}
| j|||
d}|du rd| jdura| |nd}n3|dur}|
| }||	| |  }||| |  }n||	|
  }|||
  }| jdur| j|d	d
nd}t||dS )a'  If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
        we only want the output for the masked tokens. This means that we only compute the last
        layer output for these tokens.
        masked_tokens_mask: (batch, seqlen), dtype=torch.bool
        )position_idstoken_type_idsrJ   Nru   )r|   deviceTr   )rs   r   F)r   )last_hidden_statepooler_output)r   rL   r   r   r]   r^   rK   r   r   r   zerosboolr   r   r   r
   )rp   	input_idsr   r   attention_maskmasked_tokens_maskr   
batch_sizer   first_col_maskr   r   r   r   
pool_inputr-   r-   r.   r   o  sB   

zBertModel.forwardr   )NNNNr   r-   r-   rq   r.   r   T  s    r   c                       s>   e Zd Zdef fddZdd Z					d	ddZ  ZS )
BertForPreTrainingr*   c                    s   t  | t|dd| _t|dd| _| jr| jsJ dt|dd}|r.td u r.td|s3tjnttdd}t	|| _
t|| _|d	d
| _|dd
| _| tt|jd |   d S )Ndense_seq_outputFrH   z+last_layer_subset requires dense_seq_outputuse_xentropyzxentropy_cuda is not installedT)inplace_backwardr   )ignore_indexrx   r   )rj   rk   r%   r   rH   r   rB   rR   r   r   bertr   r   mlm_lossnsp_lossr   rd   rc   tie_weights)rp   r*   r   loss_clsrq   r-   r.   rk     s$   


zBertForPreTraining.__init__c                 C   s   | j jjj| jjj_d S r   )r   r   word_embeddingsr]   r   r   r   )rp   r-   r-   r.   r     s   zBertForPreTraining.tie_weightsNc                 C   s   | j r|dur|dknd}| j||||dur| nd|d}|j|j}	}
| jrE|durEtj| dkdd }| j sEt	t
|	d|}	| |	|
\}}d}|dur|dur| jri|duri| || | }n| t
|dt
|d}| t
|d	t
|d}| |  }t|||d
S )a  
        If labels are provided, they must be 0 for masked out tokens (as specified in the attention
        mask).
        Outputs:
            if `labels` and `next_sentence_label` are not `None`:
                Outputs the total_loss which is the sum of the masked language modeling loss and the next
                sentence classification loss.
            if `labels` or `next_sentence_label` is `None`:
                Outputs a tuple comprising
                - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
                - the next sentence classification logits of shape [batch_size, 2].

        Nr   )r   r   r   r   Fry   zb s d -> (b s) dz... v -> (...) vz... -> (...)z... t -> (...) t)lossprediction_logitsseq_relationship_logits)rH   r   r   r   r   r   r   r   r   r   r   r   r   r   floatr   )rp   r   r   r   r   labelsnext_sentence_labelr   outputsr   r   masked_token_idxr   r   
total_lossmasked_lm_lossnext_sentence_lossr-   r-   r.   r     sL   
zBertForPreTraining.forward)NNNNN)r   r   r   r   rk   r   r   r   r-   r-   rq   r.   r     s    r   r*   c              	      s  dd t fdd|  D } dd t fdd|  D } dd	 t fd
d|  D } dd t fdd|  D } t|dd}t|jD ]}| d| d}| d| d}| d| d}| d| d}| d| d}| d| d}	|r||jd kstj|||gdd| d| d< tj|||	gdd| d| d< qO|| d| d< tj||gdd| d| d< || d| d< tj||	gdd| d| d< qOd d!  t  fd"d|  D } d#d$ t fd%d|  D } t|d&d}
|
dkrO| d' }t	|ddd|j
|jd  f| d'< | d( }t	|ddd|j
|jd  f| d(< | d) }tj	|d|j
|jd  fd*d+| d)< | S ),zU
    Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
    c                 S       t dd| } t dd| } | S )NzLayerNorm.gamma$zLayerNorm.weightzLayerNorm.beta$zLayerNorm.biasresubkeyr-   r-   r.   key_mapping_ln_gamma_beta     z3remap_state_dict.<locals>.key_mapping_ln_gamma_betac                 3        | ]\}} ||fV  qd S r   r-   rf   kv)r   r-   r.   	<genexpr>      z#remap_state_dict.<locals>.<genexpr>c                 S      t dd| S )Nz^bert.encoder.layer.bert.encoder.layers.r   r   r-   r-   r.   key_mapping_layers     z,remap_state_dict.<locals>.key_mapping_layersc                 3   r  r   r-   r  )r
  r-   r.   r    r  c                 S   <   t dd| } t dd| } t dd| } t dd| } | S )	Nz^bert.embeddings.LayerNorm.bert.emb_ln.zC^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)zbert.encoder.layers.\1.norm1.\2z9^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)zbert.encoder.layers.\1.norm2.\2z2^cls.predictions.transform.LayerNorm.(weight|bias)z'cls.predictions.transform.layer_norm.\1r   r   r-   r-   r.   key_mapping_ln   "   z(remap_state_dict.<locals>.key_mapping_lnc                 3   r  r   r-   r  )r  r-   r.   r  3  r  c                 S   r   )Nz;^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)z!bert.encoder.layers.\1.mlp.fc1.\2z5^bert.encoder.layers.(\d+).output.dense.(weight|bias)z!bert.encoder.layers.\1.mlp.fc2.\2r   r   r-   r-   r.   key_mapping_mlp6     z)remap_state_dict.<locals>.key_mapping_mlpc                 3   r  r   r-   r  )r  r-   r.   r  C  r  rH   Fr	  .attention.self.query.weight.attention.self.key.weight.attention.self.value.weight.attention.self.query.bias.attention.self.key.bias.attention.self.value.biasrI   r   )r{   .mixer.Wqkv.weight.mixer.Wqkv.bias.mixer.Wq.weight.mixer.Wkv.weight.mixer.Wq.bias.mixer.Wkv.biasc                 S   r  )Nz?^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)z(bert.encoder.layers.\1.mixer.out_proj.\2r   r   r-   r-   r.   key_mapping_attnY  
   z*remap_state_dict.<locals>.key_mapping_attnc                 3   r  r   r-   r  )r  r-   r.   r  `  r  c                 S   r  )Nz^cls.predictions.biascls.predictions.decoder.biasr   r   r-   r-   r.   key_mapping_decoder_biasb  r  z2remap_state_dict.<locals>.key_mapping_decoder_biasc                 3   r  r   r-   r  )r!  r-   r.   r  e  r  r   &bert.embeddings.word_embeddings.weightcls.predictions.decoder.weightr   g      Y)value)r   itemsr%   rm   rQ   popr   catr@   r   r   r   )
state_dictr*   rH   dWqWkWvbqbkbvr   r   decoder_weightdecoder_biasr-   )r  r!  r
  r  r   r  r.   r     sX   
" 

r   c                    s  t |dd}|dkr9| d }| d }| d }|d|jddf | d< |d|jddf | d< |d|j | d< t|jD ](}t |dd}|rP||jd kr| d	| d
}| d	| d}	|d|jd d ddf | d	| d< ||jd d d|jd  d ddf | d	| d< |d|jd  d dddf | d	| d< |	d|	jd d  | d	| d< |	|	jd d d|	jd  d  | d	| d< |	d|	jd  d d | d	| d< q>| d	| d}
| d	| d}| d	| d}| d	| d}|
| d	| d< |d|jd d ddf | d	| d< ||jd d dddf | d	| d< || d	| d< |d|jd d  | d	| d< ||jd d d | d	| d< q>dd dd dd dd  d!d"  d#d$ tfd%d&|  D } tfd'd&|  D } tfd(d&|  D } tfd)d&|  D } t fd*d&|  D } tfd+d&|  D } | S ),z
    Map the state_dict of a flash_attn model to be Huggingface BERT compatible.

    This function is meant to be the inverse of remap_state_dict.
    r   rI   r"  r#  r   NrH   Fr	  r  r  r      r  ru   r  r  r  r  r  r  r  r  r  c                 S   r  )	Nr  zbert.embeddings.LayerNorm.z-bert.encoder.layers.(\d+).norm1.(weight|bias)z4bert.encoder.layers.\1.attention.output.LayerNorm.\2z-bert.encoder.layers.(\d+).norm2.(weight|bias)z*bert.encoder.layers.\1.output.LayerNorm.\2z2cls.predictions.transform.layer_norm.(weight|bias)z&cls.predictions.transform.LayerNorm.\1r   r   r-   r-   r.   inv_key_mapping_ln  r  z0inv_remap_state_dict.<locals>.inv_key_mapping_lnc                 S   r   )NzLayerNorm.weight$zLayerNorm.gammazLayerNorm.bias$zLayerNorm.betar   r   r-   r-   r.   inv_key_mapping_ln_gamma_beta  r  z;inv_remap_state_dict.<locals>.inv_key_mapping_ln_gamma_betac                 S   r  )Nr	  zbert.encoder.layer.r   r   r-   r-   r.   inv_key_mapping_layers  r  z4inv_remap_state_dict.<locals>.inv_key_mapping_layersc                 S   r   )Nz.bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)z+bert.encoder.layer.\1.intermediate.dense.\2z.bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)z%bert.encoder.layer.\1.output.dense.\2r   r   r-   r-   r.   inv_key_mapping_mlp  r  z1inv_remap_state_dict.<locals>.inv_key_mapping_mlpc                 S   r  )Nz5bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)z/bert.encoder.layer.\1.attention.output.dense.\2r   r   r-   r-   r.   inv_key_mapping_attn  r  z2inv_remap_state_dict.<locals>.inv_key_mapping_attnc                 S   r  )Nr   zcls.predictions.biasr   r   r-   r-   r.   inv_key_mapping_decoder_bias  r  z:inv_remap_state_dict.<locals>.inv_key_mapping_decoder_biasc                 3   r  r   r-   rf   r   r$  )r3  r-   r.   r    r  z'inv_remap_state_dict.<locals>.<genexpr>c                 3   r  r   r-   r9  )r4  r-   r.   r        
c                 3   r  r   r-   r9  )r5  r-   r.   r    r:  c                 3   r  r   r-   r9  )r6  r-   r.   r    r  c                 3   r  r   r-   r9  )r7  r-   r.   r    r:  c                 3   r  r   r-   r9  )r8  r-   r.   r    r:  )r%   orig_vocab_sizerm   rQ   r&  r   r   r%  )r(  r*   r   r   r0  r1  r)  rH   Wqkv_weightsWqkv_biases	Wq_weightWkv_weightsWq_bias
Wkv_biasesr-   )r7  r8  r5  r3  r4  r6  r.   inv_remap_state_dict}  s   &rB  )FF)NFr   )rX   )Bloggingr   collectionsr   collections.abcr   	functoolsr   typingr   r   r   torch.nnrR   torch.nn.functional
functionalr@   einopsr   transformersr   r	   &transformers.models.bert.modeling_bertr
   r   flash_attn.bert_paddingr   r   r   r   flash_attn.modules.blockr   flash_attn.modules.embeddingr   flash_attn.modules.mhar   flash_attn.modules.mlpr   r   flash_attn.utils.pretrainedr   flash_attn.ops.fused_denser   rB    flash_attn.ops.triton.layer_normr   flash_attn.losses.cross_entropyr   	getLoggerr   r   r/   rG   rW   rd   Modulere   r   r   r   r   r   r   r   r   rB  r-   r-   r-   r.   <module>   s`   



$
?+Waq