o
    ei                  
   @   s,  d Z ddlZddlZddlmZ ddlmZmZmZmZ ddlm	Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z! e"e#Z$dej%de&dej'dej%fddZ(dej%dej%de)de*dej%f
ddZ+dej%dej%fddZ,dej%dej%dej%fdd Z-G d!d" d"ej.j/Z0G d#d$ d$ej1Z2G d%d& d&ej1Z3G d'd( d(ej1Z4G d)d* d*eZ5eG d+d, d,eZ6eG d-d. d.e6Z7ed/d0G d1d2 d2e6eZ8ed3d0G d4d5 d5e6Z9eG d6d7 d7e6Z:eG d8d9 d9e6Z;g d:Z<dS );zPyTorch BLOOM model.    N)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCacheStaticCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )BloomConfigattention_mask	num_headsdtypereturnc                 C   s6  | j \}}dtt| }tjddt|d     | jtjd}tjdd| | jtj	d}t
||}||krvtjddtd| d     | jtjd}	t||| }
tjddd|
  d| jtj	d}tj|t
|	|gdd}| jddd |  dddddf }|d	 | }||| d||S )
a  
    Link to paper: https://huggingface.co/papers/2108.12409 Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

    Args:
    Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
        attention_mask (`torch.Tensor`):
            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
        num_heads (`int`):
            number of heads
        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
            dtype of the output tensor
       r   devicer   r   r   dimN).N)shapemathfloorlog2torchtensorr   float32arangeint32powmincatcumsumreshapeto)r   r   r   
batch_size
seq_lengthclosest_power_of_2basepowersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibi r=   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/bloom/modeling_bloom.pybuild_alibi_tensor-   s"   
 $ &r?   xresidualprobtrainingc                 C   s   t j| ||d}|| }|S )a
  
    Dropout add function

    Args:
        x (`torch.tensor`):
            input tensor
        residual (`torch.tensor`):
            residual tensor
        prob (`float`):
            dropout probability
        training (`bool`):
            training mode
    )prC   )Fdropout)r@   rA   rB   rC   outr=   r=   r>   dropout_addY   s   rH   c                 C   s*   | d dt d|  dd|  |      S )z
    Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
    make the model jitable.

    Args:
        x (`torch.tensor`):
            input hidden states
          ?      ? e3E?r   Hm?r'   tanh)r@   r=   r=   r>   bloom_gelu_forwardl   s   *	rO   gc                 C   s^   |d }t d| dd| |   }d| d||  dd| |    dd|   }||  S )a   
    gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) +
    0.3989423 * x * torch.exp(-0.5 * x * x)

    Args:
        g (`torch.tensor`):
            gradient output tensor
        x (`torch.tensor`):
            input tensor
    r   rK   r   rL   rI   g6vf?rM   )rP   r@   tanh_outffr=   r=   r>   bloom_gelu_backx   s   0rS   c                   @   s@   e Zd ZedejdejfddZedejdejfddZdS )	GeLUFunctioninputr   c                 C   s   |  | t|S N)save_for_backwardrO   )ctxrU   r=   r=   r>   forward   s   
zGeLUFunction.forwardgrad_outputc                 C   s   | j }t||}|S rV   )saved_tensorsrS   )rX   rZ   rU   tmpr=   r=   r>   backward   s   
zGeLUFunction.backwardN)__name__
__module____qualname__staticmethodr'   TensorrY   r]   r=   r=   r=   r>   rT      s
    rT   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )	BloomGeluzN
    Partly copied from Megatron-DeepSpeed code and adapted for our needs
    c                    s   t    d S rV   )super__init__self	__class__r=   r>   re      s   zBloomGelu.__init__r@   r   c                 C   s
   t |S rV   )rT   apply)rg   r@   r=   r=   r>   rY         
zBloomGelu.forward)	r^   r_   r`   __doc__re   r'   rb   rY   __classcell__r=   r=   rh   r>   rc      s    rc   c                       s   e Zd ZddededB f fddZdejdeejejejf fdd	Z	d
ejdejfddZ
				ddejdejdejdejdedB dededejdB fddZ  ZS )BloomAttentionNconfig	layer_idxc                    s   t    |j| _|j| _|j| _|j| _| j| j | _| j| _|j	| _	| j| j | jkr:t
d| j d| j ddt| j | _d| _|| _|d u rXtd| jj d tj| jd| j dd	| _t| j| j| _t|j| _d S )
NzA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).rJ   zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r   Tbias)rd   re   pretraining_tpslow_but_exacthidden_sizen_headr   head_dim
split_sizehidden_dropout
ValueErrorr$   sqrtinv_norm_factorbetarp   loggerwarning_onceri   r^   r   Linearquery_key_valuedenseDropoutattention_dropout)rg   ro   rp   rh   r=   r>   re      s0   

zBloomAttention.__init__	fused_qkvr   c                 C   sz   |j \}}}|||| jd| j}|ddddf dd}|ddddf dd}|ddddf dd}|||fS )a  
        Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape
        without making any copies, results share same memory storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, num_heads, seq_length, head_dim]
            key: [batch_size, num_heads, seq_length, head_dim]
            value: [batch_size, num_heads, seq_length, head_dim]
        r   .r   Nr   r   )r#   viewr   rw   	transpose)rg   r   r2   r3   three_times_hidden_sizequery_layer	key_layervalue_layerr=   r=   r>   _reshape   s   
zBloomAttention._reshaper@   c                 C   sP   |j \}}}|| j }||| j|| j}|dddd}|||| j| j S )z
        Merge heads together over the last dimension

        Args:
            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
        r   r   r   r   )r#   r   r   rw   permuter0   )rg   r@   batch_size_and_num_headsr3   _r2   r=   r=   r>   _merge_heads   s
   
zBloomAttention._merge_headsFhidden_statesrA   r<   r   
layer_past	use_cacheoutput_attentionscache_positionc	                 C   s  |j \}	}
}| |}| |\}}}|d ur&d|i}|||| j|\}}||	| j d| j}||	| j d| jdd}||	| j d| j}|j	||| j
| jd}||	| j|
d}|d urg|| }tj|dtjd|j}| |}||	| j |
d}t||}| |}| jdkr| jr| j| j }t|}t| jD ]3}|t|d d d d t|| t|d | f | jjd d t|| t|d | f  }qn| |}t||| j | j!}||fS )Nr   r"   )batch1batch2r}   alpha)r!   r   r   )"r#   r   r   updaterp   r0   r   rw   r   baddbmmr}   r|   r   rE   softmaxr'   r)   r1   r   r   bmmr   rs   rt   ru   
zeros_likerangelinearintr   weightrH   ry   rC   )rg   r   rA   r<   r   r   r   r   r   r2   q_lengthr   r   r   r   r   cache_kwargsattention_scoresattn_weightsattention_probsattention_probs_reshapedcontext_layerslicesoutput_tensorir=   r=   r>   rY      sF   



,*
zBloomAttention.forwardrV   NFFN)r^   r_   r`   r   r   re   r'   rb   tupler   r   r	   bool
LongTensorrY   rm   r=   r=   rh   r>   rn      s2    $!	rn   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	BloomMLPro   c                    sX   t    |j}|j| _|j| _t|d| | _t | _	td| || _
|j| _d S )N   )rd   re   ru   rs   rt   r   r   dense_h_to_4hrc   	gelu_impldense_4h_to_hry   )rg   ro   ru   rh   r=   r>   re   8  s   
zBloomMLP.__init__r   rA   r   c                 C   s   |  | |}| jdkrY| jrYt|}| jjjd | j }t	| jD ]3}|t
|d d d d t|| t|d | f | jjd d t|| t|d | f  }q$n| |}t||| j| j}|S )Nr   r"   )r   r   rs   rt   r'   r   r   r   r#   r   rE   r   r   rH   ry   rC   )rg   r   rA   intermediate_outputr   r   outputr=   r=   r>   rY   C  s   
,*
zBloomMLP.forward)	r^   r_   r`   r   re   r'   rb   rY   rm   r=   r=   rh   r>   r   7  s    $r   c                       sl   e Zd ZddededB f fddZ				ddejdejd	ejd
edB de	de	dej
dB fddZ  ZS )
BloomBlockNro   rp   c                    sb   t    |j}t||jd| _|j| _t||| _	t||jd| _
t|| _|j| _|j| _d S )Neps)rd   re   ru   r   layer_norm_epsiloninput_layernormrv   r   rn   self_attentionpost_attention_layernormr   mlp(apply_residual_connection_post_layernormry   )rg   ro   rp   ru   rh   r=   r>   re   W  s   

zBloomBlock.__init__Fr   r<   r   r   r   r   r   c              
   C   sf   |  |}| jr|}	n|}	| j||	||||||d\}
}| |
}| jr'|}	n|
}	| ||	}||fS )N)r   r   r<   r   r   r   )r   r   r   r   r   )rg   r   r<   r   r   r   r   r   layernorm_outputrA   attention_outputr   r   r=   r=   r>   rY   e  s(   


zBloomBlock.forwardrV   r   )r^   r_   r`   r   r   re   r'   rb   r	   r   r   rY   rm   r=   r=   rh   r>   r   V  s*    r   c                   @   s,   e Zd ZU eed< dZdZdgZdZdZ	dS )BloomPreTrainedModelro   transformerTr   past_key_valuesN)
r^   r_   r`   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraphr=   r=   r=   r>   r     s   
 r   c                       s   e Zd Zdef fddZdejdedejdejfdd	Z	d
d Z
dejfddZe									ddejdB dedB dejdB dejdB dedB dedB dedB dedB dejdB deejdf eB fddZ  ZS )
BloomModelro   c                    s   t     j| _ j| _t j| j| _	t
| j jd| _t fddt jD | _t
| j jd| _d| _|   d S )Nr   c                    s   g | ]}t  |d qS ))rp   )r   ).0r   ro   r=   r>   
<listcomp>  s    z'BloomModel.__init__.<locals>.<listcomp>F)rd   re   ru   	embed_dimrv   r   r   	Embedding
vocab_sizeword_embeddingsr   r   word_embeddings_layernorm
ModuleListr   num_hidden_layershln_fgradient_checkpointing	post_initrg   ro   rh   r   r>   re     s    zBloomModel.__init__r   r   r   r   c                 C   s   t |||S rV   )r?   )rg   r   r   r   r=   r=   r>   r?     s   zBloomModel.build_alibi_tensorc                 C   s   | j S rV   r   rf   r=   r=   r>   get_input_embeddings  s   zBloomModel.get_input_embeddingsnew_embeddingsc                 C   
   || _ d S rV   r   rg   r   r=   r=   r>   set_input_embeddings  rk   zBloomModel.set_input_embeddingsN	input_idsr   inputs_embedsr   r   output_hidden_statesreturn_dictr   .c
              
   K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|du |duA r4td| jrC| jrC|rCt	d d}|du rL| 
|}|rX|du rXt| j d}|j\}}}|durf| nd}|| }|	du r{tj||| |jd}	| |}|rdnd}|rdnd}|du rtj||f|jd}n||j}| j|| j|jd	}t| j |||	|d
}t| jD ]#\}}|r||f }||||||||	d}|d }|r||d f }q| |}|r||f }|stdd ||||fD S t||||dS )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   r=   )r   )ro   r   r   r   r   )r   r   r   r   r<   r   r   c                 s   s    | ]	}|d ur|V  qd S rV   r=   )r   vr=   r=   r>   	<genexpr>"  s    z%BloomModel.forward.<locals>.<genexpr>)last_hidden_stater   r   
attentions)ro   r   r   r   use_return_dictrz   r   rC   r~   r   r   r
   r#   get_seq_lengthr'   r*   r   r   onesr1   r?   r   r   r   	enumerater   r   r   r   )rg   r   r   r   r   r   r   r   r   r   kwargsr2   r3   r   past_lengthseq_length_with_pastr   all_self_attentionsall_hidden_statesr<   causal_maskr   blockoutputsr=   r=   r>   rY     s   






zBloomModel.forward	NNNNNNNNN)r^   r_   r`   r   re   r'   rb   r   r   r?   r   r   r   r   r	   r   r   r   rY   rm   r=   r=   rh   r>   r     sH     	
r   z
    The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       s   e Zd ZddiZdef fddZdejfddZ									
	d fdd	Z	e
																					ddejd	B ded	B dejd	B dejd	B dejd	B ded	B ded	B ded	B ded	B dejd	B deejB deej eB fddZ  ZS )BloomForCausalLMzlm_head.weightz"transformer.word_embeddings.weightro   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S NFrq   )
rd   re   r   r   r   r   ru   r   lm_headr   r   rh   r=   r>   re   7  s   
zBloomForCausalLM.__init__r   c                 C   r   rV   )r   r   r=   r=   r>   set_output_embeddings?  rk   z&BloomForCausalLM.set_output_embeddingsNTFc              	      s   t  j|f||||||d|}	t|tr?|d ur?| }
|j\}}|
| }tj|||j|j	d}tj
||gdd}||	d< |	S )N)r   r   r   r   r   is_first_iterationr   r"   r    r   )rd   prepare_inputs_for_generation
isinstancer   get_max_cache_shaper#   r'   zerosr   r   r.   )rg   r   r   r   r   r   r   r   r   model_inputstarget_lengthr2   r3   diffnew_attn_maskrh   r=   r>   r   B  s(   
z.BloomForCausalLM.prepare_inputs_for_generationr   r   r   r   r   labelsr   r   r   r   r   logits_to_keepr   c                 K   s   |	dur|	n| j j}	| j||||||||	|
d	}|d }t|tr't| dn|}| |dd|ddf }d}|durK| j||| j j|	dd}|	sa|f|dd  }|dur_|f| S |S t
|||j|j|jdS )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)r   r   r   r   r   r   r   r   r   num_items_in_batch)r   r	  r   losslogitsr   r   r   )ro   r   r   r   r   slicer   loss_functionr   getr   r   r   r   )rg   r   r   r   r   r  r   r   r   r   r   r  r   transformer_outputsr   slice_indicesr  r  r   r=   r=   r>   rY   g  sB   !zBloomForCausalLM.forward)NNNNTF)NNNNNNNNNNr   )r^   r_   r`   _tied_weights_keysr   re   r'   rb   r   r   r   r   r	   r   r   r   r   rY   rm   r=   r=   rh   r>   r   .  s`    %	
r   a  
    The Bloom Model transformer with a sequence classification head on top (linear layer).

    [`BloomForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                          e Zd Zdef fddZe									ddejdB dedB dej	dB dej	dB d	ej	dB d
e
dB de
dB de
dB de
dB deej	 eB fddZ  ZS )BloomForSequenceClassificationro   c                    s@   t  | |j| _t|| _tj|j|jdd| _| 	  d S r   )
rd   re   
num_labelsr   r   r   r   ru   scorer   r   rh   r=   r>   re     s
   
z'BloomForSequenceClassification.__init__Nr   r   r   r   r  r   r   r   r   r   c
              
   K   s$  |	dur|	n| j j}	| j||||||||	d}|d }| |}|dur*|jd }n|jd }| j jdu r=|dkr=td| j jdu rFd}n1|durk|| j jk|jt	j
}t	j|jd |jt	j
d}|| d}nd}t| jj d |t	j||jd	|f }d}|dur| j jdu r| jdkrd
| j _n| jdkr|jt	jks|jt	jkrd| j _nd| j _| j jd
krt }| jdkr|| | }n#|||}n| j jdkrt }|||}n| j jdkrt }|||}|	s|f|dd  }|dur|f| S |S t|||j|j|jdS )6  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   r   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r"   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr
  )ro   r   r   r  r#   pad_token_idrz   r1   r   r'   r+   r*   argmaxr~   r   ri   r^   problem_typer  r   longr   r   squeezer   r   r   r   r   r   )rg   r   r   r   r   r  r   r   r   r   r   r  r   r  r2   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr   r=   r=   r>   rY     st   



"


z&BloomForSequenceClassification.forwardr   )r^   r_   r`   r   re   r   r'   r   r	   rb   r   r   r   rY   rm   r=   r=   rh   r>   r    sB    		
r  c                       r  )BloomForTokenClassificationro   c                    s   t  | |j| _t|| _t|dr|jd ur|j}nt|dr+|jd ur+|j}nd}t	|| _
t|j|j| _|   d S )Nclassifier_dropoutry   g?)rd   re   r  r   r   hasattrr'  ry   r   r   rF   r   ru   
classifierr   )rg   ro   r'  rh   r=   r>   re   4  s   
z$BloomForTokenClassification.__init__Nr   r   r   r   r  r   r   r   r   r   c
              
   K   s   |	dur|	n| j j}	| j||||||||	d}|d }| |}| |}d}|durJ||j}|j\}}t }||	|| | j
|	|| }|	s`|f|dd  }|dur^|f| S |S t|||j|jdS )r  Nr  r   r   )r  r  r   r   )ro   r   r   rF   r)  r1   r   r#   r   r   r  r   r   r   )rg   r   r   r   r   r  r   r   r   r   r   r  r   r  r  r2   r3   r%  r   r=   r=   r>   rY   E  s>   


z#BloomForTokenClassification.forwardr   )r^   r_   r`   r   re   r   r'   r   r	   rb   r   r   r   rY   rm   r=   r=   rh   r>   r&  2  sB    	
r&  c                       s   e Zd Z fddZe								ddejdB dejdB dejdB dejdB dejdB d	edB d
edB dedB de	e
B fddZ  ZS )BloomForQuestionAnsweringc                    s2   t  | t|| _t|jd| _|   d S )Nr   )	rd   re   r   r   r   r   ru   
qa_outputsr   r   rh   r=   r>   re     s   
z"BloomForQuestionAnswering.__init__Nr   r   r   start_positionsend_positionsr   r   r   r   c	                 K   sB  |dur|n| j j}| j||||||d}
|
d }| |}|jddd\}}|d }|d }d}|dur|durt| dkrL|d}t| dkrY|d}|d}|	d|}|	d|}t
|d}|||}|||}|| d }|s||f|
dd  }|dur|f| S |S t||||
j|
jd	S )
r   N)r   r   r   r   r   r   r   r"   r    )ignore_indexr   )r  start_logits
end_logitsr   r   )ro   r   r   r+  splitr   
contiguouslensizeclampr   r   r   r   )rg   r   r   r   r,  r-  r   r   r   r   r   sequence_outputr  r/  r0  
total_lossignored_indexr%  
start_lossend_lossr   r=   r=   r>   rY     sJ   	






z!BloomForQuestionAnswering.forward)NNNNNNNN)r^   r_   r`   re   r   r'   r   FloatTensorr   r   r   rY   rm   r=   r=   rh   r>   r*    s<    	r*  )r   r   r   r  r&  r*  )=rl   r$   r'   r   torch.nnr   r   r   r   r   rE   cache_utilsr	   r
   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_bloomr   
get_loggerr^   r~   rb   r   r   r?   floatr   rH   rO   rS   autogradFunctionrT   Modulerc   rn   r   r   r   r   r   r  r&  r*  __all__r=   r=   r=   r>   <module>   sV   
 $, >	 }sXS