o
    Gii                     @   s  d dl Z d dl mZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZmZ d dlmZmZmZmZmZ d dlmZ d d	lmZmZ eeZG d
d dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd deZ"G dd deZ#dS )    N)nn)BertTokenizer)QuickGELUActivation))BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)Blip2ConfigBlip2VisionConfig)Blip2EncoderBlip2PreTrainedModelBlip2QFormerAttentionBlip2QFormerIntermediateBlip2QFormerOutput)apply_chunking_to_forward)loggingreplace_return_docstringsc                       s2   e Zd ZdZ fddZ				dddZ  ZS )	Blip2TextEmbeddingsz;Construct the embeddings from word and position embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	tj
|j|jd| _
t|j| _| dt|jd t|dd| _|| _d S )N)padding_idxepsposition_ids)   position_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr   configselfr.   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/blip_diffusion/modeling_blip2.pyr   /   s   

zBlip2TextEmbeddings.__init__Nr   c           	      C   s   |d ur|  d }nd}|d u r | jd d ||| f  }|d urQ| |}| jdkr7| |}|| }|d urP|jd }||dd}tj	||fdd}n|}|
|j}| |}| |}|S )Nr   r   r   dim)sizer   cloner!   r   r#   shaperepeatr*   cattodtyper$   r(   )	r0   	input_idsr   query_embedspast_key_values_length
seq_length
embeddingsr#   
batch_sizer3   r3   r4   forward?   s(   





zBlip2TextEmbeddings.forward)NNNr   )__name__
__module____qualname____doc__r   rD   __classcell__r3   r3   r1   r4   r   ,   s    r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Blip2VisionEmbeddingsr.   c                    s   t    || _|j| _|j| _|j| _tt	
dd| j| _tjd| j| j| jdd| _| j| j d | _| jd | _tt	
d| j| j| _d S )Nr      F)in_channelsout_channelskernel_sizestridebias   )r   r   r.   r   	embed_dim
image_size
patch_sizer   	Parameterr*   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingr/   r1   r3   r4   r   c   s   
zBlip2VisionEmbeddings.__init__pixel_valuesreturnc                 C   s   |j d }| jjj}| |j|d}|ddd}| j|dd|}t	j
||gdd}|| jd d d |dd d f | }|S )Nr   r=   rQ   r   r   r5   )r9   rY   weightr=   r<   flatten	transposerW   r,   r*   r;   r\   r7   )r0   r]   rC   target_dtypepatch_embedsclass_embedsrB   r3   r3   r4   rD   u   s   

,zBlip2VisionEmbeddings.forward)	rE   rF   rG   r	   r   r*   TensorrD   rI   r3   r3   r1   r4   rJ   b   s    rJ   c                       s:   e Zd Z fddZ										d	ddZ  ZS )
Blip2QFormerEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  |qS r3   )Blip2QFormerLayer).0	layer_idxr.   r3   r4   
<listcomp>       z0Blip2QFormerEncoder.__init__.<locals>.<listcomp>F)	r   r   r.   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr/   r1   rk   r4   r      s   

zBlip2QFormerEncoder.__init__NFTr   c                 C   s\  |	rdnd }|r
dnd }|rdnd }|rdnd }t | jjD ]o}| j| }|	r,||f }|d ur4|| nd }|d ur>|| nd }t| jddrbt rb|rTtd d}| 	|||||||||	}n|||||||||}|d }|rz||d f7 }|r||d f }|j
r||d f }q|	r||f }|
std	d
 |||||fD S t|||||dS )Nr3   rr   FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r   r   r   rQ   c                 s   s    | ]	}|d ur|V  qd S Nr3   )ri   vr3   r3   r4   	<genexpr>   s    z.Blip2QFormerEncoder.forward.<locals>.<genexpr>)last_hidden_statepast_key_valueshidden_states
attentionscross_attentions)ro   r.   rp   rq   r-   r*   is_grad_enabledloggerwarning_gradient_checkpointing_funchas_cross_attentiontupler   )r0   rx   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskrw   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictquery_lengthall_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskpast_key_valuelayer_outputsr3   r3   r4   rD      s~   


zBlip2QFormerEncoder.forward)
NNNNNNFFTr   rE   rF   rG   r   rD   rI   r3   r3   r1   r4   rg      s    rg   c                       sD   e Zd Z fddZ							dddZdd	 Zd
d Z  ZS )rh   c                    s~   t    |j| _d| _t|| _|| _||j dkr&t|dd| _d| _	nd| _	t
|| _t
|| _t|| _t|| _d S )Nr   r   T)is_cross_attentionF)r   r   chunk_size_feed_forwardseq_len_dimr   	attentionrj   cross_attention_frequencycrossattentionr   r   intermediateintermediate_queryr   output_queryoutput)r0   r.   rj   r1   r3   r4   r      s   




zBlip2QFormerLayer.__init__NFr   c	              	   C   s:  |d ur
|d d nd }	| j |||||	d}
|
d }|
dd }|
d }|dkr|d d d |d d f }| jrW|d u r@td| j||||||d}|d }||dd  }t| j| j| j|}|jd |krt| j	| j| j|d d |d d d f }t
j||gdd}n
t| j	| j| j|}|f| }||f }|S )	NrQ   )r   r   r   r   r   z>encoder_hidden_states must be given for cross-attention layers)r   r5   )r   r   
ValueErrorr   r   feed_forward_chunk_queryr   r   r9   feed_forward_chunkr*   r;   )r0   rx   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsattention_outputoutputspresent_key_valuequery_attention_outputcross_attention_outputslayer_outputlayer_output_textr3   r3   r4   rD      sd   

zBlip2QFormerLayer.forwardc                 C      |  |}| ||}|S rs   )r   r   r0   r   intermediate_outputr   r3   r3   r4   r   A     
z$Blip2QFormerLayer.feed_forward_chunkc                 C   r   rs   )r   r   r   r3   r3   r4   r   F  r   z*Blip2QFormerLayer.feed_forward_chunk_query)NNNNNFr   )rE   rF   rG   r   rD   r   r   rI   r3   r3   r1   r4   rh      s    
Grh   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	ProjLayer皙?-q=c                    sN   t    t||| _t | _t||| _t|| _	tj
||d| _
d S Nr   )r   r   r   Lineardense1	QuickGELUact_fndense2r&   r(   r$   )r0   in_dimout_dim
hidden_dimdrop_pr   r1   r3   r4   r   N  s   
zProjLayer.__init__c              	   C   s2   |}|  |}| | | | || }|S rs   )r$   r(   r   r   r   )r0   xx_inr3   r3   r4   rD   Y  s   
 zProjLayer.forward)r   r   r   r3   r3   r1   r4   r   M  s    r   c                       s|   e Zd ZdZeZdef fddZeeed				dde	j
dB dedB dedB d	edB d
eeB f
ddZdd Z  ZS )Blip2VisionModelr]   r.   c                    s\   t  | || _|j}t|| _tj||jd| _	t
|| _tj||jd| _|   d S r   )r   r   r.   r   rJ   rB   r   r$   r%   pre_layernormr
   encoderpost_layernorm	post_init)r0   r.   rR   r1   r3   r4   r   g  s   

zBlip2VisionModel.__init__)output_typeconfig_classNr   r   r   r^   c           	      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| |}| |}| j||||d}|d }| |}|dddddf }| |}|s^||f|dd  S t	|||j
|jdS )z
        Returns:

        Nz You have to specify pixel_values)inputs_embedsr   r   r   r   r   )rv   pooler_outputrx   ry   )r.   r   r   use_return_dictr   rB   r   r   r   r   rx   ry   )	r0   r]   r   r   r   rx   encoder_outputsrv   pooled_outputr3   r3   r4   rD   r  s4   



zBlip2VisionModel.forwardc                 C   s   | j S rs   )rB   r0   r3   r3   r4   get_input_embeddings  s   z%Blip2VisionModel.get_input_embeddings)NNNN)rE   rF   rG   main_input_namer	   r   r   r   r   r*   rf   boolr   rD   r   rI   r3   r3   r1   r4   r   c  s*    
,r   c                       s   e Zd ZdZdef fddZdd Zdd Zd	d
 Z	dde	j
dee de	jdede	j
f
ddZ										dddZ  ZS )Blip2QFormerModelz:
    Querying Transformer (Q-Former), used in BLIP-2.
    r.   c                    s   t  | || _t|j| _t|j| _t	
td|j|jj| _t|dr-|jd u r6tjddd| _n	tj|jdd| _| jddi t|jj|jj|jjd d	d
d| _t|j| _|   d S )Nr   	tokenizerzbert-base-uncasedright)truncation_side	bos_tokenz[DEC]   r   r   )r   r   r   r   r   )r   r   r.   r   qformer_configrB   r   vision_configvisual_encoderr   rU   r*   zerosnum_query_tokensr   query_tokenshasattrr   r   from_pretrainedadd_special_tokensr   
proj_layerrg   r   r   r/   r1   r3   r4   r     s$   
zBlip2QFormerModel.__init__c                 C   s   | j jS rs   rB   r!   r   r3   r3   r4   r     s   z&Blip2QFormerModel.get_input_embeddingsc                 C   s   || j _d S rs   r   )r0   valuer3   r3   r4   set_input_embeddings  s   z&Blip2QFormerModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   rq   r   prune_heads)r0   heads_to_prunerq   headsr3   r3   r4   _prune_heads  s   zBlip2QFormerModel._prune_headsFr   input_shapedevice	has_queryr^   c                 C   s   |  dkr|dddddddf }n|  dkr(|ddddddf }n	td||j|j| jd}d| d }|S )a=  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        rK   NrQ   zAWrong shape for input_ids (shape {}) or attention_mask (shape {})r_   g      ?g     )r6   r   formatr9   r<   r=   )r0   r   r   r   r   extended_attention_maskr3   r3   r4   get_extended_attention_mask  s   z-Blip2QFormerModel.get_extended_attention_maskNc                     s^   j |ddd}| j}|j}|jd }tj| j d ftj	d j}tj
||jgdd}|dur9|n jj}|	durC|	n jj}	|
durM|
n jj}
|durb|d d jd	  jj nd} jjd } j| j|d
}| dd }|\}}|j} |j}|}|du rtj||| f|d} |||}|durt|tr|d  \}}}n| \}}}||f}t|tr͇ fdd|D }n|du rtj||d} |}n |}nd} | jjj} j|||||||||	|
|d}|d }|dddddf }|
s! |ddd|ddf S t|||j|j|j |j!dS )a	  
        encoder_hidden_states  (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
            shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
            value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
            used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
            value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
            `(batch_size, sequence_length)`.
        use_cache (`bool`, `optional`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        ptT)return_tensorspaddingr   r   r_   r5   NrQ   )r>   r?   r@   r   )r   c                    s   g | ]}  |qS r3   )invert_attention_mask)ri   maskr   r3   r4   rl   R  rm   z-Blip2QFormerModel.forward.<locals>.<listcomp>)
r   r   r   r   rw   r   r   r   r   r   )rv   r   rw   rx   ry   rz   )"r   r<   r   r>   r9   r*   onesr   r7   longr;   r   r.   r   r   r   r   rB   r   rv   r   
isinstancelistr   get_head_maskr   rp   r   r   r   rw   rx   ry   rz   ) r0   
text_inputimage_inputr   r   r   rw   r   r   r   r   textr>   rC   
query_attsr   r@   r   embedding_outputr   rA   r   image_embeds_frozenr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskr   sequence_outputr   r3   r   r4   rD     s   !
&$	

 zBlip2QFormerModel.forward)F)
NNNNNNNNNN)rE   rF   rG   rH   r   r   r   r   r   r*   rf   r   intr   r   r   rD   rI   r3   r3   r1   r4   r     s:    
/r   )$r*   r   transformersr   transformers.activationsr   r   transformers.modeling_outputsr   r   r   /transformers.models.blip_2.configuration_blip_2r   r	   *transformers.models.blip_2.modeling_blip_2r
   r   r   r   r   transformers.pytorch_utilsr   transformers.utilsr   r   
get_loggerrE   r|   Moduler   rJ   rg   rh   r   r   r   r3   r3   r3   r4   <module>   s"   
6 dgA