o
    pij                     @   s,  d dl mZmZmZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z d dlmZmZmZ d dlmZmZ d dlmZmZmZmZmZ d d	lmZ d d
lmZmZ eeZ G dd dej!Z"G dd dej!Z#G dd dej!Z$G dd dej!Z%G dd dej!Z&G dd deZ'G dd deZ(dS )    )OptionalTupleUnionN)nn)BertTokenizer)QuickGELUActivation))BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)Blip2ConfigBlip2VisionConfig)Blip2EncoderBlip2PreTrainedModelBlip2QFormerAttentionBlip2QFormerIntermediateBlip2QFormerOutput)apply_chunking_to_forward)loggingreplace_return_docstringsc                       s2   e Zd ZdZ fddZ				dddZ  ZS )	Blip2TextEmbeddingsz;Construct the embeddings from word and position embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	tj
|j|jd| _
t|j| _| dt|jd t|dd| _|| _d S )N)padding_idxepsposition_ids)   position_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr   configselfr1   	__class__ o/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/blip_diffusion/modeling_blip2.pyr   2   s   

zBlip2TextEmbeddings.__init__Nr   c           	      C   s   |d ur|  d }nd}|d u r | jd d ||| f  }|d urQ| |}| jdkr7| |}|| }|d urP|jd }||dd}tj	||fdd}n|}|
|j}| |}| |}|S )Nr   r   r   dim)sizer   cloner$   r   r&   shaperepeatr-   cattodtyper'   r+   )	r3   	input_idsr   query_embedspast_key_values_length
seq_length
embeddingsr&   
batch_sizer6   r6   r7   forwardB   s(   





zBlip2TextEmbeddings.forward)NNNr   )__name__
__module____qualname____doc__r   rG   __classcell__r6   r6   r4   r7   r   /   s    r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Blip2VisionEmbeddingsr1   c                    s   t    || _|j| _|j| _|j| _tt	
dd| j| _tjd| j| j| jdd| _| j| j d | _| jd | _tt	
d| j| j| _d S )Nr      F)in_channelsout_channelskernel_sizestridebias   )r   r   r1   r"   	embed_dim
image_size
patch_sizer   	Parameterr-   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingr2   r4   r6   r7   r   f   s   
zBlip2VisionEmbeddings.__init__pixel_valuesreturnc                 C   s   |j d }| jjj}| |j|d}|ddd}| j|dd|}t	j
||gdd}|| jd d d |dd d f | }|S )Nr   r@   rT   r   r   r8   )r<   r\   weightr@   r?   flatten	transposerZ   r/   r-   r>   r_   r:   )r3   r`   rF   target_dtypepatch_embedsclass_embedsrE   r6   r6   r7   rG   x   s   

,zBlip2VisionEmbeddings.forward)	rH   rI   rJ   r   r   r-   TensorrG   rL   r6   r6   r4   r7   rM   e   s    rM   c                       s:   e Zd Z fddZ										d	ddZ  ZS )
Blip2QFormerEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  |qS r6   )Blip2QFormerLayer).0	layer_idxr1   r6   r7   
<listcomp>       z0Blip2QFormerEncoder.__init__.<locals>.<listcomp>F)	r   r   r1   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr2   r4   rn   r7   r      s   

zBlip2QFormerEncoder.__init__NFTr   c              
      sl  |	rdnd } r
dnd } rdnd }|rdnd }t | jjD ]w}| j| }|	r,||f }|d ur4|| nd }|d ur>|| nd t| jddrj| jrj|rStd d} fdd}tj	j

|||||||}n|||||| }|d }|r||d f7 } r||d	 f }|jr||d
 f }q|	r||f }|
stdd |||||fD S t|||||dS )Nr6   ru   FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...c                    s    fdd}|S )Nc                     s    g | R  S Nr6   )inputs)moduleoutput_attentionspast_key_valuequery_lengthr6   r7   custom_forward   s   zRBlip2QFormerEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr6   )rx   r|   ry   rz   r{   )rx   r7   create_custom_forward   s   z:Blip2QFormerEncoder.forward.<locals>.create_custom_forwardr   r   r   rT   c                 s   s    | ]	}|d ur|V  qd S rv   r6   )rl   vr6   r6   r7   	<genexpr>   s    z.Blip2QFormerEncoder.forward.<locals>.<genexpr>)last_hidden_statepast_key_valueshidden_states
attentionscross_attentions)rr   r1   rs   rt   r0   trainingloggerwarningr-   utils
checkpointhas_cross_attentiontupler   )r3   r   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskr   	use_cachery   output_hidden_statesreturn_dictr{   all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr~   layer_outputsr6   r}   r7   rG      sz   

	
zBlip2QFormerEncoder.forward)
NNNNNNFFTr   rH   rI   rJ   r   rG   rL   r6   r6   r4   r7   rj      s    rj   c                       sD   e Zd Z fddZ							dddZdd	 Zd
d Z  ZS )rk   c                    s~   t    |j| _d| _t|| _|| _||j dkr&t|dd| _d| _	nd| _	t
|| _t
|| _t|| _t|| _d S )Nr   r   T)is_cross_attentionF)r   r   chunk_size_feed_forwardseq_len_dimr   	attentionrm   cross_attention_frequencycrossattentionr   r   intermediateintermediate_queryr   output_queryoutput)r3   r1   rm   r4   r6   r7   r      s   




zBlip2QFormerLayer.__init__NFr   c	              	   C   s:  |d ur
|d d nd }	| j |||||	d}
|
d }|
dd }|
d }|dkr|d d d |d d f }| jrW|d u r@td| j||||||d}|d }||dd  }t| j| j| j|}|jd |krt| j	| j| j|d d |d d d f }t
j||gdd}n
t| j	| j| j|}|f| }||f }|S )	NrT   )ry   rz   r   r   r   z>encoder_hidden_states must be given for cross-attention layers)ry   r8   )r   r   
ValueErrorr   r   feed_forward_chunk_queryr   r   r<   feed_forward_chunkr-   r>   )r3   r   r   r   r   r   rz   ry   r{   self_attn_past_key_valueself_attention_outputsattention_outputoutputspresent_key_valuequery_attention_outputcross_attention_outputslayer_outputlayer_output_textr6   r6   r7   rG      sd   

zBlip2QFormerLayer.forwardc                 C      |  |}| ||}|S rv   )r   r   r3   r   intermediate_outputr   r6   r6   r7   r   G     
z$Blip2QFormerLayer.feed_forward_chunkc                 C   r   rv   )r   r   r   r6   r6   r7   r   L  r   z*Blip2QFormerLayer.feed_forward_chunk_query)NNNNNFr   )rH   rI   rJ   r   rG   r   r   rL   r6   r6   r4   r7   rk      s    
Grk   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	ProjLayer皙?-q=c                    sN   t    t||| _t | _t||| _t|| _	tj
||d| _
d S Nr   )r   r   r   Lineardense1	QuickGELUact_fndense2r)   r+   r'   )r3   in_dimout_dim
hidden_dimdrop_pr   r4   r6   r7   r   T  s   
zProjLayer.__init__c              	   C   s2   |}|  |}| | | | || }|S rv   )r'   r+   r   r   r   )r3   xx_inr6   r6   r7   rG   _  s   
 zProjLayer.forward)r   r   r   r6   r6   r4   r7   r   S  s    r   c                       s   e Zd ZdZeZdef fddZeeed				dde	e
j de	e de	e d	e	e d
eeef f
ddZdd Z  ZS )Blip2VisionModelr`   r1   c                    s\   t  | || _|j}t|| _tj||jd| _	t
|| _tj||jd| _|   d S r   )r   r   r1   r"   rM   rE   r   r'   r(   pre_layernormr   encoderpost_layernorm	post_init)r3   r1   rU   r4   r6   r7   r   m  s   

zBlip2VisionModel.__init__)output_typeconfig_classNry   r   r   ra   c           	      C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| |}| |}| j||||d}|d }| |}|dddddf }| |}|s^||f|dd  S t	|||j
|jdS )z
        Returns:

        Nz You have to specify pixel_values)inputs_embedsry   r   r   r   r   )r   pooler_outputr   r   )r1   ry   r   use_return_dictr   rE   r   r   r   r	   r   r   )	r3   r`   ry   r   r   r   encoder_outputsr   pooled_outputr6   r6   r7   rG   x  s4   



zBlip2VisionModel.forwardc                 C   s   | j S rv   )rE   r3   r6   r6   r7   get_input_embeddings  s   z%Blip2VisionModel.get_input_embeddings)NNNN)rH   rI   rJ   main_input_namer   r   r   r   r	   r   r-   ri   boolr   r   rG   r   rL   r6   r6   r4   r7   r   i  s*    

,r   c                       s   e Zd ZdZdef fddZdd Zdd Zd	d
 Z	dde	j
dee de	jdede	j
f
ddZ										dddZ  ZS )Blip2QFormerModelz:
    Querying Transformer (Q-Former), used in BLIP-2.
    r1   c                    s   t  | || _t|j| _t|j| _t	
td|j|jj| _t|dr-|jd u r6tjddd| _n	tj|jdd| _| jddi t|jj|jj|jjd d	d
d| _t|j| _|   d S )Nr   	tokenizerzbert-base-uncasedright)truncation_side	bos_tokenz[DEC]   r   r   )r   r   r   r   r   )r   r   r1   r   qformer_configrE   r   vision_configvisual_encoderr   rX   r-   zerosnum_query_tokensr"   query_tokenshasattrr   r   from_pretrainedadd_special_tokensr   
proj_layerrj   r   r   r2   r4   r6   r7   r     s$   
zBlip2QFormerModel.__init__c                 C   s   | j jS rv   rE   r$   r   r6   r6   r7   r     s   z&Blip2QFormerModel.get_input_embeddingsc                 C   s   || j _d S rv   r   )r3   valuer6   r6   r7   set_input_embeddings  s   z&Blip2QFormerModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   rt   r   prune_heads)r3   heads_to_prunert   headsr6   r6   r7   _prune_heads  s   zBlip2QFormerModel._prune_headsFr   input_shapedevice	has_queryra   c                 C   s   |  dkr|dddddddf }n|  dkr(|ddddddf }n	td||j|j| jd}d| d }|S )a=  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        rN   NrT   zAWrong shape for input_ids (shape {}) or attention_mask (shape {})rb   g      ?g     )r9   r   formatr<   r?   r@   )r3   r   r   r   r   extended_attention_maskr6   r6   r7   get_extended_attention_mask  s   z-Blip2QFormerModel.get_extended_attention_maskNc                     s^   j |ddd}| j}|j}|jd }tj| j d ftj	d j}tj
||jgdd}|dur9|n jj}|	durC|	n jj}	|
durM|
n jj}
|durb|d d jd	  jj nd} jjd } j| j|d
}| dd }|\}}|j} |j}|}|du rtj||| f|d} |||}|durt|tr|d  \}}}n| \}}}||f}t|tr͇ fdd|D }n|du rtj||d} |}n |}nd} | jjj} j|||||||||	|
|d}|d }|dddddf }|
s! |ddd|ddf S t|||j|j|j |j!dS )a	  
        encoder_hidden_states  (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
            shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
            value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
            used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
            value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
            `(batch_size, sequence_length)`.
        use_cache (`bool`, `optional`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        ptT)return_tensorspaddingr   r   rb   r8   NrT   )rA   rB   rC   r   )r   c                    s   g | ]}  |qS r6   )invert_attention_mask)rl   maskr   r6   r7   ro   X  rp   z-Blip2QFormerModel.forward.<locals>.<listcomp>)
r   r   r   r   r   r   ry   r   r   r{   )r   r   r   r   r   r   )"r   r?   r   rA   r<   r-   onesr   r:   longr>   r   r1   ry   r   r   r{   rE   r   r   r   
isinstancelistr   get_head_maskr   rs   r   r   r
   r   r   r   r   ) r3   
text_inputimage_inputr   r   r   r   r   ry   r   r   textrA   rF   
query_attsr   rC   r{   embedding_outputr   rD   r   image_embeds_frozenr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskr   sequence_outputr   r6   r   r7   rG     s   !
&$	

 zBlip2QFormerModel.forward)F)
NNNNNNNNNN)rH   rI   rJ   rK   r   r   r   r   r   r-   ri   r   intr   r   r   rG   rL   r6   r6   r4   r7   r     s:    
/r   ))typingr   r   r   r-   torch.utils.checkpointr   transformersr   transformers.activationsr   r   transformers.modeling_outputsr   r	   r
   /transformers.models.blip_2.configuration_blip_2r   r   *transformers.models.blip_2.modeling_blip_2r   r   r   r   r   transformers.pytorch_utilsr   transformers.utilsr   r   
get_loggerrH   r   Moduler   rM   rj   rk   r   r   r   r6   r6   r6   r7   <module>   s&   
6 ggA