import torch
from torch import nn
from transformers import CLIPPreTrainedModel
from transformers.modeling_outputs import BaseModelOutputWithPooling
from transformers.models.clip.configuration_clip import CLIPTextConfig
from transformers.models.clip.modeling_clip import CLIPEncoder


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    # Broadcast the 2-D keep/pad mask across a singleton head dimension and
    # tgt_len query positions.
    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    # Flip the convention (1 = keep, 0 = pad) and turn it into an additive mask:
    # 0.0 where attended, the dtype's most negative value where masked.
    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
   *r   c                       s   e Zd ZeZdgZdef fddZ								ddejde	dejdB d	ejdB d
ejdB de
dB de
dB de
dB deeB fddZ  ZS )ContextCLIPTextModelCLIPEncoderLayerconfigc                    s"   t  | t|| _|   d S N)super__init__ContextCLIPTextTransformer
text_model	post_init)selfr   	__class__r   r   r   -   s   
zContextCLIPTextModel.__init__Nctx_embeddingsctx_begin_pos	input_idsattention_maskposition_idsoutput_attentionsoutput_hidden_statesreturn_dictreturnc	           	   
   C   s   | j ||||||||dS )N)r&   r'   r(   r)   r*   r+   r,   r-   )r!   )	r#   r&   r'   r(   r)   r*   r+   r,   r-   r   r   r   forward3   s   zContextCLIPTextModel.forward)NNNNNNNN)__name__
__module____qualname__r   config_class_no_split_modulesr   r   Tensorlistr   tupler   r/   __classcell__r   r   r$   r   r   (   s>    	
r   c                       s   e Zd Zdef fddZ						ddejdedejdB dejdB d	ejdB d
edB dedB dedB de	e
B fddZdd Z  ZS )r    r   c                    s:   t    || _|j}t|| _t|| _t	|| _
d S r   )r   r   r   hidden_sizeContextCLIPTextEmbeddings
embeddingsr   encoderr   	LayerNormfinal_layer_normr#   r   	embed_dimr$   r   r   r   K   s   


z#ContextCLIPTextTransformer.__init__Nr&   r'   r(   r)   r*   r+   r,   r-   r.   c	                 C   sB  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| }	|d|	d }| j||||d}
|	\}}|durJ||d7 }| |||
j	
|
j}|dur`t||
j	}| j|
|||||d}|d }| |}|tj|jd |jd|
tjjdd	f }|s||f|dd  S t|||j|jd
S )z
        Returns:

        Nz$You have to specify either input_ids)r(   r*   r&   r'   r
   )inputs_embedsr)   causal_attention_maskr+   r,   r-   r   )devicedim)last_hidden_statepooler_outputhidden_states
attentions)r   r+   r,   use_return_dict
ValueErrorr   viewr;   _build_causal_attention_maskr   r   rD   r   r<   r>   r   arangeshapeintargmaxr   rI   rJ   )r#   r&   r'   r(   r)   r*   r+   r,   r-   input_shaperI   r   seq_lenrC   encoder_outputsrG   pooled_outputr   r   r   r/   S   sZ   	
z"ContextCLIPTextTransformer.forwardc                 C   sB   t j||||d}|t t |j |d |d}|S )N)r   r
   )r   emptyfill_tensorr   r   triu_	unsqueeze)r#   r   rT   r   r   r   r   r   rN      s
   

z7ContextCLIPTextTransformer._build_causal_attention_mask)NNNNNN)r0   r1   r2   r   r   r   r5   r6   r   r7   r   r/   rN   r8   r   r   r$   r   r    J   s8    	
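# Illustrative sketch of the additive causal mask built above, assuming bsz=1,
# seq_len=3, float32, with `t` standing in for any ContextCLIPTextTransformer:
#
#   m = t._build_causal_attention_mask(1, 3, torch.float32)
#   m.shape     # torch.Size([1, 1, 3, 3])
#   m[0, 0]     # [[0., MIN, MIN],
#               #  [0., 0.,  MIN],
#               #  [0., 0.,  0. ]]   with MIN == torch.finfo(torch.float32).min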

class ContextCLIPTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized.
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))

    def forward(
        self,
        ctx_embeddings: torch.Tensor,
        ctx_begin_pos: list,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        if ctx_embeddings is None:
            ctx_len = 0
        else:
            ctx_len = ctx_embeddings.shape[1]

        seq_length = (input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]) + ctx_len

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

            # For each sample, splice the ctx embeddings into the token
            # embeddings at that sample's ctx_begin_pos offset.
            input_embeds_ctx = []
            bsz = inputs_embeds.shape[0]

            if ctx_embeddings is not None:
                for i in range(bsz):
                    cbp = ctx_begin_pos[i]

                    prefix = inputs_embeds[i, :cbp]
                    suffix = inputs_embeds[i, cbp:]

                    input_embeds_ctx.append(torch.cat([prefix, ctx_embeddings[i], suffix], dim=0))

                inputs_embeds = torch.stack(input_embeds_ctx, dim=0)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings
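if __name__ == "__main__":
    # Minimal usage sketch with an arbitrary tiny config; the dimensions below
    # are placeholders for illustration, not the values BLIP-Diffusion ships with.
    config = CLIPTextConfig(
        vocab_size=1000,
        hidden_size=32,
        intermediate_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        max_position_embeddings=77,
    )
    model = ContextCLIPTextModel(config)

    input_ids = torch.randint(0, config.vocab_size, (2, 8))
    # Four context embeddings per sample, injected right after the first token.
    ctx_embeddings = torch.randn(2, 4, config.hidden_size)

    output = model(
        ctx_embeddings=ctx_embeddings,
        ctx_begin_pos=[1, 1],
        input_ids=input_ids,
    )
    # 8 text tokens + 4 context tokens -> sequence length 12.
    print(output.last_hidden_state.shape)  # torch.Size([2, 12, 32])
    print(output.pooler_output.shape)      # torch.Size([2, 32])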