import numpy as np
import torch
from torch import nn
from transformers import GPT2Config, GPT2LMHeadModel
from transformers.modeling_utils import ModuleUtilsMixin

from ...configuration_utils import ConfigMixin, register_to_config
from ...models import ModelMixin


class UniDiffuserTextDecoder(ModelMixin, ConfigMixin, ModuleUtilsMixin):
    """
    Text decoder model for an image-text [UniDiffuser](https://huggingface.co/papers/2303.06555) model. This is used to
    generate text from the UniDiffuser image-text embedding.

    Parameters:
        prefix_length (`int`):
            Max number of prefix tokens that will be supplied to the model.
        prefix_inner_dim (`int`):
            The hidden size of the incoming prefix embeddings. For UniDiffuser, this would be the hidden dim of the
            CLIP text encoder.
        prefix_hidden_dim (`int`, *optional*):
            Hidden dim of the MLP if we encode the prefix.
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
            dot-product/softmax to float() when training with mixed precision.
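
    Example:

        A minimal construction sketch. The values below are illustrative assumptions (they do not correspond to any
        released UniDiffuser checkpoint):

        ```py
        >>> import torch

        >>> # 77 CLIP text tokens with hidden size 768, compressed through a 64-dim prefix bottleneck
        >>> decoder = UniDiffuserTextDecoder(prefix_length=77, prefix_inner_dim=768, prefix_hidden_dim=64)
        >>> prefix_embeds = torch.randn(2, 77, 768)
        >>> input_ids = torch.randint(0, 50257, (2, 16))
        >>> outputs, hidden = decoder(input_ids, prefix_embeds)  # returns a tuple because prefix_hidden_dim is set
        ```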
    zh\.\d+\.attn\.biaszh\.\d+\.attn\.masked_biasNQ           gelu_new皙?h㈵>{Gz?TFprefix_lengthprefix_inner_dimprefix_hidden_dim
vocab_sizen_positionsn_embdn_layern_headn_inneractivation_functionresid_pdrop
embd_pdrop
attn_pdroplayer_norm_epsiloninitializer_rangescale_attn_weights	use_cachescale_attn_by_inverse_layer_idxreorder_and_upcast_attnc                    s   t    || _||kr|d u rtd| d| d|| _|| _| jd ur.t| j| jnt | _	| jd ur?t| j|nt | _
tdi d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|}t|| _d S )Nz>`prefix_hidden_dim` cannot be `None` when `prefix_inner_dim`: z and `n_embd`: z are not equal.r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%    )super__init__r   
ValueErrorr   r   r   LinearIdentityencode_prefixdecode_prefixr   r   transformer)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   
gpt_config	__class__r&   i/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/unidiffuser/modeling_text_decoder.pyr(   @   sd   

	
zUniDiffuserTextDecoder.__init__	input_idsprefix_embedsattention_masklabelsc           
      C   s   | j j |}| |}| |}tj||fdd}|dur1| |jd |j}tj||fdd}| j |||d}	| j	durB|	|fS |	S )a)  
        Args:
            input_ids (`torch.Tensor` of shape `(N, max_seq_len)`):
                Text tokens to use for inference.
            prefix_embeds (`torch.Tensor` of shape `(N, prefix_length, 768)`):
                Prefix embedding to prepend to the embedded tokens.
            attention_mask (`torch.Tensor` of shape `(N, prefix_length + max_seq_len)`, *optional*):
                Attention mask for the prefix embedding.
            labels (`torch.Tensor`, *optional*):
                Labels to use for language modeling.
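
        Example:

            A shape-only sketch, reusing the `decoder` instance from the class-level example above (the batch size
            and token ids below are arbitrary assumptions):

            ```py
            >>> input_ids = torch.randint(0, 50257, (4, 32))  # (N, max_seq_len)
            >>> prefix_embeds = torch.randn(4, 77, 768)  # (N, prefix_length, prefix_inner_dim)
            >>> outputs, hidden = decoder(input_ids, prefix_embeds)
            ```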
        """
        # Embed the text tokens and map the prefix into the GPT-2 embedding space.
        embedding_text = self.transformer.transformer.wte(input_ids)
        hidden = self.encode_prefix(prefix_embeds)
        prefix_embeds = self.decode_prefix(hidden)
        embedding_cat = torch.cat((prefix_embeds, embedding_text), dim=1)

        if labels is not None:
            # Prepend dummy labels for the prefix positions so the LM loss lines up with the concatenated inputs.
            dummy_token = self.get_dummy_token(input_ids.shape[0], input_ids.device)
            labels = torch.cat((dummy_token, input_ids), dim=1)
        out = self.transformer(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask)
        if self.prefix_hidden_dim is not None:
            return out, hidden
        else:
            return out

    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)

    def encode(self, prefix):
        return self.encode_prefix(prefix)

    @torch.no_grad()
    def generate_captions(self, features, eos_token_id, device):
        """
        Generate captions given text embedding features.

        Args:
            features (`torch.Tensor` of shape `(B, L, D)`):
                Text embedding features to generate captions from.
            eos_token_id (`int`):
                The token ID of the EOS token for the text decoder model.
            device:
                Device to perform text generation on.

        Returns:
            `tuple(torch.Tensor, torch.Tensor)`: A tuple of the generated token id sequences (one per input feature)
            and their corresponding sequence lengths.
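
        Example:

            A usage sketch, reusing the `decoder` instance from the class-level example above. The feature tensor and
            EOS token id below are assumptions; in the UniDiffuser pipeline these typically come from the denoised
            text latents and the GPT-2 tokenizer's EOS token:

            ```py
            >>> features = torch.randn(2, 77, 64)  # (B, L, D) in the prefix_hidden_dim space
            >>> tokens, lengths = decoder.generate_captions(features, eos_token_id=50256, device="cpu")
            ```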
        """
        features = torch.split(features, 1, dim=0)
        generated_tokens = []
        generated_seq_lengths = []
        for feature in features:
            # Map the feature back to the GPT-2 embedding space and decode it with beam search.
            feature = self.decode_prefix(feature.to(device))
            output_tokens, seq_lengths = self.generate_beam(
                input_embeds=feature, device=device, eos_token_id=eos_token_id
            )
            generated_tokens.append(output_tokens[0])
            generated_seq_lengths.append(seq_lengths[0])
        generated_tokens = torch.stack(generated_tokens)
        generated_seq_lengths = torch.stack(generated_seq_lengths)
        return generated_tokens, generated_seq_lengths

    @torch.no_grad()
    def generate_beam(
        self,
        input_ids=None,
        input_embeds=None,
        device=None,
        beam_size: int = 5,
        entry_length: int = 67,
        temperature: float = 1.0,
        eos_token_id: int | None = None,
    ):
        """
        Generates text from a text prompt (`input_ids`) or token embedding (`input_embeds`) via beam search. This
        implementation is based on the beam search implementation from the [original UniDiffuser
        code](https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py#L89).

        Args:
            eos_token_id (`int`, *optional*):
                The token ID of the EOS token for the text decoder model.
            input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
                Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds`
                must be supplied.
            input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                An embedded representation to directly pass to the transformer as a prefix for beam search. One of
                `input_ids` and `input_embeds` must be supplied.
            device:
                The device to perform beam search on.
            beam_size (`int`, *optional*, defaults to `5`):
                The number of best states to store during beam search.
            entry_length (`int`, *optional*, defaults to `67`):
                The number of iterations to run beam search.
            temperature (`float`, *optional*, defaults to 1.0):
                The temperature to use when performing the softmax over logits from the decoding model.

        Returns:
            `tuple(torch.Tensor, torch.Tensor)`: A tuple of tensors where the first element is a tensor of generated
            token sequences sorted by score in descending order, and the second element is the sequence lengths
            corresponding to those sequences.
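
        Example:

            A direct-call sketch, reusing the `decoder` instance from the class-level example above
            (`generate_captions` is the usual entry point; the EOS token id below is an assumption):

            ```py
            >>> prefix = torch.randn(1, 77, 768)  # already in the GPT-2 embedding space
            >>> tokens, lengths = decoder.generate_beam(input_embeds=prefix, device="cpu", eos_token_id=50256)
            ```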
        """
        stop_token_index = eos_token_id
        tokens = None
        scores = None
        seq_lengths = torch.ones(beam_size, device=device, dtype=torch.int)
        is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)

        if input_embeds is not None:
            generated = input_embeds
        else:
            generated = self.transformer.transformer.wte(input_ids)

        for i in range(entry_length):
            outputs = self.transformer(inputs_embeds=generated)
            logits = outputs.logits
            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            logits = logits.softmax(-1).log()

            if scores is None:
                # First step: initialize the beams from the top-k tokens.
                scores, next_tokens = logits.topk(beam_size, -1)
                generated = generated.expand(beam_size, *generated.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                if tokens is None:
                    tokens = next_tokens
                else:
                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
                    tokens = torch.cat((tokens, next_tokens), dim=1)
            else:
                # Subsequent steps: extend each beam and keep the best `beam_size` length-normalized scores.
                logits[is_stopped] = -float(np.inf)
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                scores_sum_average = scores_sum / seq_lengths[:, None]
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                tokens = tokens[next_tokens_source]
                tokens = torch.cat((tokens, next_tokens), dim=1)
                generated = generated[next_tokens_source]
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]

            next_token_embed = self.transformer.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
            generated = torch.cat((generated, next_token_embed), dim=1)
            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
            if is_stopped.all():
                break

        scores = scores / seq_lengths
        order = scores.argsort(descending=True)
        # Token sequences are already padded to the maximum generated length.
        output_texts = [tokens[i] for i in order]
        output_texts = torch.stack(output_texts, dim=0)
        seq_lengths = torch.tensor([seq_lengths[i] for i in order], dtype=seq_lengths.dtype)
        return output_texts, seq_lengths