o
    pi7                     @   sx   d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
 ddlmZmZ ddlmZ G d	d
 d
eee
ZdS )    )OptionalN)nn)
GPT2ConfigGPT2LMHeadModel)ModuleUtilsMixin   )ConfigMixinregister_to_config)
ModelMixinc                (       s>  e Zd ZdZddgZe										
	
	
						d<dededee dedededededee dede	de	de	de	de	de
de
d e
d!e
f& fd"d#Z		d=d$ejd%ejd&eej d'eej fd(d)Zd*ed+ejd,ejfd-d.Zd/d0 Ze d1d2 Ze 				3	4	5	d>d6ed7ed8e	d9ee fd:d;Z  ZS )?UniDiffuserTextDecodera  
    Text decoder model for a image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is used to
    generate text from the UniDiffuser image-text embedding.

    Parameters:
        prefix_length (`int`):
            Max number of prefix tokens that will be supplied to the model.
        prefix_inner_dim (`int`):
            The hidden size of the incoming prefix embeddings. For UniDiffuser, this would be the hidden dim of the
            CLIP text encoder.
        prefix_hidden_dim (`int`, *optional*):
            Hidden dim of the MLP if we encode the prefix.
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size)..
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
            dot-product/softmax to float() when training with mixed precision.
    zh\.\d+\.attn\.biaszh\.\d+\.attn\.masked_biasNQ           gelu_new皙?h㈵>{Gz?TFprefix_lengthprefix_inner_dimprefix_hidden_dim
vocab_sizen_positionsn_embdn_layern_headn_inneractivation_functionresid_pdrop
embd_pdrop
attn_pdroplayer_norm_epsiloninitializer_rangescale_attn_weights	use_cachescale_attn_by_inverse_layer_idxreorder_and_upcast_attnc                    s   t    || _||kr|d u rtd| d| d|| _|| _| jd ur.t| j| jnt | _	| jd ur?t| j|nt | _
tdi d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|}t|| _d S )Nz>`prefix_hidden_dim` cannot be `None` when `prefix_inner_dim`: z and `n_embd`: z are not equal.r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&    )super__init__r   
ValueErrorr   r   r   LinearIdentityencode_prefixdecode_prefixr   r   transformer)selfr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   
gpt_config	__class__r'   s/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/unidiffuser/modeling_text_decoder.pyr)   B   sd   

	
zUniDiffuserTextDecoder.__init__	input_idsprefix_embedsattention_masklabelsc           
      C   s   | j j |}| |}| |}tj||fdd}|dur1| |jd |j}tj||fdd}| j |||d}	| j	durB|	|fS |	S )a*  
        Args:
            input_ids (`torch.Tensor` of shape `(N, max_seq_len)`):
                Text tokens to use for inference.
            prefix_embeds (`torch.Tensor` of shape `(N, prefix_length, 768)`):
                Prefix embedding to preprend to the embedded tokens.
            attention_mask (`torch.Tensor` of shape `(N, prefix_length + max_seq_len, 768)`, *optional*):
                Attention mask for the prefix embedding.
            labels (`torch.Tensor`, *optional*):
                Labels to use for language modeling.
           dimNr   )inputs_embedsr8   r7   )
r/   wter-   r.   torchcatget_dummy_tokenshapedevicer   )
r0   r5   r6   r7   r8   embedding_texthiddenembedding_catdummy_tokenoutr'   r'   r4   forward   s   


zUniDiffuserTextDecoder.forward
batch_sizerB   returnc                 C   s   t j|| jt j|dS )N)dtyperB   )r>   zerosr   int64)r0   rI   rB   r'   r'   r4   r@      s   z&UniDiffuserTextDecoder.get_dummy_tokenc                 C   s
   |  |S )N)r-   )r0   prefixr'   r'   r4   encode   s   
zUniDiffuserTextDecoder.encodec           	      C   s~   t j|ddd}g }g }|D ]"}| ||}| j|||d\}}||d  ||d  qt |}t |}||fS )a  
        Generate captions given text embedding features. Returns list[L].

        Args:
            features (`torch.Tensor` of shape `(B, L, D)`):
                Text embedding features to generate captions from.
            eos_token_id (`int`):
                The token ID of the EOS token for the text decoder model.
            device:
                Device to perform text generation on.

        Returns:
            `List[str]`: A list of strings generated from the decoder model.
        r9   r   r:   )input_embedsrB   eos_token_id)r>   splitr.   togenerate_beamappendstack)	r0   featuresrQ   rB   generated_tokensgenerated_seq_lengthsfeatureoutput_tokensseq_lengthsr'   r'   r4   generate_captions   s   


z(UniDiffuserTextDecoder.generate_captions   C         ?	beam_sizeentry_lengthtemperaturerQ   c                    s  |}dd}	t j||t jd t j||t jd}
|dur|}n| jj|}t|D ]}| j|d}|j}|dddddf |dkrE|nd }|	d
 }|	du r||d\}	}|j|g|jdd R  }|dd|	d}}	du r||n~j|gjdd R  t j|fddnfttj ||
< d||
df< |	dddf | } |
   d7  < | dddf  }|d|d\}}||jd  } |  ||jd  }|d}| t j|fdd|| }|  }	|
| }
| jj| |jd dd}t j||fdd}|
||  }
|
 r$ nq*|	  }	|	jd	d
}fdd|D }t j|dd}t j fdd|D  jd | fS )a  
        Generates text using the given tokenizer and text prompt or token embedding via beam search. This
        implementation is based on the beam search implementation from the [original UniDiffuser
        code](https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py#L89).

        Args:
            eos_token_id (`int`, *optional*):
                The token ID of the EOS token for the text decoder model.
            input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
                Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds`
                must be supplied.
            input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                An embedded representation to directly pass to the transformer as a prefix for beam search. One of
                `input_ids` and `input_embeds` must be supplied.
            device:
                The device to perform beam search on.
            beam_size (`int`, *optional*, defaults to `5`):
                The number of best states to store during beam search.
            entry_length (`int`, *optional*, defaults to `67`):
                The number of iterations to run beam search.
            temperature (`float`, *optional*, defaults to 1.0):
                The temperature to use when performing the softmax over logits from the decoding model.

        Returns:
            `Tuple(torch.Tensor, torch.Tensor)`: A tuple of tensors where the first element is a tensor of generated
            token sequences sorted by score in descending order, and the second element is the sequence lengths
            corresponding to those sequences.
        N)rB   rK   )r<   r   r`   r9   r:   T)
descendingc                       g | ]} | qS r'   r'   .0i)tokensr'   r4   
<listcomp>%      z8UniDiffuserTextDecoder.generate_beam.<locals>.<listcomp>c                    rf   r'   r'   rg   )r\   r'   r4   rk   '  rl   )rK   )r>   onesintrL   boolr/   r=   rangelogitssoftmaxlogtopkexpandrA   permutesqueezer?   floatnpinfview	unsqueezeeqallargsortrV   tensorrK   )r0   r5   rP   rB   ra   rb   rc   rQ   stop_token_indexscores
is_stopped	generatedri   outputsrq   next_tokens
scores_sumscores_sum_averagenext_tokens_sourcenext_token_embedorderoutput_textsr'   )r\   rj   r4   rT      s`   (&
"
z$UniDiffuserTextDecoder.generate_beam)Nr   r   r   r   r   Nr   r   r   r   r   r   TTFF)NN)NNNr^   r_   r`   N)__name__
__module____qualname____doc__"_keys_to_ignore_on_load_unexpectedr	   rn   r   strrx   ro   r)   r>   TensorrH   rB   r@   rO   no_gradr]   rT   __classcell__r'   r'   r2   r4   r      s    1	
D
 
r   )typingr   numpyry   r>   r   transformersr   r   transformers.modeling_utilsr   configuration_utilsr   r	   modelsr
   r   r'   r'   r'   r4   <module>   s    