o
    i                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	  m
Z d dlm	Z	 ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/ e&0e1Z2e$G dd deZ3ee$ddG dd deZ4ee$ddG dd deZ5ee$ddG dd deZ6G d d! d!e	j7Z8d"ej9d#e:d$ej9fd%d&Z;	'dad(e	j7d)ej9d*ej9d+ej9d,eej9 d-e<d.e<d/e!e# fd0d1Z=G d2d3 d3e	j7Z>G d4d5 d5e	j7Z?G d6d7 d7eZ@G d8d9 d9e	j7ZAG d:d; d;e	j7ZBG d<d= d=e	j7ZCG d>d? d?eZDe$G d@dA dAe3ZEG dBdC dCe	j7ZFG dDdE dEe	j7ZGG dFdG dGe	j7ZHG dHdI dIe	j7ZIG dJdK dKe	j7ZJG dLdM dMe	j7ZKG dNdO dOe	j7ZLG dPdQ dQe	j7ZMG dRdS dSe	j7ZNe$dTdG dUdV dVe3ZOG dWdX dXe	j7ZPG dYdZ dZe	j7ZQe$d[dG d\d] d]e3ZRG d^d_ d_e3eZSg d`ZTdS )b    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   @   s>   e Zd ZU eed< dZdZddgZddgZdZ	dZ
dZdZd	S )
JanusPreTrainedModelconfigmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFN)__name__
__module____qualname__r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph!_supports_param_buffer_assignment r5   r5   \/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/janus/modeling_janus.pyr"   /   s   
 r"   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   @   s6   e Zd ZU dZdZeej ed< dZ	eej ed< dS )JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
r)   r*   r+   __doc__r9   r   torchFloatTensorr,   r:   r5   r5   r5   r6   r8   =   s   
 r8   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   @   sv   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dS )JanusBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_stater'   hidden_states
attentionsimage_hidden_states)r)   r*   r+   r;   r?   r   r<   r=   r,   r'   r	   r@   tuplerA   rB   r5   r5   r5   r6   r>   O   s   
 r>   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dS )	JanusCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr'   r@   rA   rB   )r)   r*   r+   r;   rE   r   r<   r=   r,   rF   r'   r	   r@   rC   rA   rB   r5   r5   r5   r6   rD   p   s   
 rD   c                       s\   e Zd Zdef fddZdejdededejfdd	Zddejde	dejfddZ
  ZS )JanusVisionEmbeddingsr#   c                    s   t    || _|j| _|j| _|j| _tj|j	| j| j| jdd| _
| j| j d | _| j| _t| j| j| _| jdt| jddd d S )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   F)
persistent)super__init__r#   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr<   arangeexpandselfr#   	__class__r5   r6   rR      s    
"zJanusVisionEmbeddings.__init__
embeddingsheightwidthreturnc                 C   s   |j d }| jjj d }tj s||kr||kr| | jS | jjd}|j d }|| j }|| j }	t	|d }
|
d|
|
|}|dddd}tjj|||	fddd	}|dddddd|}|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   rO   g      ?r   r   bicubicF)sizemodealign_corners)shaper]   weightr<   jit
is_tracingrN   	unsqueezerV   r   reshapepermuter   
functionalinterpolateview)rb   re   rf   rg   rZ   r[   patch_pos_embeddim
new_height	new_widthsqrt_num_positionsr5   r5   r6   interpolate_pos_encoding   s&   




z.JanusVisionEmbeddings.interpolate_pos_encodingFpixel_valuesr|   c           
      C   sh   |j \}}}}| jjj}| |j|d}|ddd}|r(| |||}	n| | j	}	||	 }|S )N)dtyper   r   )
rm   rY   rn   r~   toflatten	transposer|   r]   rN   )
rb   r}   r|   _rf   rg   target_dtypepatch_embedsre   
pos_embedsr5   r5   r6   forward   s   
zJanusVisionEmbeddings.forward)F)r)   r*   r+   r    rR   r<   Tensorintr|   boolr   __classcell__r5   r5   rc   r6   rG      s    $&rG   r@   n_reprh   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rm   r`   rr   )r@   r   batchnum_key_value_headsslenhead_dimr5   r5   r6   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   rO   )rx   r~   )ptrainingr   )r   num_key_value_groupsr<   matmulr   rm   r   rt   softmaxfloat32r   r~   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsr(   attn_outputr5   r5   r6   eager_attention_forward   s   
&r   c                       sL   e Zd ZdZdef fddZ	ddejdeej de	e
 fd	d
Z  ZS )JanusVisionAttentionz(Attention Class for Janus Vision Encoderr#   c                    sL  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _
|j}|j}d| _d| _tj| j| j| j |jd| _tj| j| j| j |jd| _tj| j| j| j |jd| _t| j| j| _|dkrt|nt | _|rt| jnt | _|rt| j| _d S t | _d S )	N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: ).      Fr   biasr   )rQ   rR   r#   rS   rT   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)rb   r#   proj_dropoutqk_normrc   r5   r6   rR     s0   

$zJanusVisionAttention.__init__Nr@   r   r   c                 K   s4  |  \}}}| |}| |}| |}	|d| j| j}| |}|d| j| j}| |}|||| j| j	dd}|||| j| j	dd}|	
||| j| j	dd}	t}
| jjdkrjt| jj }
|
| |||	|f| jsvdn| j| j| jd|\}}|||| j}| |}| |}||fS )NrO   r   r   eagerr   )r   r   r   )rj   r   r   r   rr   r   r   r   r   r   rv   r   r#   _attn_implementationr   r   r   r   r   rT   r   r   )rb   r@   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputr5   r5   r6   r   !  s>   




	


zJanusVisionAttention.forwardN)r)   r*   r+   r;   r    rR   r<   r   r   r   r   r   r   r5   r5   rc   r6   r     s     r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )JanusVisionMLPr#   c                    sr   t    || _t|j|j | _t|j | _	t
|j| j| _t
| j|j| _t
|j| _t
|j| _d S r   )rQ   rR   r#   r   rS   	mlp_ratiointermediate_sizer   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2ra   rc   r5   r6   rR   N  s   
zJanusVisionMLP.__init__r@   rh   c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r   r   r   r   r   rb   r@   r5   r5   r6   r   X  s   




zJanusVisionMLP.forward)	r)   r*   r+   r    rR   r<   r   r   r   r5   r5   rc   r6   r   M  s    
r   c                	       J   e Zd Zdef fddZedejdejdee	 dej
fdd	Z  ZS )
r&   r#   c                    sX   t    |j| _tj| j|jd| _t|| _	tj| j|jd| _
t|| _|| _d S N)eps)rQ   rR   rS   rT   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr#   ra   rc   r5   r6   rR   b  s   



z JanusVisionEncoderLayer.__init__r@   r   r   rh   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)r@   r   r5   r   r   r   r   rb   r@   r   r   residualr   r5   r5   r6   r   k  s   



zJanusVisionEncoderLayer.forward)r)   r*   r+   r    rR   r   r<   r   r   r   r=   r   r   r5   r5   rc   r6   r&   a  s    	r&   c                       sN   e Zd ZdZdef fddZe	ddeej	 de
e defd	d
Z  ZS )JanusVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    r#   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r5   )r&   .0r   r#   r5   r6   
<listcomp>  s    z/JanusVisionEncoder.__init__.<locals>.<listcomp>F)	rQ   rR   r#   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingra   rc   r   r6   rR     s   
 
zJanusVisionEncoder.__init__Nr   r   rh   c                 K   s,   |}| j D ]}|||fi |}qt|dS )N)r?   )r   r   )rb   inputs_embedsr   r   r@   encoder_layerr5   r5   r6   r     s   

zJanusVisionEncoder.forwardr   )r)   r*   r+   r;   r    rR   r   r   r<   r   r   r   r   r   r   r5   r5   rc   r6   r     s    r   c                       sv   e Zd ZdZ fddZdejdedefddZ		dd
ejde	ej de
eje	ej e	e
ej  f fddZ  ZS )JanusAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    s  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	d| _
|j| _tj| jd| j dd| _|jr]tt| j}tt| j}nd }d }|d uryt|tj|dd|f}t|| j_t| j| j| _d S )	Nr   r   r   r   Fr   r   )requires_grad)rQ   rR   r#   rS   rT   r   r   r   r   r   r   r   r   r   qkvqkv_bias	Parameterr<   zeroscat
zeros_liker   
projection)rb   r#   q_biasv_biasr   rc   r5   r6   rR     s0   

zJanusAttention.__init__tensorr   bszc                 C   s    | ||| j| jdd S )Nr   r   )rv   r   r   r   r   )rb   r   r   r   r5   r5   r6   _shape  s    zJanusAttention._shapeNr@   	head_maskrh   c                 K   s   |  \}}}| |}|||d| j|| j ddddd}|d |d |d }}	}
t}| jjdkr<t| jj }|| ||	|
fd| j	sHdn| j
| jd	|\}}|||d
 }| |}||fS )z#Input shape: Batch x Time x Channelr   r   r   r      r   Nr   )r   r   r   rO   )rj   r   rr   r   rs   r   r#   r   r   r   r   r   r   r   )rb   r@   r   r   r   tgt_lenrT   	mixed_qkvr   r   r   r   r   r   r5   r5   r6   r     s0   



zJanusAttention.forwardr   )r)   r*   r+   r;   rR   r<   r   r   r   r   rC   r   r   r5   r5   rc   r6   r     s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )JanusMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S r   )rQ   rR   r#   r   r   r   r   r   rS   r   r   r   ra   rc   r5   r6   rR     s
   
zJanusMLP.__init__r@   rh   c                 C   "   |  |}| |}| |}|S r   )r   r   r   r   r5   r5   r6   r        


zJanusMLP.forward)r)   r*   r+   rR   r<   r   r   r   r5   r5   rc   r6   r     s    r   c                	       r   )
JanusEncoderLayerr#   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S r   )rQ   rR   rS   rT   r   r   r   r   r   r   r   r   r   ra   rc   r5   r6   rR     s   


zJanusEncoderLayer.__init__r@   r   r   rh   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)r@   r   r5   r   r   r5   r5   r6   r   
  s   



zJanusEncoderLayer.forward)r)   r*   r+   r   rR   r   r<   r   r   r   r=   r   r   r5   r5   rc   r6   r    s    r  c                       s   e Zd ZU dZeed< eedZdef fddZ	e
dde		ddeej d	ed
ee deeef fddZdd Z  ZS )JanusVisionModelr}   r#   )r@   rA   c                    sJ   t  | || _|j}t|| _t|| _tj	||j
d| _|   d S r   )rQ   rR   r#   rS   rG   re   r   encoderr   r   r   post_layernorm	post_init)rb   r#   rT   rc   r5   r6   rR   ,  s   

zJanusVisionModel.__init__F)tie_last_hidden_statesNr|   r   rh   c                 K   sn   |d u rt d| j||d}| jdd|i|}|j}| |}|d d dd d f }| |}t||dS )Nz You have to specify pixel_values)r|   r   r   )r?   pooler_outputr5   )r   re   r  r?   r  r   )rb   r}   r|   r   r@   encoder_outputsr?   pooled_outputr5   r5   r6   r   7  s    

zJanusVisionModel.forwardc                 C   s   | j S r   )re   rb   r5   r5   r6   get_input_embeddingsT  s   z%JanusVisionModel.get_input_embeddingsNF)r)   r*   r+   main_input_namer    r,   r  r   _can_record_outputsrR   r   r   r   r<   r=   r   r   r   r   rC   r   r   r  r   r5   r5   rc   r6   r  #  s*   
 
r  c                       *   e Zd Zdef fddZdd Z  ZS )JanusVisionAlignerMLPr#   c                    N   t    t j j| _t fddtd j	D | _
t j | _d S )Nc                       g | ]
}t  j jqS r5   r   r   projection_dimr   r   r5   r6   r   ^      z2JanusVisionAlignerMLP.__init__.<locals>.<listcomp>r   )rQ   rR   r   r   rS   r  r   r   r   depthhidden_layersr   r   r   ra   rc   r   r6   rR   Y     
zJanusVisionAlignerMLP.__init__c                 C   ,   |  |}| jD ]}| |}||}q|S r   r   r  r   rb   r@   layerr5   r5   r6   r   b  
   



zJanusVisionAlignerMLP.forward)r)   r*   r+   r    rR   r   r   r5   r5   rc   r6   r  X      	r  c                       sL   e Zd ZdZdef fddZdejfddZdej	d	ej
fd
dZ  ZS )JanusVQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r#   c                    sL   t    |j| _|j| _t|dd| _t| j| j| _	|j
gd | _d S )Nbetag      ?r   )rQ   rR   num_embeddingsrT   embedding_dimgetattrr"  r   r\   	embeddingrZ   quant_state_dimsra   rc   r5   r6   rR   u  s   
z"JanusVQVAEVectorQuantizer.__init__hidden_statec              
   C   s   | dddd }|d| j}tj|d dddtj| jjd dd dtd	|| jj	dd  }tj
|dd}| ||j}t| | d | jt||  d   }|||   }| dddd }|||fS )
Nr   r   r   r   rO   T)rx   keepdimrx   z	bd,dn->bn)rs   r   rv   r$  r<   sumr&  rn   einsumr   argminrm   meandetachr"  )rb   r(  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrE   r5   r5   r6   r   ~  s    
z!JanusVQVAEVectorQuantizer.forwardimage_tokensrh   c                 C   sb   |j d }| jjj d }| |}tj|ddd}||g| j|R }|dddd }|S )Nr   rO   r   )r   rx   r   r   )	rm   r&  rn   F	normalizerv   r'  rs   r   )rb   r4  r   emb_dimr3  r5   r5   r6   get_codebook_entry  s   

z,JanusVQVAEVectorQuantizer.get_codebook_entry)r)   r*   r+   r;   r!   rR   r<   r   r   
LongTensorr=   r8  r   r5   r5   rc   r6   r!  j  s
    
	r!  c                       s*   e Zd Z		d fdd	Zdd Z  ZS )JanusVQVAEResnetBlockNFc                    s   t    || _|d u r|n|| _|| _tjjd|ddd| _tjj	||dddd| _
tjjd|ddd| _tj|j| _tjj	||dddd| _| j| jkro| jratjj	||dddd| _d S tjj	||dddd| _d S d S )	N    ư>T
num_groupsrX   r   affiner   r   rK   rL   rM   r   )rQ   rR   rI   rJ   use_conv_shortcutr<   r   	GroupNormnorm1rW   conv1norm2r   r   conv2conv_shortcutnin_shortcut)rb   r#   rI   rJ   rG  rc   r5   r6   rR     s   
zJanusVQVAEResnetBlock.__init__c                 C   s   |}|  |}|t|9 }| |}| |}|t|9 }| |}| |}| j| jkr@| j	r;| 
|}|| S | |}|| S r   )rC  r<   sigmoidrD  rE  r   rF  rI   rJ   rA  rG  rH  )rb   r@   r   r5   r5   r6   r     s   






zJanusVQVAEResnetBlock.forwardr  r)   r*   r+   rR   r   r   r5   r5   rc   r6   r:    s
    r:  c                       $   e Zd Z fddZdd Z  ZS )JanusVQVAEAttnBlockc                    s   t    || _tjjd|ddd| _tjj||dddd| _tjj||dddd| _	tjj||dddd| _
tjj||dddd| _d S )Nr;  r<  Tr=  r   r   r@  )rQ   rR   rI   r<   r   rB  normrW   qkvproj_outrb   rI   rc   r5   r6   rR     s   
zJanusVQVAEAttnBlock.__init__c                 C   s   |}|  |}| |}| |}| |}|j\}}}}	|||||	 ddd}|||||	 }t||}
|
t	|d  }
t
j|
dd}
|||||	 }|
ddd}
t||
||||	}| |}|| S )Nr   r   r   r   r*  )rM  rN  rO  rP  rm   rr   rs   r<   bmmr   r5  r   rQ  )rb   r@   r   r   r   r   r   channelsrf   rg   r   r   r5   r5   r6   r     s    




zJanusVQVAEAttnBlock.forwardrJ  r5   r5   rc   r6   rL    s    
rL  c                       rK  )JanusVQVAEConvDownsamplec                    s$   t    tj||dddd| _d S )Nr   r   r   r@  )rQ   rR   r   rW   convrR  rc   r5   r6   rR     s   
z!JanusVQVAEConvDownsample.__init__c                 C   s    t j|dddd}| |}|S )N)r   r   r   r   constantr   )padrk   r   )r5  rX  rV  r   r5   r5   r6   r     s   
z JanusVQVAEConvDownsample.forwardrJ  r5   r5   rc   r6   rU        rU  c                       rK  )JanusVQVAEConvUpsamplec                    s&   t    tjj||dddd| _d S )Nr   r   r@  )rQ   rR   r<   r   rW   rV  rR  rc   r5   r6   rR     s   
zJanusVQVAEConvUpsample.__init__c                 C   s   t j|ddd}| |}|S )Ng       @nearest)scale_factorrk   )r5  ru   rV  r   r5   r5   r6   r   	  s   
zJanusVQVAEConvUpsample.forwardrJ  r5   r5   rc   r6   rZ    rY  rZ  c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	JanusVQVAEMidBlockr#   rT  c                    s8   t    t|||d| _t|| _t|||d| _d S )Nr#   rI   rJ   )rQ   rR   r:  block_1rL  attn_1block_2)rb   r#   rT  rc   r5   r6   rR     s   

zJanusVQVAEMidBlock.__init__r@   rh   c                 C   r  r   )r_  r`  ra  r   r5   r5   r6   r     r  zJanusVQVAEMidBlock.forward)
r)   r*   r+   r!   r   rR   r<   r   r   r   r5   r5   rc   r6   r]    s    r]  c                       s,   e Zd Z fddZdejfddZ  ZS )JanusVQVAEEncoderc              	      sn  t    t|j| _|j| _|j}|j}|j}|j	}|j}t
jj||dddd| _dt| }|| _t | _t| jD ]T}t }	t }
|||  }|||  }t| jD ]}|	t|||d |}|| jd krt|
t| qXt }|	|_|
|_|| jd krt||_| j| q=t||| _t
jjd|ddd	| _t
jj||rd
| n|dddd| _d S )Nr   r   r@  )r   r^  r;  r<  Tr=  r   ) rQ   rR   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrI   double_latentlatent_channelsr<   r   rW   conv_inrC   in_channel_multiplierr   downr   appendr:  rL  ModuleblockattnrU  
downsampler]  midrB  norm_outconv_out)rb   r#   rg  rI   rh  ri  rd  rk  i_levelro  rp  block_in	block_outi_blockrl  rc   r5   r6   rR   &  sX   


zJanusVQVAEEncoder.__init__r}   c                 C   s   |  |g}t| jD ]C}t| jD ]'}| j| j| |d }t| j| jdkr4| j| j| |}|| q|| jd krN|| j| 	|d  q|d }| 
|}| |}|t|9 }| |}|S )NrO   r   r   )rj  r   re  rf  rl  ro  rc  rp  rm  rq  rr  rs  r<   rI  rt  )rb   r}   r@   ru  rx  r(  r?   r5   r5   r6   r   Y  s$   


zJanusVQVAEEncoder.forward)r)   r*   r+   rR   r<   r9  r   r   r5   r5   rc   r6   rb  %  s    3rb  c                       r   )JanusVQVAEDecoderc              	      sP  t    t|j| _|j| _|j}|j}|j}||j| jd   }t	j
j||dddd| _t||| _t
 | _tt| jD ]N}t
 }t
 }||j|  }	t| jd D ]}
|t|||	d |	}|| jd krt|t| qXt
 }||_||_|dkrt||_| j| q@t	j
jd|ddd	| _t	j
j||dddd| _d S )
Nr   r   r@  r^  r   r;  r<  Tr=  )rQ   rR   rc  rd  re  rf  rg  ri  rJ   r<   r   rW   rj  r]  rr  r   upreversedr   rm  r:  rL  rn  ro  rp  rZ  upsamplerB  rs  rt  )rb   r#   rg  ri  rJ   rv  ru  ro  rp  rw  rx  rz  rc   r5   r6   rR   s  sD   


zJanusVQVAEDecoder.__init__r(  rh   c                 C   s   |  |}| |}t| jD ]9}t| jd D ] }| j| j| |}t| j| jdkr8| j| j| |}q|| jd krH| j| 	|}q| 
|}|t|9 }| |}|S )Nr   r   )rj  rr  r   re  rf  rz  ro  rc  rp  r|  rs  r<   rI  rt  )rb   r(  ru  rx  r5   r5   r6   r     s   



zJanusVQVAEDecoder.forward)r)   r*   r+   rR   r<   r=   r   r   r5   r5   rc   r6   ry  r  s    .ry  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                       s   e Zd ZU eed< g dZdZdef fddZdej	fddZ
dej	d	ejfd
dZeedejd	eejejf fddZ  ZS )
JanusVQVAEr#   )rL  r:  r!  r}   c                    sp   t  | t|| _t|| _tj|j	|j
d| _tj|j
|j	d| _|   t|| _d| _|   d S )Nr   F)rQ   rR   rb  r  r!  quantizer<   r   rW   ri  rT   
quant_convpost_quant_convevalry  decoderr   r  ra   rc   r5   r6   rR     s   


zJanusVQVAE.__init__c                 C   s.   |  |}| |}| |\}}}|||fS r   )r  r  r~  )rb   r}   r@   quantemb_lossindicesr5   r5   r6   encode  s   


zJanusVQVAE.encoder4  rh   c                 C   sr   |j d | jjd | jjd  kr'td| jjd | jjd   d|j  d| j|}| |}| |}|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)rm   r~  r'  r   r8  r  r  )rb   r4  codebook_entryr@   r}   r5   r5   r6   decode  s   "	

zJanusVQVAE.decodec                 C   s6   |j d }| |\}}}| ||d}t||S )Nr   rO   )rm   r  r  rv   r8   )rb   r}   r   r  r:   r  r9   r5   r5   r6   r     s   

zJanusVQVAE.forward)r)   r*   r+   r!   r,   r/   r  rR   r<   r9  r  r=   r  r   r   rC   r   r   r5   r5   rc   r6   r}    s   
 	r}  c                       r  )JanusVQVAEAlignerMLPr#   c                    r  )Nc                    r  r5   r  r   r   r5   r6   r     r  z1JanusVQVAEAlignerMLP.__init__.<locals>.<listcomp>r   )rQ   rR   r   r   rT   r  r   r   r   r   r  r   r   r   ra   rc   r   r6   rR     r  zJanusVQVAEAlignerMLP.__init__c                 C   r  r   r  r  r5   r5   r6   r     r  zJanusVQVAEAlignerMLP.forward)r)   r*   r+   r!   rR   r   r   r5   r5   rc   r6   r    r   r  c                       s<   e Zd ZdZdef fddZdejdejfddZ	  Z
S )	JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r#   c                    s>   t    t|j|j| _t|j | _	t|j|j
| _d S r   )rQ   rR   r   r   image_token_embed_dimr  rQ  r   r   r   r#  vision_headra   rc   r5   r6   rR     s   
zJanusVQVAEHead.__init__r@   rh   c                 C   r  r   )rQ  r   r  r   r5   r5   r6   r     r  zJanusVQVAEHead.forward)r)   r*   r+   r;   r!   rR   r<   r   r   r   r   r5   r5   rc   r6   r    s    r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       s   e Zd Zdef fddZdd Zdd Zdd	 Zd
ej	dej
dej
fddZee									dd
eej	 deej
 deej deej	 dee deej	 deej
 dee deeejf fddZ  ZS )
JanusModelr#   c                    s   t  | || _t|j| _t| jj| _t	|j
| _t| jjj| jjj| _t| jj| _t| jj| _tj|jd| _d| _|   d S )Nr   F)rQ   rR   r#   r  _from_configvision_configvision_modelr  alignerr}  	vq_configvqmodelr   r\   r#  rT   generation_embeddingsr  generation_alignerr  generation_headr   from_configtext_configlanguage_modelr   r  ra   rc   r5   r6   rR   #  s   zJanusModel.__init__c                 C   s
   | j  S r   )r  r  r  r5   r5   r6   r  8  s   
zJanusModel.get_input_embeddingsc                 C   s   | j | d S r   )r  set_input_embeddingsrb   r   r5   r5   r6   r  ;  s   zJanusModel.set_input_embeddingsc                 C   s   |  |}| |j}|S r   )r  r  r?   )rb   r}   image_embedsr5   r5   r6   get_image_features>  s   
zJanusModel.get_image_features	input_idsr   image_featuresc                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}||  | krP|jd |jd  }td| d| |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr~   devicerO   r   r   z6Image features and image tokens do not match: tokens: z, features )r  r<   r   r#   image_token_idlongr  allr+  rq   	expand_asr   numelrm   r   )rb   r  r   r  special_image_maskn_image_tokensn_image_featuresr5   r5   r6   get_placeholder_maskC  s   zJanusModel.get_placeholder_maskNr   r}   r   rN   r'   cache_position	use_cachelogits_to_keepc
              
   K   s   |d u |d uA rt d|d u r|  |}|d ur>| |}|d|jd }||j|j}| j|||d}|	||}| j
d|||||||	d|
}t|j|j|j|j|d ur_|dS d dS )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onerO   )r   r  )r   r   rN   r'   r  r  r  )r?   r'   r@   rA   rB   r5   )r   r  r  rr   rm   r   r  r~   r  masked_scatterr  r>   r?   r'   r@   rA   )rb   r  r}   r   rN   r'   r  r   r  r  r   r  r  image_attention_mask	lm_outputr5   r5   r6   r   [  sD   

zJanusModel.forward)	NNNNNNNNr   )r)   r*   r+   r   rR   r  r  r  r<   r9  r=   r  r   r   r   r   r	   r   r   r   r   r   r5   r5   rc   r6   r    sT    
	
r  c                       sJ  e Zd ZddgZdZdef fddZdd Zd	d
 Zde	j
de	j
fddZee										d&dee	j dee	j dee	j
 dee	j dee dee	j dee	j dee	j dee deee	j
f dee fddZ						d' fdd	Zd e	j
fd!d"Ze	j			d(dee	j
 dee	j d#ee f fd$d%Z  ZS ))JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr#   c                    sB   t  | || _t|| _tj|jj|jj	dd| _
|   d S )NFr   )rQ   rR   r#   r  r$   r   r   r  rS   
vocab_sizelm_headr  ra   rc   r5   r6   rR     s
   
z&JanusForConditionalGeneration.__init__c                 C   s   | j j S r   )r$   r  r  r  r5   r5   r6   r    s   z2JanusForConditionalGeneration.get_input_embeddingsc                 C   s   | j j| d S r   )r$   r  r  r  r5   r5   r6   r    s   z2JanusForConditionalGeneration.set_input_embeddingsinputsrh   c                 C   s   | j |}| j |}|S r   )r$   r  r  )rb   r  r(  r5   r5   r6   'prepare_embeddings_for_image_generation  s   zEJanusForConditionalGeneration.prepare_embeddings_for_image_generationNr   r  r}   r   rN   r'   r  r   labelsr  r  r   c                 K   s   | j d|||||||	|d|}|j}t|
trt|
 dn|
}| |dd|ddf }d}|durD| jd||| jjj	d|}t
|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r}   r   rN   r'   r   r  r  N)rF   r  r  )rE   rF   r'   r@   rA   rB   r5   )r$   r?   
isinstancer   slicer  loss_functionr#   r  r  rD   r'   r@   rA   rB   )rb   r  r}   r   rN   r'   r  r   r  r  r  r   outputsr@   slice_indicesrF   rE   r5   r5   r6   r     s<   	z%JanusForConditionalGeneration.forwardc           
         s8   t  j|f|||||d|}	|d dkr||	d< |	S )N)r'   r   r   r  r  r   r}   )rQ   prepare_inputs_for_generation)
rb   r  r}   r'   r   r   r  r  r   model_inputsrc   r5   r6   r    s   z;JanusForConditionalGeneration.prepare_inputs_for_generationr4  c                 C   s"   | j j|}|dddd}|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r   r   )r$   r  r  rs   )rb   r4  decoded_imager5   r5   r6   decode_image_tokens  s   z1JanusForConditionalGeneration.decode_image_tokenslogits_processorc           %         sx  | d| j}t|}| dd}|dkr$t jd|||d d|S |jdi |}| tj	tj
fvr:td|  | |  |d urK|nt }d|d< |jd u r_td d	|_|j|d
< | ||j|\}}	}|j|j}
}t|jdkrtd|j d|d u}| j|||jd |jr|jdkr|t|j d |_| j||jd |d ||d}| jd|||jd|\}}| jjj j!}|j\}}|"dd}| dd }|"dd}||d< ||d d d f |jk||d d d f |j#d k@ }||d d d f $||j% | & |}| '|||}|(dd d u r<| j)|j*p,d|d t+|j,|| |d|d< t-j.||f|
|d}|j/}|j0}|j1}|j2}|j3}|r]|r]dnd }|rg|rgdnd }|rq|rqdnd }|r{|r{dnd }t4|D ]}| j5d||d|}|d 6|j|d< |d 6|j|d< | jj7di |||d}| 8||}|j9d d dd d f : } | j;| }!|||!}"|j<rt-j=|"dd}#t-j>|#dd?d}$nt-j@|"dd}$|$|d d |f< t-A|$|$g}$|$Bd}$| C|$}q|r,|r||!f7 }|r|| D f7 }|r$||jE7 }|r,||jF7 }|r:tG||!||||jHdS |S ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr'   static)cache_implementationr   max_cache_lenmodel_kwargsr  r5   )r   r  r  )output_attentionsoutput_hidden_statesrO   r*  )num_samples)	sequencesscoresrF   rA   r@   r'   )Ipopr  copydeepcopyrQ   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  loggerwarning_prepare_model_inputsbos_token_idr~   r  rc  rm   _prepare_special_tokensrm  r
   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr$   r  r#   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionget
_get_cacher  max
max_lengthr<   r   r  r  output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationr?   cloner  	do_sampler   multinomialsqueezeargmaxr   rq   r  floatrA   r@   r   r'   )%rb   r  r   r  r   r  r  r  r  model_input_namer~   r  kwargs_has_attention_maskr  r   r   input_tokensmaskr   generated_tokensr  r  r  r  r  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r(  r  next_token_scoresprobs
next_tokenrc   r5   r6   r    s   	

















	z&JanusForConditionalGeneration.generate)
NNNNNNNNNr   )NNNNNN)NNN)r)   r*   r+   _tied_weights_keysr3   r   rR   r  r  r<   r   r  r   r   r   r9  r=   r	   r   r   r   r   r   r   r  r  no_gradr   r  r   r5   r5   rc   r6   r    sz    		
6r  )r"   r  r  r}  r  )r   )Ur  dataclassesr   typingr   r   r   r<   torch.nn.functionalr   rt   r5  activationsr   cache_utilsr	   
generationr
   r   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   autor   configuration_janusr   r    r!   
get_loggerr)   r  r"   r8   r>   rD   rn  rG   r   r   r   r  r   r   r   r&   r   r   r   r  r  r  r!  r:  rL  rU  rZ  r]  rb  ry  r}  r  r  r  r  __all__r5   r5   r5   r6   <module>   s   
K
L#"L"4?,#MD=l  9