o
    ei
                     @   s  d dl Z d dlmZ d dlmZ d dlZd dlm  mZ	 d dlmZ ddl
mZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3 e'4e5Z6e%G dd de Z7ee%ddG dd deZ8ee%ddG dd deZ9ee%ddG d d! d!eZ:G d"d# d#ej;Z<d$ej=d%e>d&ej=fd'd(Z?	)d_d*ej;d+ej=d,ej=d-ej=d.ej=dB d/e@d0e@d1e"e$ fd2d3ZAG d4d5 d5ej;ZBG d6d7 d7ej;ZCG d8d9 d9eZDG d:d; d;ej;ZEe%G d<d= d=e7ZFG d>d? d?ej;ZGG d@dA dAej;ZHG dBdC dCej;ZIG dDdE dEej;ZJG dFdG dGej;ZKG dHdI dIej;ZLG dJdK dKej;ZMG dLdM dMej;ZNG dNdO dOej;ZOee%G dPdQ dQeZPe%dRdG dSdT dTe7ZQG dUdV dVej;ZRG dWdX dXej;ZSe%dYdG dZd[ d[e7ZTG d\d] d]e7eZUg d^ZVdS )`    N)Callable)	dataclass)nn   )initialization)ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check	torch_int)merge_with_config_defaults)capture_outputs   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                       sN   e Zd ZU eed< dZdZdZddgZddgZ	dZ
dZdZ fd	d
Z  ZS )JanusPreTrainedModelconfigmodelimagetextTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskc                    s@   t  | t|trt|jt|jj	d 
d d S d S )Nr   r-   )super_init_weights
isinstanceJanusVisionEmbeddingsinitcopy_position_idstorcharangeshapeexpand)selfmodule	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/janus/modeling_janus.pyr0   D   s   
&z"JanusPreTrainedModel._init_weights)__name__
__module____qualname__r    __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr0   __classcell__r>   r>   r<   r?   r#   7   s   
 r#   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   @   s6   e Zd ZU dZdZejdB ed< dZejdB ed< dS )JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)	r@   rA   rB   __doc__rO   r6   FloatTensorrC   rP   r>   r>   r>   r?   rN   J   s   
 rN   zy
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   @   sv   e Zd ZU dZdZejdB ed< dZe	dB ed< dZ
eej dB ed< dZeej dB ed< dZeej dB ed< dS )JanusBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_stater+   hidden_states
attentionsimage_hidden_states)r@   rA   rB   rQ   rT   r6   rR   rC   r+   r   rU   tuplerV   rW   r>   r>   r>   r?   rS   \   s   
 rS   zQ
    Base class for Janus causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZeej dB ed< dS )	JanusCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr+   rU   rV   rW   )r@   rA   rB   rQ   rZ   r6   rR   rC   r[   r+   r   rU   rX   rV   rW   r>   r>   r>   r?   rY   }   s   
 rY   c                       s\   e Zd Zdef fddZdejdededejfdd	Zddejde	dejfddZ
  ZS )r2   r$   c                    s   t    || _|j| _|j| _|j| _tj|j	| j| j| jdd| _
| j| j d | _| j| _t| j| j| _| jdt| jddd d S )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   r5   r.   F)
persistent)r/   __init__r$   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr6   r7   r9   r:   r$   r<   r>   r?   rc      s    
"zJanusVisionEmbeddings.__init__
embeddingsheightwidthreturnc                 C   s   |j d }| jjj d }tj s||kr||kr| | jS | jjd}|j d }|| j }|| j }	t	|d }
|
d|
|
|}|dddd}tjj|||	fddd	}|dddddd|}|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   r-   g      ?r   r   bicubicF)sizemodealign_corners)r8   rn   weightr6   jit
is_tracingr5   	unsqueezerg   r   reshapepermuter   
functionalinterpolateview)r:   rq   rr   rs   rk   rl   patch_pos_embeddim
new_height	new_widthsqrt_num_positionsr>   r>   r?   interpolate_pos_encoding   s&   




z.JanusVisionEmbeddings.interpolate_pos_encodingFpixel_valuesr   c           
      C   sh   |j \}}}}| jjj}| |j|d}|ddd}|r(| |||}	n| | j	}	||	 }|S )N)dtyper   r   )
r8   rj   ry   r   toflatten	transposer   rn   r5   )
r:   r   r   _rr   rs   target_dtypepatch_embedsrq   
pos_embedsr>   r>   r?   forward   s   
zJanusVisionEmbeddings.forward)F)r@   rA   rB   r!   rc   r6   Tensorintr   boolr   rL   r>   r>   r<   r?   r2      s    $&r2   rU   n_reprt   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r8   r9   r}   )rU   r   batchnum_key_value_headsslenhead_dimr>   r>   r?   	repeat_kv   s
   0r           r;   querykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   r-   )r   r   )ptrainingr   )r   num_key_value_groupsr6   matmulr   r   r   softmaxfloat32r   r   r   r   
contiguous)r;   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr>   r>   r?   eager_attention_forward   s   
r   c                       sL   e Zd ZdZdef fddZ	ddejdejdB dee	 fd	d
Z
  ZS )JanusVisionAttentionz(Attention Class for Janus Vision Encoderr$   c                    sL  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _
|j}|j}d| _d| _tj| j| j| j |jd| _tj| j| j| j |jd| _tj| j| j| j |jd| _t| j| j| _|dkrt|nt | _|rt| jnt | _|rt| j| _d S t | _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      Fr   biasr   )r/   rc   r$   rd   re   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)r:   r$   proj_dropoutqk_normr<   r>   r?   rc     s0   

$zJanusVisionAttention.__init__NrU   r   r   c                 K   s(  |  \}}}| |}| |}| |}	|d| j| j}| |}|d| j| j}| |}|||| j| j	dd}|||| j| j	dd}|	
||| j| j	dd}	t| jjt}
|
| |||	|f| jspdn| j| j| jd|\}}|||| j}| |}| |}||fS )Nr-   r   r   r   )r   r   r   )rv   r   r   r   r}   r   r   r   r   r   r   r   get_interfacer$   _attn_implementationr   r   r   r   r   re   r   r   )r:   rU   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputr>   r>   r?   r   -  s>   




	


zJanusVisionAttention.forwardN)r@   rA   rB   rQ   r!   rc   r6   r   r   r   r   rL   r>   r>   r<   r?   r     s     r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )JanusVisionMLPr$   c                    sr   t    || _t|j|j | _t|j | _	t
|j| j| _t
| j|j| _t
|j| _t
|j| _d S r   )r/   rc   r$   r   rd   	mlp_ratiointermediate_sizer   
hidden_actactivation_fnr   r   fc1fc2r   hidden_dropout_ratedropout1dropout2rp   r<   r>   r?   rc   Z  s   
zJanusVisionMLP.__init__rU   rt   c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r   r   r   r   r   r:   rU   r>   r>   r?   r   d  s   




zJanusVisionMLP.forward)	r@   rA   rB   r!   rc   r6   r   r   rL   r>   r>   r<   r?   r   Y  s    
r   c                	       sJ   e Zd Zdef fddZedejdejdee	 dej
fdd	Z  ZS )
r*   r$   c                    sX   t    |j| _tj| j|jd| _t|| _	tj| j|jd| _
t|| _|| _d S N)eps)r/   rc   rd   re   r   r   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr$   rp   r<   r>   r?   rc   n  s   



z JanusVisionEncoderLayer.__init__rU   r   r   rt   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)rU   r   r>   )r   r   r   r   )r:   rU   r   r   residualr   r>   r>   r?   r   w  s   



zJanusVisionEncoderLayer.forward)r@   rA   rB   r!   rc   r   r6   r   r   r   rR   r   rL   r>   r>   r<   r?   r*   m  s    	r*   c                       sN   e Zd ZdZdef fddZe	ddejdB de	e
 defd	d
Z  ZS )JanusVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    r$   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r>   )r*   .0r   r$   r>   r?   
<listcomp>  s    z/JanusVisionEncoder.__init__.<locals>.<listcomp>F)	r/   rc   r$   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingrp   r<   r   r?   rc     s   
 
zJanusVisionEncoder.__init__Nr   r   rt   c                 K   s,   |}| j D ]}|||fi |}qt|dS )N)rT   )r   r   )r:   inputs_embedsr   r   rU   encoder_layerr>   r>   r?   r     s   

zJanusVisionEncoder.forwardr   )r@   rA   rB   rQ   r!   rc   r   r6   r   r   r   r   r   rL   r>   r>   r<   r?   r     s    r   c                       s   e Zd ZU dZdZeed< eedZ	def fddZ
eedde			ddejd	B d
edee deeB fddZdd Z  ZS )JanusVisionModelr   )r'   r$   rU   rV   c                    sJ   t  | || _|j}t|| _t|| _tj	||j
d| _|   d S r   )r/   rc   r$   rd   r2   rq   r   encoderr   r   r   post_layernorm	post_init)r:   r$   re   r<   r>   r?   rc     s   

zJanusVisionModel.__init__F)tie_last_hidden_statesNr   r   rt   c                 K   sn   |d u rt d| j||d}| jdd|i|}|j}| |}|d d dd d f }| |}t||dS )Nz You have to specify pixel_values)r   r   r   )rT   pooler_outputr>   )r   rq   r   rT   r   r   )r:   r   r   r   rU   encoder_outputsrT   pooled_outputr>   r>   r?   r     s    	

zJanusVisionModel.forwardc                 C   s   | j S r   )rq   r:   r>   r>   r?   get_input_embeddings  s   z%JanusVisionModel.get_input_embeddingsNF)r@   rA   rB   main_input_namerE   r!   rC   r*   r   _can_record_outputsrc   r   r   r   r6   rR   r   r   r   rX   r   r   r   rL   r>   r>   r<   r?   r     s.   
 r   c                       *   e Zd Zdef fddZdd Z  ZS )JanusVisionAlignerMLPr$   c                    N   t    t j j| _t fddtd j	D | _
t j | _d S )Nc                       g | ]
}t  j jqS r>   r   r   projection_dimr   r   r>   r?   r         z2JanusVisionAlignerMLP.__init__.<locals>.<listcomp>r   )r/   rc   r   r   rd   r  r   r   r   depthhidden_layersr   r   r   rp   r<   r   r?   rc        
zJanusVisionAlignerMLP.__init__c                 C   ,   |  |}| jD ]}| |}||}q|S r   r   r  r   r:   rU   layerr>   r>   r?   r     
   



zJanusVisionAlignerMLP.forward)r@   rA   rB   r!   rc   r   rL   r>   r>   r<   r?   r         	r   c                       sL   e Zd ZdZdef fddZdejfddZdej	d	ej
fd
dZ  ZS )JanusVQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r$   c                    sL   t    |j| _|j| _t|dd| _t| j| j| _	|j
gd | _d S )Nbetag      ?r   )r/   rc   num_embeddingsre   embedding_dimgetattrr  r   rm   	embeddingrk   quant_state_dimsrp   r<   r>   r?   rc     s   
z"JanusVQVAEVectorQuantizer.__init__hidden_statec              
   C   s   | dddd }|d| j}tj|d dddtj| jjd dd dtd	|| jj	dd  }tj
|dd}| ||j}t| | d | jt||  d   }|||   }| dddd }|||fS )
Nr   r   r   r   r-   T)r   keepdimr   z	bd,dn->bn)r~   r   r   r  r6   sumr  ry   einsumr   argminr8   meandetachr  )r:   r  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrZ   r>   r>   r?   r     s    
z!JanusVQVAEVectorQuantizer.forwardimage_tokensrt   c                 C   sb   |j d }| jjj d }| |}tj|ddd}||g| j|R }|dddd }|S )Nr   r-   r   )r   r   r   r   )	r8   r  ry   F	normalizer   r  r~   r   )r:   r   r   emb_dimr  r>   r>   r?   get_codebook_entry*  s   

z,JanusVQVAEVectorQuantizer.get_codebook_entry)r@   rA   rB   rQ   r"   rc   r6   r   r   
LongTensorrR   r$  rL   r>   r>   r<   r?   r    s
    
	r  c                       s*   e Zd Z		d fdd	Zdd Z  ZS )JanusVQVAEResnetBlockNFc                    s   t    || _|d u r|n|| _|| _tjjd|ddd| _tjj	||dddd| _
tjjd|ddd| _tj|j| _tjj	||dddd| _| j| jkro| jratjj	||dddd| _d S tjj	||dddd| _d S d S )	N    ư>T
num_groupsri   r   affiner   r   r_   r`   ra   r   )r/   rc   r]   r^   use_conv_shortcutr6   r   	GroupNormnorm1rh   conv1norm2r   r   conv2conv_shortcutnin_shortcut)r:   r$   r]   r^   r3  r<   r>   r?   rc   ;  s   
zJanusVQVAEResnetBlock.__init__c                 C   s   |}|  |}|t|9 }| |}| |}|t|9 }| |}| |}| j| jkr@| j	r;| 
|}|| S | |}|| S r   )r/  r6   sigmoidr0  r1  r   r2  r]   r^   r-  r3  r4  )r:   rU   r   r>   r>   r?   r   R  s   






zJanusVQVAEResnetBlock.forwardr   r@   rA   rB   rc   r   rL   r>   r>   r<   r?   r&  :  s
    r&  c                       $   e Zd Z fddZdd Z  ZS )JanusVQVAEAttnBlockc                    s   t    || _tjjd|ddd| _tjj||dddd| _tjj||dddd| _	tjj||dddd| _
tjj||dddd| _d S )Nr'  r(  Tr)  r   r   r,  )r/   rc   r]   r6   r   r.  normrh   qkvproj_outr:   r]   r<   r>   r?   rc   g  s   
zJanusVQVAEAttnBlock.__init__c                 C   s   |}|  |}| |}| |}| |}|j\}}}}	|||||	 ddd}|||||	 }t||}
|
t	|d  }
t
j|
dd}
|||||	 }|
ddd}
t||
||||	}| |}|| S )Nr   r   r   r   r  )r9  r:  r;  r<  r8   r}   r~   r6   bmmr   r!  r   r=  )r:   rU   r   r   r   r   r   channelsrr   rs   r   r   r>   r>   r?   r   q  s    




zJanusVQVAEAttnBlock.forwardr6  r>   r>   r<   r?   r8  f  s    
r8  c                       r7  )JanusVQVAEConvDownsamplec                    s$   t    tj||dddd| _d S )Nr   r   r   r,  )r/   rc   r   rh   convr>  r<   r>   r?   rc     s   
z!JanusVQVAEConvDownsample.__init__c                 C   s    t j|dddd}| |}|S )N)r   r   r   r   constantr   )padrw   r   )r!  rD  rB  r   r>   r>   r?   r     s   
z JanusVQVAEConvDownsample.forwardr6  r>   r>   r<   r?   rA        rA  c                       r7  )JanusVQVAEConvUpsamplec                    s&   t    tjj||dddd| _d S )Nr   r   r,  )r/   rc   r6   r   rh   rB  r>  r<   r>   r?   rc     s   
zJanusVQVAEConvUpsample.__init__c                 C   s   t j|ddd}| |}|S )Ng       @nearest)scale_factorrw   )r!  r   rB  r   r>   r>   r?   r     s   
zJanusVQVAEConvUpsample.forwardr6  r>   r>   r<   r?   rF    rE  rF  c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	JanusVQVAEMidBlockr$   r@  c                    s8   t    t|||d| _t|| _t|||d| _d S )Nr$   r]   r^   )r/   rc   r&  block_1r8  attn_1block_2)r:   r$   r@  r<   r>   r?   rc     s   

zJanusVQVAEMidBlock.__init__rU   rt   c                 C   "   |  |}| |}| |}|S r   )rK  rL  rM  r   r>   r>   r?   r        


zJanusVQVAEMidBlock.forward)
r@   rA   rB   r"   r   rc   r6   r   r   rL   r>   r>   r<   r?   rI    s    rI  c                       s,   e Zd Z fddZdejfddZ  ZS )JanusVQVAEEncoderc              	      sn  t    t|j| _|j| _|j}|j}|j}|j	}|j}t
jj||dddd| _dt| }|| _t | _t| jD ]T}t }	t }
|||  }|||  }t| jD ]}|	t|||d |}|| jd krt|
t| qXt }|	|_|
|_|| jd krt||_| j| q=t||| _t
jjd|ddd	| _t
jj||rd
| n|dddd| _d S )Nr   r   r,  )r   rJ  r'  r(  Tr)  r   ) r/   rc   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsr]   double_latentlatent_channelsr6   r   rh   conv_inrX   in_channel_multiplierr   downr   appendr&  r8  ModuleblockattnrA  
downsamplerI  midr.  norm_outconv_out)r:   r$   rU  r]   rV  rW  rR  rY  i_levelr]  r^  block_in	block_outi_blockrZ  r<   r>   r?   rc     sX   


zJanusVQVAEEncoder.__init__r   c                 C   s   |  |g}t| jD ]C}t| jD ]'}| j| j| |d }t| j| jdkr4| j| j| |}|| q|| jd krN|| j| 	|d  q|d }| 
|}| |}|t|9 }| |}|S )Nr-   r   r   )rX  r   rS  rT  rZ  r]  rQ  r^  r[  r_  r`  ra  r6   r5  rb  )r:   r   rU   rc  rf  r  rT   r>   r>   r?   r     s$   


zJanusVQVAEEncoder.forward)r@   rA   rB   rc   r6   r%  r   rL   r>   r>   r<   r?   rP    s    3rP  c                       s2   e Zd Z fddZdejdejfddZ  ZS )JanusVQVAEDecoderc              	      sP  t    t|j| _|j| _|j}|j}|j}||j| jd   }t	j
j||dddd| _t||| _t
 | _tt| jD ]N}t
 }t
 }||j|  }	t| jd D ]}
|t|||	d |	}|| jd krt|t| qXt
 }||_||_|dkrt||_| j| q@t	j
jd|ddd	| _t	j
j||dddd| _d S )
Nr   r   r,  rJ  r   r'  r(  Tr)  )r/   rc   rQ  rR  rS  rT  rU  rW  r^   r6   r   rh   rX  rI  r`  r   upreversedr   r[  r&  r8  r\  r]  r^  rF  upsampler.  ra  rb  )r:   r$   rU  rW  r^   rd  rc  r]  r^  re  rf  rh  r<   r>   r?   rc     sD   


zJanusVQVAEDecoder.__init__r  rt   c                 C   s   |  |}| |}t| jD ]9}t| jd D ] }| j| j| |}t| j| jdkr8| j| j| |}q|| jd krH| j| 	|}q| 
|}|t|9 }| |}|S )Nr   r   )rX  r`  r   rS  rT  rh  r]  rQ  r^  rj  ra  r6   r5  rb  )r:   r  rc  rf  r>   r>   r?   r   2  s   



zJanusVQVAEDecoder.forward)r@   rA   rB   rc   r6   rR   r   rL   r>   r>   r<   r?   rg    s    .rg  c                   @   sH   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dS )JanusVQVAEModelOutputa  
    quantized_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Quantized last hidden state from the VQ-VAE model.
    image_tokens (`torch.FloatTensor` of shape `(batch_size, config.vocab_size`):
        Indices of the image tokens predicted by the VQ-VAE model.
    embedding_loss (`torch.FloatTensor`):
        The embedding loss computed during quantization.
    Nquantized_last_hidden_stater   rP   )
r@   rA   rB   rQ   rl  r6   rR   rC   r   rP   r>   r>   r>   r?   rk  G  s
   
 	rk  aS  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    c                       s   e Zd ZU eed< g dZeedZdZ	def fddZ
eedejdee defd	d
ZdejdejfddZeedejdeejejf fddZ  ZS )
JanusVQVAEr$   )r8  r&  r  r   r   c                    sp   t  | t|| _t|| _tj|j	|j
d| _tj|j
|j	d| _|   t|| _d| _|   d S )Nr   F)r/   rc   rP  r   r  quantizer6   r   rh   rW  re   
quant_convpost_quant_convevalrg  decoderr   r   rp   r<   r>   r?   rc   m  s   


zJanusVQVAE.__init__r   rt   c                 K   s4   |  |}| |}| |\}}}t||||dS )N)rT   rl  r   rP   )r   ro  rn  rk  )r:   r   r   rU   conv_hidden_statesrl  emb_lossindicesr>   r>   r?   encodey  s   

zJanusVQVAE.encoder   c                 C   sr   |j d | jjd | jjd  kr'td| jjd | jjd   d|j  d| j|}| |}| |}|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        r   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r8   rn  r  r   r$  rp  rr  )r:   r   codebook_entryrU   r   r>   r>   r?   decode  s   "	

zJanusVQVAE.decodec                 K   s@   |j d }| j|fddi|}| |j|d}t||jS )Nr   return_dictTr-   )r8   rv  rx  r   r   rN   rP   )r:   r   r   r   encode_outputsrO   r>   r>   r?   r     s   
zJanusVQVAE.forward)r@   rA   rB   r"   rC   rG   r&  r8  r   r   rc   r   r   r6   r%  r   r   rk  rv  rR   rx  r   r   rX   r   rL   r>   r>   r<   r?   rm  X  s&   
 	 rm  c                       r   )JanusVQVAEAlignerMLPr$   c                    r   )Nc                    r   r>   r  r   r   r>   r?   r     r  z1JanusVQVAEAlignerMLP.__init__.<locals>.<listcomp>r   )r/   rc   r   r   re   r  r   r   r   r   r  r   r   r   rp   r<   r   r?   rc     r  zJanusVQVAEAlignerMLP.__init__c                 C   r  r   r  r	  r>   r>   r?   r     r  zJanusVQVAEAlignerMLP.forward)r@   rA   rB   r"   rc   r   rL   r>   r>   r<   r?   r{    r  r{  c                       s<   e Zd ZdZdef fddZdejdejfddZ	  Z
S )	JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r$   c                    s>   t    t|j|j| _t|j | _	t|j|j
| _d S r   )r/   rc   r   r   image_token_embed_dimr  r=  r   r   r   r  vision_headrp   r<   r>   r?   rc     s   
zJanusVQVAEHead.__init__rU   rt   c                 C   rN  r   )r=  r   r~  r   r>   r>   r?   r     rO  zJanusVQVAEHead.forward)r@   rA   rB   rQ   r"   rc   r6   r   tensorr   rL   r>   r>   r<   r?   r|    s    r|  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       s   e Zd Zdef fddZdd Zdd Zeede	j
d	ee d
eeB fddZde	jde	j
de	j
fddZee									dde	jdB de	j
dB de	jdB de	jdB dedB de	jdB de	j
dB dedB dee	jB d
efddZ  ZS )
JanusModelr$   c                    s   t  | || _t|j| _t| jj| _t	|j
| _t| jjj| jjj| _t| jj| _t| jj| _tj|jd| _d| _|   d S )Nr   F)r/   rc   r$   r   _from_configvision_configvision_modelr   alignerrm  	vq_configvqmodelr   rm   r  re   generation_embeddingsr{  generation_alignerr|  generation_headr   from_configtext_configlanguage_modelr   r   rp   r<   r>   r?   rc     s   zJanusModel.__init__c                 C   s
   | j  S r   )r  r   r   r>   r>   r?   r     s   
zJanusModel.get_input_embeddingsc                 C   s   | j | d S r   )r  set_input_embeddingsr:   r   r>   r>   r?   r    s   zJanusModel.set_input_embeddingsr   r   rt   c                 K   s(   | j |fddi|}| |j|_|S )Nry  T)r  r  rT   r   )r:   r   r   vision_outputsr>   r>   r?   get_image_features  s   zJanusModel.get_image_features	input_idsr   image_featuresc                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr   devicer-   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r6   r  r$   image_token_idlongr  allr  r8   r|   	expand_asr   r   numel)r:   r  r   r  special_image_maskn_image_tokensn_image_featuresr>   r>   r?   get_placeholder_mask  s   zJanusModel.get_placeholder_maskNr   r   r5   r+   cache_position	use_cachelogits_to_keepc
              
   K   s   |d u |d uA rt d|d u r|  |}|d urA| j|ddj}|d|jd }||j|j}| j	|||d}|
||}| jd|||||||	d|
}t|j|j|j|j|d urb|dS d dS )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oneT)ry  r-   )r   r  )r   r   r5   r+   r  r  r  )rT   r+   rU   rV   rW   r>   )r   r   r  r   r}   r8   r   r  r   r  masked_scatterr  rS   rT   r+   rU   rV   )r:   r  r   r   r5   r+   r  r   r  r  r   image_embedsr  image_attention_mask	lm_outputr>   r>   r?   r     sD   
zJanusModel.forward)	NNNNNNNNr   )r@   rA   rB   r    rc   r   r  r   r   r6   rR   r   r   rX   r   r  r%  r  r   r   r   r   rS   r   rL   r>   r>   r<   r?   r    sh    
	
r  c                       sR  e Zd ZddiZdZdZdef fddZdd	 Zd
d Z	de
jde
jfddZee										d(de
jdB de
jdB de
jdB de
jdB dedB de
jdB de
jdB de
jdB dedB dee
jB dee defddZ							d) fd d!	Zd"e
jfd#d$Ze
 			d*de
jdB de
jdB d%edB f fd&d'Z  ZS )+JanusForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightr&   Tr$   c                    sB   t  | || _t|| _tj|jj|jj	dd| _
|   d S )NFr   )r/   rc   r$   r  r%   r   r   r  rd   
vocab_sizelm_headr   rp   r<   r>   r?   rc   D  s
   
z&JanusForConditionalGeneration.__init__c                 C   s   | j j S r   )r%   r  r   r   r>   r>   r?   r   M  s   z2JanusForConditionalGeneration.get_input_embeddingsc                 C   s   | j j| d S r   )r%   r  r  r  r>   r>   r?   r  P  s   z2JanusForConditionalGeneration.set_input_embeddingsinputsrt   c                 C   s   | j |}| j |}|S r   )r%   r  r  )r:   r  r  r>   r>   r?   'prepare_embeddings_for_image_generationS  s   zEJanusForConditionalGeneration.prepare_embeddings_for_image_generationNr   r  r   r   r5   r+   r  r   labelsr  r  r   c                 K   s   | j d|||||||	|d|}|j}t|
trt|
 dn|
}| |dd|ddf }d}|durD| jd||| jjj	d|}t
|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   r5   r+   r   r  r  N)r[   r  r  )rZ   r[   r+   rU   rV   rW   r>   )r%   rT   r1   r   slicer  loss_functionr$   r  r  rY   r+   rU   rV   rW   )r:   r  r   r   r5   r+   r  r   r  r  r  r   outputsrU   slice_indicesr[   rZ   r>   r>   r?   r   X  s<   	z%JanusForConditionalGeneration.forwardFc	              	      s>   t  j|f||||||d|	}
|s|	dds||
d< |
S )N)r+   r   r   r  r  is_first_iterationr  Tr   )r/   prepare_inputs_for_generationget)r:   r  r   r+   r   r   r  r  r  r   model_inputsr<   r>   r?   r    s   z;JanusForConditionalGeneration.prepare_inputs_for_generationr   c                 C   s"   | j j|}|dddd}|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r   r   )r%   r  rx  r~   )r:   r   decoded_imager>   r>   r?   decode_image_tokens  s   z1JanusForConditionalGeneration.decode_image_tokenslogits_processorc           %         s  | d| j}t|}| dd}|dkr$t jd|||d d|S |jdi |}| tj	tj
fvr:td|  | |  |d urK|nt }d|d< |jd u r_td d	|_|j|d
< | ||j|\}}	}|j|j}
}t|jdkrtd|j d|d u}| j|||jd |jr|jdkr|t|j d |_| j||jd |d ||d}| jd|||jd|\}}| jjj j!}|j\}}|"dd}| dd }|"dd}||d< ||d d d f |jk||d d d f |j#d k@ }||d d d f $||j% | & |}| '|||}|(dd d u r<| j)|j*p,d|d t+|j,|| |d|d< t-j.||f|
|d}|j/}|j0}|j1}|j2}|j3}|r]|r]dnd }|rg|rgdnd }|rq|rqdnd }|r{|r{dnd }t4|D ]}| j5d||d|}d|v r|d 6|j|d< |d 6|j|d< | jj7di |||d}| 8||}|j9d d dd d f : } | j;| }!|||!}"|j<rt-j=|"dd}#t-j>|#dd?d}$nt-j@|"dd}$|$|d d |f< t-A|$|$g}$|$Bd}$| C|$}q|r1|r||!f7 }|r!|| D f7 }|r)||jE7 }|r1||jF7 }|r?tG||!||||jHdS |S ) Ngeneration_configgeneration_moder(   )r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr+   static)cache_implementationr   max_cache_lenmodel_kwargsr  r>   )r   r  r  )output_attentionsoutput_hidden_statesr-   r  )num_samples)	sequencesscoresr[   rV   rU   r+   )Ipopr  copydeepcopyr/   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  loggerwarning_prepare_model_inputsbos_token_idr   r  rQ  r8   _prepare_special_tokensr[  r	   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr%   r  r$   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr   _get_initial_cache_positionr  _prepare_static_cacher  max
max_lengthr6   zerosr  r  output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationrT   cloner  	do_sampler   multinomialsqueezeargmaxcatr|   r  floatrV   rU   r   r+   )%r:   r  r   r  r   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskr  r   r   input_tokensmaskr   generated_tokensr  r  r  r  r  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r  r  next_token_scoresprobs
next_tokenr<   r>   r?   r    s   	


















	z&JanusForConditionalGeneration.generate)
NNNNNNNNNr   )NNNNNNF)NNN)r@   rA   rB   _tied_weights_keysoutput_modalitiesrK   r    rc   r   r  r6   r   r  r   r   r%  rR   r   r   r   r   r   rY   r   r  r  no_gradr   r  rL   r>   r>   r<   r?   r  ?  s    		
6"r  )r#   r  r  rm  r   )r   )Wr  collections.abcr   dataclassesr   r6   torch.nn.functionalr   r   r!   r   r3   activationsr   cache_utilsr   
generationr	   r
   r   r   generation.utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   autor   configuration_janusr    r!   r"   
get_loggerr@   r  r#   rN   rS   rY   r\  r2   r   r   r   r  r   r   r   r*   r   r   r   r  r&  r8  rA  rF  rI  rP  rg  rk  rm  r{  r|  r  r  __all__r>   r>   r>   r?   <module>   s    
K
L#"6?,#MDGq  >