o
    wi                    @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z
d dlZd dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZKmLZLmMZM e4 rd dlZd dlNmZ d dlOm  mPZQ d dlRZe5 rd dlSZSddlTmUZU ddl7mVZVmWZW e6XeYZZG dd  d eIZ[G d!d" d"e<Z\G d#d$ d$eUZ]e2G d%d& d&e.Z^ee2d'd(G d)d* d*e+Z_G d+d, d,eDZ`G d-d. d.eEZaG d/d0 d0eMZbG d1d2 d2ejcZdG d3d4 d4ejcZeG d5d6 d6eLZfG d7d8 d8eKZgG d9d: d:e:ZhG d;d< d<ejcZiG d=d> d>eBZjG d?d@ d@eAZkG dAdB dBe?ZlG dCdD dDe@ZmG dEdF dFejcZnG dGdH dHejcZoG dIdJ dJejcZpG dKdL dLejcZqG dMdN dNe>ZrG dOdP dPejcZsG dQdR dRejcZte2dSd(G dTdU dUe^ZuG dVdW dWe^eZvG dXdY dYeZwg dZZxdS )[    N)Iterable)	dataclass)CallableOptionalUnion)nn)BlipImageProcessor   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BatchFeatureget_size_dict)resizeto_channel_dimension_format)ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatmake_list_of_imagesto_numpy_array)FlashAttentionKwargs)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_availableis_vision_availablelogging   )	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddings)PretrainedConfig)CONFIG_MAPPING
AutoConfigc                       sN   e Zd ZdZdZdZ									
												d fdd	Z  ZS )JanusVisionConfiga
  
    This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
    `JanusVisionModel` according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each image.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention weights.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, and `"gelu_new"` are supported.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of MLP hidden dimensionality to embedding dimensionality.
        attention_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys, and values in the attention layers.
        hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for fully connected layers in the encoder.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        projection_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for the projection layer.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether to normalize the query and key matrices.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for initializing all weight matrices.
        depth (`int`, *optional*, defaults to 2):
            Number of hidden layers in the aligner module.
        num_image_tokens (`int`, *optional*, defaults to 576):
            Number of image tokens.
    janus_vision_modelvision_config         r	             ư>gelu      @T   F{Gz?r&   @  c                    sd   t  jd|||||||||	d	| | `|
| _|| _|| _|| _|| _|| _|| _	|| _
|| _d S )N)	hidden_sizenum_hidden_layersnum_attention_headsnum_channels
patch_size
image_sizeattention_dropoutlayer_norm_eps
hidden_act )super__init__intermediate_size	mlp_ratioattention_biashidden_dropout_rateprojection_dimprojection_dropoutuse_qk_norminitializer_rangedepthnum_image_tokens)selfrG   rH   rI   rJ   rK   rL   rM   rN   rO   rT   rU   rV   rW   rX   rY   rZ   r[   r\   kwargs	__class__rP   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/janus/modular_janus.pyrR      s.   

zJanusVisionConfig.__init__)r<   r=   r>   r	   r>   r?   r@   rA   rB   rC   Tr@   rD   r@   FrE   r&   rF   )__name__
__module____qualname____doc__
model_typebase_config_keyrR   __classcell__rP   rP   r_   ra   r9   Q   s.    .r9   c                       sx   e Zd ZdZddddddddg d	d
dddd
ddfdededededededededee dedef fddZ  Z	S )JanusVQVAEConfiga:
  
    This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
    `JanusVQVAEModel` according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a
    configuration with the defaults will yield a similar configuration to the VQModel of the
    [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).

    Args:
        embed_dim (`int`, *optional*, defaults to 8):
            Dimensionality of each embedding vector.
        num_embeddings (`int`, *optional*, defaults to 16384):
            Number of codebook embeddings.
        double_latent (`bool`, *optional*, defaults to `False`):
            Whether to use double z channels.
        latent_channels (`int`, *optional*, defaults to 256):
            Number of channels for the latent space.
        num_patches (`int`, *optional*, defaults to 32):
            Num of patches the input images can be divided into.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            Number of out channels.
        base_channels (`int`, *optional*, defaults to 128):
            Base channel count.
        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
            Channel multipliers for each resolution.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Number of residual blocks.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in VAVAE MLP Connecter module.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        image_token_embed_dim (`int`, *optional*, defaults to 2048):
            Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
       i @  F       r	      )   rn   r&   r&      r&   r@   rE   rD   rB   	embed_dimnum_embeddingsdouble_latentlatent_channelsnum_patchesin_channelsout_channelsbase_channelschannel_multipliernum_res_blocksdropoutc                    s\   t  jd|||||||	|
||d
| || _|| _|| _|| _|| _|| _| `| `	| `
d S )N)
rp   rq   rr   rs   ru   rw   rx   ry   rz   rZ   rP   )rQ   rR   rt   rv   rW   rH   rO   image_token_embed_dim
resolutionattn_resolutions	attn_type)r]   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   rZ   rW   rH   rO   r{   r^   r_   rP   ra   rR      s.   zJanusVQVAEConfig.__init__)
rb   rc   rd   re   intboollistfloatrR   rh   rP   rP   r_   ra   ri      sR    .	
ri   c                       s:   e Zd ZdZdZeeedZ				d fdd	Z	  Z
S )	JanusConfiga;  
    This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
    Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.

    e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
    [deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
            The config object or dictionary of the vision backbone.
        vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
            The config object or dictionary of the VQVAE backbone.
        image_token_id (`int`, *optional*, defaults to 100581):
            Token index of a placeholder image token.

    Example:

    ```python
    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

    >>> # Initializing a Janus vision config
    >>> vision_config = JanusVisionConfig()

    >>> # Initializing a Llama config
    >>> text_config = LlamaConfig()

    >>> # Initializing a VQ config
    >>> vq_config = JanusVQVAEConfig()

    >>> # Initializing a Janus Pro 1B style configuration
    >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

    >>> # Initializing a model from the Janus Pro 1B style configuration
    >>> model = JanusForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```janus)text_configr;   	vq_configN c                    s`  t |tr|dd|d< t|d  d	i || _n"|d u r*td td  | _nt |tr3|| _n	tdt	| |d u rJtd t
 | _n t |trXt
d	i || _nt |t
ra|| _n	tdt	| |d u rxtd t | _n t |trtd	i || _nt |tr|| _n	tdt	| | jj| jj | j_|| _t jd	i | d S )
Nrf   llamaz7`text_config` is None. Initializing with default valueszTInvalid type for `text_config`. Must be either `dict` or `LlamaConfig`. Type found: zK`vision_config` is None. Initializing with default JanusVisionConfig valuesz\Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`. Type found: zF`vq_config` is None. Initializing with default JanusVQVAEConfig valueszWInvalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`. Type found: rP   )
isinstancedictgetr7   r   loggerinfor6   
ValueErrortyper9   r;   ri   r   rL   rK   rt   image_token_idrQ   rR   )r]   r   r;   r   r   r^   r_   rP   ra   rR   A  sP   










zJanusConfig.__init__)NNNr   )rb   rc   rd   re   rf   r8   r9   ri   sub_configsrR   rh   rP   rP   r_   ra   r     s    -r   c                   @   sH   e Zd ZeZdZdZddgZddgZdZ	dZ
dZdZdZdZdd	 Zd
S )JanusPreTrainedModelmodelTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskFc                 C   s   t | jdr| jjjn| jj}t|tjtjfr0|jj	j
d|d |jd ur.|jj	  d S d S t|tjtjfrH|jj	  |jj	d d S t|tjrg|jj	j
d|d |jd uri|jj	|j   d S d S d S )Nr;   r@   )meanstd      ?)hasattrconfigr;   rZ   r   r   LinearConv2dweightdatanormal_biaszero_	GroupNorm	LayerNormfill_	Embeddingpadding_idx)r]   moduler   rP   rP   ra   _init_weights  s$   


z"JanusPreTrainedModel._init_weightsN)rb   rc   rd   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_cache_class_supports_static_cache!_supports_param_buffer_assignmentr   rP   rP   rP   ra   r   y  s    r   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   @   s2   e Zd ZU dZdZeej ed< dZ	ejed< dS )JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
rb   rc   rd   re   r   r   torchFloatTensor__annotations__r   rP   rP   rP   ra   r     s   
 r   c                   @      e Zd ZdS )JanusBaseModelOutputWithPastNrb   rc   rd   rP   rP   rP   ra   r         r   c                   @   r   )JanusCausalLMOutputWithPastNr   rP   rP   rP   ra   r     r   r   c                   @   s(   e Zd ZddejdedejfddZdS )	JanusVisionEmbeddingsFpixel_valuesinterpolate_pos_encodingreturnc           
      C   sh   |j \}}}}| jjj}| |j|d}|ddd}|r(| |||}	n| | j	}	||	 }|S )Ndtyper&   rn   )
shapepatch_embeddingr   r   toflatten	transposer   position_embeddingposition_ids)
r]   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedsrP   rP   ra   forward  s   
zJanusVisionEmbeddings.forwardN)F)rb   rc   rd   r   Tensorr   r   rP   rP   rP   ra   r     s     r   c                
       sX   e Zd ZdZdef fddZ		ddejdeej deej d	e	e
 fd
dZ  ZS )JanusVisionAttentionz(Attention Class for Janus Vision Encoderr   c                    sL  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _
|j}|j}d| _d| _tj| j| j| j |jd| _tj| j| j| j |jd| _tj| j| j| j |jd| _t| j| j| _|dkrt|nt | _|rt| jnt | _|rt| j| _d S t | _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frn   r   r   )rQ   rR   r   rG   rp   rI   	num_headshead_dimr   scalerM   rX   rY   	is_causalnum_key_value_groupsr   r   rU   q_projk_projv_projprojection_layerDropoutIdentityr   q_normk_norm)r]   r   proj_dropoutqk_normr_   rP   ra   rR     s0   

$zJanusVisionAttention.__init__Nhidden_statesattention_maskoutput_attentionsr^   c                 K   sH  |  \}}}| |}| |}	| |}
|d| j| j}| |}|	d| j| j}	| |	}	|||| j| j	dd}|	||| j| j	dd}	|

||| j| j	dd}
t}| jjdkrjt| jj }|| ||	|
|f| jsvdn| j| j| jd|\}}|||| j}| |}| |}|r||f}|S |d f}|S )Nrn   r&   eagerr@   )rz   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr1   r   _attn_implementationr   trainingrM   r   r   rp   r   rX   )r]   r   r   r   r^   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputoutputsrP   rP   ra   r     sD   




	


zJanusVisionAttention.forward)NN)rb   rc   rd   re   r9   rR   r   r   r   r    r   r   rh   rP   rP   r_   ra   r     s     r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )JanusVisionMLPr   c                    sr   t    || _t|j|j | _t|j | _	t
|j| j| _t
| j|j| _t
|j| _t
|j| _d S N)rQ   rR   r   r   rG   rT   rS   r
   rO   activation_fnr   r   fc1fc2r   rV   dropout1dropout2r]   r   r_   rP   ra   rR     s   
zJanusVisionMLP.__init__r   r   c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r  r   r  r  r  r]   r   rP   rP   ra   r     s   




zJanusVisionMLP.forward)	rb   rc   rd   r9   rR   r   r   r   rh   rP   rP   r_   ra   r     s    
r   c                       "   e Zd Zdef fddZ  ZS )r   r   c                    sX   t    || _|j| _t|| _tj| j|j	d| _
tj| j|j	d| _t|| _d S )N)eps)rQ   rR   r   rG   rp   r   	self_attnr   r   rN   layer_norm1layer_norm2r   mlpr  r_   rP   ra   rR   )  s   

z JanusVisionEncoderLayer.__init__rb   rc   rd   r9   rR   rh   rP   rP   r_   ra   r   (      r   c                       r  )JanusVisionEncoderr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  qS rP   )r   .0r   r   rP   ra   
<listcomp>6  s    z/JanusVisionEncoder.__init__.<locals>.<listcomp>)rQ   rR   r   
ModuleListrangerH   layersr  r_   r  ra   rR   4  s   $zJanusVisionEncoder.__init__r  rP   rP   r_   ra   r  3  r  r  c                       r  )JanusVisionModelr   c                    s   t  | t|| _d S r   )rQ   rR   r  encoderr  r_   rP   ra   rR   :  s   zJanusVisionModel.__init__r  rP   rP   r_   ra   r  9  r  r  c                       *   e Zd Zdef fddZdd Z  ZS )JanusVisionAlignerMLPr   c                    N   t    t j j| _t fddtd j	D | _
t j | _d S )Nc                       g | ]
}t  j jqS rP   r   r   rW   r  r  rP   ra   r  E      z2JanusVisionAlignerMLP.__init__.<locals>.<listcomp>rn   )rQ   rR   r   r   rG   rW   r  r  r  r[   hidden_layersr
   rO   r   r  r_   r  ra   rR   @     
zJanusVisionAlignerMLP.__init__c                 C   ,   |  |}| jD ]}| |}||}q|S r   r  r  r   r]   r   layerrP   rP   ra   r   I  
   



zJanusVisionAlignerMLP.forward)rb   rc   rd   r9   rR   r   rh   rP   rP   r_   ra   r  ?      	r  c                       s8   e Zd Zdef fddZdejdejfddZ  Z	S )JanusVQVAEVectorQuantizerr   c                    s   t  | |jgd | _d S )Nr&   )rQ   rR   rt   quant_state_dimsr  r_   rP   ra   rR   R  s   z"JanusVQVAEVectorQuantizer.__init__image_tokensr   c                 C   sb   |j d }| jjj d }| |}tj|ddd}||g| j|R }|dddd }|S )Nr   r   r&   )pdimr	   rn   )	r   	embeddingr   F	normalizer   r(  permute
contiguous)r]   r)  r   emb_dimhidden_state_quantrP   rP   ra   get_codebook_entryV  s   

z,JanusVQVAEVectorQuantizer.get_codebook_entry)
rb   rc   rd   ri   rR   r   
LongTensorr   r3  rh   rP   rP   r_   ra   r'  Q  s    r'  c                   @   r   )JanusVQVAEResnetBlockNr   rP   rP   rP   ra   r5  f  r   r5  c                   @   r   )JanusVQVAEAttnBlockNr   rP   rP   rP   ra   r6  j  r   r6  c                   @   r   )JanusVQVAEConvDownsampleNr   rP   rP   rP   ra   r7  n  r   r7  c                       s$   e Zd Z fddZdd Z  ZS )JanusVQVAEConvUpsamplec                    s&   t    tjj||dddd| _d S )Nr	   rn   kernel_sizestridepadding)rQ   rR   r   r   r   conv)r]   ru   r_   rP   ra   rR   s  s   
zJanusVQVAEConvUpsample.__init__c                 C   s   t j|ddd}| |}|S )Ng       @nearest)scale_factormode)r-  interpolater=  r  rP   rP   ra   r   w  s   
zJanusVQVAEConvUpsample.forward)rb   rc   rd   rR   r   rh   rP   rP   r_   ra   r8  r  s    r8  c                       s<   e Zd Zdedef fddZdejdejfddZ  Z	S )	JanusVQVAEMidBlockr   channelsc                    s8   t    t|||d| _t|| _t|||d| _d S )Nr   ru   rv   )rQ   rR   r5  block_1r6  attn_1block_2)r]   r   rC  r_   rP   ra   rR   ~  s   

zJanusVQVAEMidBlock.__init__r   r   c                 C   "   |  |}| |}| |}|S r   )rE  rF  rG  r  rP   rP   ra   r        


zJanusVQVAEMidBlock.forward)
rb   rc   rd   ri   r   rR   r   r   r   rh   rP   rP   r_   ra   rB  }  s    rB  c                       s,   e Zd Z fddZdejfddZ  ZS )JanusVQVAEEncoderc              	      sn  t    t|j| _|j| _|j}|j}|j}|j	}|j}t
jj||dddd| _dt| }|| _t | _t| jD ]T}t }	t }
|||  }|||  }t| jD ]}|	t|||d |}|| jd krt|
t| qXt }|	|_|
|_|| jd krt||_| j| q=t||| _t
jjd|ddd	| _t
jj||rd
| n|dddd| _d S )Nr	   rn   r9  )rn   rD  rl   rA   T
num_groupsrJ   r  affiner&   ) rQ   rR   lenrx   num_resolutionsry   rw   ru   rr   rs   r   r   r   conv_intuplein_channel_multiplierr  downr  appendr5  r6  Moduleblockattnr7  
downsamplerB  midr   norm_outconv_out)r]   r   rw   ru   rr   rs   rx   rR  i_levelrV  rW  block_in	block_outi_blockrS  r_   rP   ra   rR     sX   


zJanusVQVAEEncoder.__init__r   c                 C   s   |  |g}t| jD ]C}t| jD ]'}| j| j| |d }t| j| jdkr4| j| j| |}|| q|| jd krN|| j| 	|d  q|d }| 
|}| |}|t|9 }| |}|S )Nr   r   rn   )rP  r  rO  ry   rS  rV  rN  rW  rT  rX  rY  rZ  r   sigmoidr[  )r]   r   r   r\  r_  hidden_statelast_hidden_staterP   rP   ra   r     s$   


zJanusVQVAEEncoder.forward)rb   rc   rd   rR   r   r4  r   rh   rP   rP   r_   ra   rJ    s    3rJ  c                       s2   e Zd Z fddZdejdejfddZ  ZS )JanusVQVAEDecoderc              	      sP  t    t|j| _|j| _|j}|j}|j}||j| jd   }t	j
j||dddd| _t||| _t
 | _tt| jD ]N}t
 }t
 }||j|  }	t| jd D ]}
|t|||	d |	}|| jd krt|t| qXt
 }||_||_|dkrt||_| j| q@t	j
jd|ddd	| _t	j
j||dddd| _d S )
Nrn   r	   r9  rD  r   rl   rA   TrK  )rQ   rR   rN  rx   rO  ry   rw   rs   rv   r   r   r   rP  rB  rY  r  upreversedr  rT  r5  r6  rU  rV  rW  r8  upsampler   rZ  r[  )r]   r   rw   rs   rv   r]  r\  rV  rW  r^  r_  rd  r_   rP   ra   rR     sD   


zJanusVQVAEDecoder.__init__ra  r   c                 C   s   |  |}| |}t| jD ]9}t| jd D ] }| j| j| |}t| j| jdkr8| j| j| |}q|| jd krH| j| 	|}q| 
|}|t|9 }| |}|S )Nrn   r   )rP  rY  r  rO  ry   rd  rV  rN  rW  rf  rZ  r   r`  r[  )r]   ra  r\  r_  rP   rP   ra   r     s   



zJanusVQVAEDecoder.forward)rb   rc   rd   rR   r   r   r   rh   rP   rP   r_   ra   rc    s    .rc  c                       sl   e Zd Zg dZdZdef fddZdejdej	fdd	Z
eedej	deej	ej	f fd
dZ  ZS )
JanusVQVAE)r6  r5  r'  r   r   c                    s(   t  | t|| _d| _|   d S )NF)rQ   rR   rc  decodergradient_checkpointing	post_initr  r_   rP   ra   rR   ,  s   
zJanusVQVAE.__init__r)  r   c                 C   sr   |j d | jjd | jjd  kr'td| jjd | jjd   d|j  d| j|}| |}| |}|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        rn   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizer(  r   r3  post_quant_convrh  )r]   r)  codebook_entryr   r   rP   rP   ra   decode4  s   "	

zJanusVQVAE.decodec                 C   s6   |j d }| |\}}}| ||d}t||S )Nr   r   )r   encodern  r   r   )r]   r   r   quantr   indicesr   rP   rP   ra   r   G  s   

zJanusVQVAE.forward)rb   rc   rd   r   main_input_nameri   rR   r   r4  r   rn  r"   r!   rQ  r   rh   rP   rP   r_   ra   rg  $  s    rg  c                       r  )JanusVQVAEAlignerMLPr   c                    r  )Nc                    r  rP   r  r  r  rP   ra   r  Z  r  z1JanusVQVAEAlignerMLP.__init__.<locals>.<listcomp>rn   )rQ   rR   r   r   rp   rW   r  r  r  rH   r  r
   rO   r   r  r_   r  ra   rR   U  r   zJanusVQVAEAlignerMLP.__init__c                 C   r!  r   r"  r#  rP   rP   ra   r   ^  r%  zJanusVQVAEAlignerMLP.forward)rb   rc   rd   ri   rR   r   rh   rP   rP   r_   ra   rs  T  r&  rs  c                       s<   e Zd ZdZdef fddZdejdejfddZ	  Z
S )	JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                    s>   t    t|j|j| _t|j | _	t|j|j
| _d S r   )rQ   rR   r   r   r{   rW   proj_outr
   rO   r   rq   vision_headr  r_   rP   ra   rR   i  s   
zJanusVQVAEHead.__init__r   r   c                 C   rH  r   )ru  r   rv  r  rP   rP   ra   r   o  rI  zJanusVQVAEHead.forward)rb   rc   rd   re   ri   rR   r   r   tensorr   rh   rP   rP   r_   ra   rt  f  s    rt  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       s   e Zd Zdef fddZdd Zdd Zdd	 Zee		
	
	
	
	
	
	
	
	
	
	dde
jde
jdee
j dee
j dee dee
j dee
j dee dee dee deee
jf fddZ  ZS )
JanusModelr   c                    s   t  | || _t|j| _t| jj| _t	|j
| _t| jjj| jjj| _t| jj| _t| jj| _tj|jd| _d| _|   d S )Nr  F)rQ   rR   r   r  _from_configr;   vision_modelr  alignerrg  r   vqmodelr   r   rq   rp   generation_embeddingsrs  generation_alignerrt  generation_headr'   from_configr   language_modelri  rj  r  r_   rP   ra   rR   |  s   zJanusModel.__init__c                 C   s
   | j  S r   )r  get_input_embeddingsr]   rP   rP   ra   r       
zJanusModel.get_input_embeddingsc                 C   s   | j | d S r   )r  set_input_embeddingsr]   valuerP   rP   ra   r    s   zJanusModel.set_input_embeddingsc                 C   s   |  |}| |j}|S r   )rz  r{  rb  )r]   r   image_embedsrP   rP   ra   get_image_features  s   
zJanusModel.get_image_featuresNr   	input_idsr   r   r   r   cache_positioninputs_embeds	use_cacher   output_hidden_stateslogits_to_keepc                 K   sH  |	d ur|	n| j j}	|
d ur|
n| j j}
|d u |d uA r td| jr/| jr/|r/td d}|d ur;|d ur;td|d u rE|  |}|d ur}| 	|}|| j j
k}|jd }|d|}|ddd|}||j}||j|j}|||}| jd||||||	|
||d	|}t|j|j|j|j|d ur|dS d dS )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oner   )	r  r   r   r   r  r   r  r  r  )rb  r   r   
attentionsimage_hidden_statesrP   )r   r   r  r   ri  r   r   warning_oncer  r  r   r   r   	unsqueezeexpandr   devicer   masked_scatterr  r   rb  r   r   r  )r]   r  r   r   r   r   r  r  r  r   r  r  r^   r  image_attention_maskrp   image_features	lm_outputrP   rP   ra   r     sd   



zJanusModel.forward)NNNNNNNNNNr   )rb   rc   rd   r   rR   r  r  r  r"   r!   r   r4  r   r   r   r   r   r   r   r   rh   rP   rP   r_   ra   rx  v  sR    	
rx  c                       sj  e Zd ZddgZdZdef fddZdd Zd	d
 Zde	j
de	j
fddZdd Zdd Zdd Zdd Zee												d/de	jde	jdee	j
 dee	j dee dee	j dee	j d ee	j d!ee d"ee d#ee d$eee	j
f fd%d&Z						d0 fd'd(	Zd)e	j
fd*d+Ze	j			d1de	j
dee	j d,ee f fd-d.Z  ZS )2JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr   c                    sB   t  | || _t|| _tj|jj|jj	dd| _
|   d S )NFr   )rQ   rR   r   rx  r   r   r   r   rG   
vocab_sizelm_headrj  r  r_   rP   ra   rR     s
   
z&JanusForConditionalGeneration.__init__c                 C   s   | j j S r   )r   r  r  r  rP   rP   ra   r    s   z2JanusForConditionalGeneration.get_input_embeddingsc                 C   s   | j j| d S r   )r   r  r  r  rP   rP   ra   r    s   z2JanusForConditionalGeneration.set_input_embeddingsinputsr   c                 C   s   | j |}| j |}|S r   )r   r}  r~  )r]   r  ra  rP   rP   ra   'prepare_embeddings_for_image_generation  s   zEJanusForConditionalGeneration.prepare_embeddings_for_image_generationc                 C      | j S r   r  r  rP   rP   ra   get_output_embeddings      z3JanusForConditionalGeneration.get_output_embeddingsc                 C   
   || _ d S r   r  )r]   new_embeddingsrP   rP   ra   set_output_embeddings  r  z3JanusForConditionalGeneration.set_output_embeddingsc                 C   r  r   r   )r]   rh  rP   rP   ra   set_decoder  r  z)JanusForConditionalGeneration.set_decoderc                 C   r  r   r  r  rP   rP   ra   get_decoder	  r  z)JanusForConditionalGeneration.get_decoderNr   r  r   r   r   r   r  r  labelsr  r   r  r  c                 K   s   |
dur|
n| j j}
|dur|n| j j}| jd|||||||	|
||d
|}|j}t|tr5t| dn|}| |dd|ddf }d}|durV| j	||| j j
jd}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)
r  r   r   r   r   r  r  r   r  r  )logitsr  r  )lossr  r   r   r  r  rP   )r   r   r  r   rb  r   r   slicer  loss_functionr   r  r   r   r   r  r  )r]   r  r   r   r   r   r  r  r  r  r   r  r  r^   r   r   slice_indicesr  r  rP   rP   ra   r     s>   z%JanusForConditionalGeneration.forwardc           
         s8   t  j|f|||||d|}	|d dkr||	d< |	S )N)r   r  r   r  r  r   r   )rQ   prepare_inputs_for_generation)
r]   r  r   r   r   r  r  r  r^   model_inputsr_   rP   ra   r  H  s   z;JanusForConditionalGeneration.prepare_inputs_for_generationr)  c                 C   s"   | j j|}|dddd}|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r&   r	   rn   )r   r|  rn  r/  )r]   r)  decoded_imagerP   rP   ra   decode_image_tokensf  s   z1JanusForConditionalGeneration.decode_image_tokenslogits_processorc           %         sz  | d| j}t|}| dd}|dkr$t jd|||d d|S |jdi |}| tj	tj
fvr:td|  | |  |d urK|nt }d|d< |jd u r_td d	|_|j|d
< | ||j|\}}	}|j|j}
}t|jdkrtd|j d|d u}| j|||jd |jr|jdkr|t|j d |_| j||jd |d ||d}| jd|||jd|\}}| jjj j!}|j\}}|"dd}| dd }|"dd}||d< ||d d d f |jk||d d d f |j#d k@ }||d d d f $||j% | & |}| '|||}|(dd d u r=| j)|j*p,d|d t+|j,|| ||d|d< t-j.||f|
|d}|j/}|j0}|j1}|j2}|j3}|r^|r^dnd }|rh|rhdnd }|rr|rrdnd }|r||r|dnd }t4|D ]}| j5d||d|}|d 6|j|d< |d 6|j|d< | jj7di |||d}| 8||}|j9d d dd d f : } | j;| }!|||!}"|j<rt-j=|"dd}#t-j>|#dd?d}$nt-j@|"dd}$|$|d d |f< t-A|$|$g}$|$Bd}$| C|$}q|r-|r||!f7 }|r|| D f7 }|r%||jE7 }|r-||jF7 }|r;tG||!||||jHdS |S ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r&   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  rn   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr   max_cache_lenr  model_kwargs)r   r  rP   )r  r  r  )r   r  r   )r+  )num_samples)	sequencesscoresr  r  r   r   )Ipopr  copydeepcopyrQ   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  r   warning_prepare_model_inputsbos_token_idr   r  rN  r   _prepare_special_tokensrT  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   rz  r   r\   repeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionr   
_get_cacher  max
max_lengthr   zerosr   r  output_scoresoutput_logitsreturn_dict_in_generater  r  r   r  #_update_model_kwargs_for_generationrb  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r  r   r  r   r   r   )%r]   r  r   r  r^   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskr\   r   r   input_tokensmaskr  generated_tokensr   r  r  r  r  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r   ra  r  next_token_scoresprobs
next_tokenr_   rP   ra   r  r  s   	
















	z&JanusForConditionalGeneration.generate)NNNNNNNNNNNr   )NNNNNN)NNN) rb   rc   rd   _tied_weights_keysr   r   rR   r  r  r   r   r  r  r  r  r  r"   r!   r4  r   r   r   r   r   r   r   r  r  no_gradr   r  rh   rP   rP   r_   ra   r    s    		
=r  c                       s  e Zd ZdZdddejddddddf
dedeee	e
f  de
d	ed
edee
ef dedeeeee f  deeeee f  dee f fddZ			d"dejdee
ee
e
e
f f deee	ef  deee	ef  dejf
ddZejddfdejdeee	e
f e
f d	edeee	ef  deee	ef  dejfddZ							d#ded
ee dee dee deee  deee  dee	 dee	 fddZ	d$dejdeeee f deeee f deee	ef  dejf
d d!Z  ZS )%JanusImageProcessora
  
    Constructs a JANUS image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        min_size (`int`, *optional*, defaults to 14):
            The minimum allowed size for the resized image. Ensures that neither the height nor width
            falls below this value after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    TN   gp?	do_resizer   min_sizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbc                    sB   t  jdi | || _|d u rd| _d S tdd |D | _d S )N)   r  r  c                 S   s   g | ]}t |d  qS )   )r   )r  xrP   rP   ra   r  m  s    z0JanusImageProcessor.__init__.<locals>.<listcomp>rP   )rQ   rR   r  background_colorrQ  )r]   r  r   r  r  r  r  r   r  r  r  r^   r_   rP   ra   rR   Y  s
   
zJanusImageProcessor.__init__r   imager  data_formatinput_data_formatr   c                 C   s  t ||\}}|tjkr|jd n|jd }||kr*|dur&t|||}|S |}|S t||}t|tr8|g}nt||krFt	d| d|tjkrt
j|||f|jd}	t|D ]\}
}||	|
ddddf< qZ||kr|| d }||	dd||| ddf< |	S || d }||	dddd||| f< |	S t
j|||f|jd}	t|D ]\}
}||	dddd|
f< q||kr|| d }||	||| ddddf< |	S || d }||	dd||| ddf< |	S )a}  
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing for multi-channel images. If passed as integer
                in mutli-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.
        r   r   Nz(background_color must have no more than z) elements to match the number of channelsr   r&   )r   r   FIRSTr   r   r  r   r   rN  r   npr  r   	enumerate)r]   r  r  r	  r
  r   r   rJ   max_dimresultr  colorstartrP   rP   ra   pad_to_squareo  sL   




z!JanusImageProcessor.pad_to_squarec                 K   s   |du rt |}t||\}}t||}	t|dd}|d |d kr0td|d  d|d  |d }||	 }
tt||
 | jtt||
 | jg}t|f||||d|}| j|| j	|d	}|S )
a  
        Resize an image to dynamically calculated size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `None`: will be inferred from input
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        NT)default_to_squarer   r   z5Output height and width must be the same. Got height=z and width=)r   r  r	  r
  )r  r  r
  )
r   r   r  r   r   r   r  r   r  r  )r]   r  r   r  r	  r
  r^   r   r   max_sizedeltaoutput_size_nonpaddedrP   rP   ra   r     s<   !
	zJanusImageProcessor.resizeimagesreturn_tensorsc	                 C   sR  |dur|n| j }|du rd| j n|}|dur|n| j}|dur#|n| j}|dur,|n| j}t|}t|d tjjrHt	|dkrD|S |d S |du rRt
|d }g }	|D ]@}
t|
}
|rg| j|
|||d}
|r{| j|
||d}
|
ddtj}
|r|r|dkrt|
tj|d	}
tj|
}
|	|
 qVd
|	i}|dkr|nd}t||dS )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Nr   r   rn   )r  r  r  r
  )r   r
  r  zPIL.Image.Image)input_channel_dimr   )r   tensor_type)r  r  r   r  r  r   r   PILImagerN  r   r   unnormalizerescaleclipastyper  uint8r   r   LAST	fromarrayrT  r   )r]   r  r  r  r   r  r  r
  r  r   r  r   rP   rP   ra   postprocess  s6   zJanusImageProcessor.postprocessc                 C   s   d}t |trt||krtd| dt| n|g| }t |tr7t||kr6td| dt| n|g| }tdd t||D }tdd |D }| j||||d}|S )	a~  
        Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
        image = (image * image_std) + image_mean
        Args:
            image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
                Batch of pixel values to postprocess.
            image_mean (`float` or `Iterable[float]`):
                The mean to use for unnormalization.
            image_std (`float` or `Iterable[float]`):
                The standard deviation to use for unnormalization.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        r	   zmean must have z$ elements if it is an iterable, got zstd must have c                 s   s    | ]
\}}| | V  qd S r   rP   )r  r   r   rP   rP   ra   	<genexpr>W  s    z2JanusImageProcessor.unnormalize.<locals>.<genexpr>c                 s   s    | ]}d | V  qdS )rn   NrP   )r  r   rP   rP   ra   r%  X  s    )r  r   r   r
  )r   r   rN  r   rQ  zipr.  )r]   r  r  r  r
  rJ   rev_image_meanrev_image_stdrP   rP   ra   r  1  s"   



zJanusImageProcessor.unnormalize)r   NN)NNNNNNNr   )rb   rc   rd   re   r   BICUBICr   r   r   strr   r   r   r   rR   r  ndarrayrQ  r   arrayr  r   r   r$  r   r  rh   rP   rP   r_   ra   r  3  s    '
	

N
H

	
8r  )	r  r   r  rx  rg  r  ri   r9   r   )yr  collections.abcr   dataclassesr   typingr   r   r   numpyr  r   r   .transformers.models.blip.image_processing_blipr   activationsr
   cache_utilsr   
generationr   r   r   r   generation.utilsr   image_processing_utilsr   r   image_transformsr   r   image_utilsr   r   r   r   r   r   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   r   processing_utilsr    utilsr!   r"   r#   r$   r%   autor'   blip_2.modeling_blip_2r(   !chameleon.configuration_chameleonr)   chameleon.modeling_chameleonr*   r+   r,   r-   r.   idefics.modeling_ideficsr/   r0   llama.modeling_llamar1   siglip.configuration_siglipr2   siglip.modeling_siglipr3   r4   r5   torch.nntorch.nn.functional
functionalr-  torch.utils.checkpointr  configuration_utilsr6   r7   r8   
get_loggerrb   r   r9   ri   r   r   r   r   r   r   rU  r   r   r   r  r  r  r'  r5  r6  r7  r8  rB  rJ  rc  rg  rs  rt  rx  r  r  __all__rP   rP   rP   ra   <module>   s   $	
aZm OMD0m  M  .