o
    پi                    @   s2  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZ d dlZd dlm  mZ d dlmZ d dlmZmZmZ d d	lm Z  d d
l!m"Z"m#Z# d dl$T d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 e9G dd dZ:			ddedej;de<de<dej;f
dd Z=d!d" Z>d#e>iZ?d dl@Z d$d% ZAd&d' ZB	(	)	*	+dd,ejd-eCd.eCd/eCd0eCf
d1d2ZDeAd3ZEG d4d5 d5eFeZGd6ejd7eGfd8d9ZH	:		dd;eIeJ d<eFd=e<d>e<fd?d@ZKG dAdB dBej;ZLG dCdD dDej;ZM	ddEeCdFe<dGe<fdHdIZNG dJdK dKej;ZOG dLdM dMej;ZPeeFeeejj; f ZQG dNdO dOej;ZR		P	:		ddQejd;eIeJ dReeIeJ  dSeJd<eFd=e<d>e<fdTdUZSdVdW ZTddej;dXeFddfdYdZZUG d[d\ d\ej;ZVd]d^ ZWG d_d` d`ejj;ZXdadbdcdddedfdgddhdidbdcdddedfdgddhdjdedkdldedmdgddhdnZY	o	j	p	ddqeFdreJdseJdteFfdudvZZG dwdx dxejj;Z[G dydz dzej;Z\G d{d| d|ej;Z]G d}d~ d~ej;Z^e_ejjdZ`dejav reJejad ZbndPZbdZcdde<de<fddZdG dd dej;ZeG dd dej;ZfG dd dej;ZgG dd dej;ZhG dd dej;ZiG dd dej;Zjdd ZkdddxZ[G dd dej;ZlG dd dej;ZmdddZnG dd dej;ZoG dd de#ZpG dd depZqe"jreseqd eqgZtdS )    N)field)Enum)partial)repeat)
CallableFinalIterableLiteralOptionalSequenceSetTupleTypeUnion)	rearrange)Tensor_assertnn)trunc_normal_)	AutoModelPreTrainedModel)*)VisionAttention)LogitsProcessor)QuantizationConfig))MultiModalityDataPaddingPatternTokenPairsgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)LlamaForCausalLM)loggerc                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< edd dZee ed< edd dZee ed< dZeed< d
Zeed< dS )	ModelArgsi @  codebook_size   codebook_embed_dimTcodebook_l2_normcodebook_show_usageg      ?commit_loss_beta        entropy_loss_ratioc                   C      g dS N   r/      r0       r2   r2   r2   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/deepseek_janus_pro.py<lambda>L       zModelArgs.<lambda>)default_factoryencoder_ch_multc                   C   r,   r-   r2   r2   r2   r2   r3   r4   M   r5   decoder_ch_mult   
z_channels	dropout_pN)__name__
__module____qualname__r$   int__annotations__r&   r'   boolr(   r)   floatr+   r   r7   Listr8   r:   r;   r2   r2   r2   r3   r#   C   s   
 r#    TFfnmoduledepth_firstinclude_rootreturnc                 C   sf   |s
|r
| ||d |  D ]\}}|rd||fn|}t| |||dd q|r1|r1| ||d |S )NrF   name.T)rE   rF   rK   rG   rH   )named_childrenjoinnamed_apply)rE   rF   rK   rG   rH   
child_namechild_moduler2   r2   r3   rO   R   s   rO   c                  K   s    t tdg dg dd| S )Nr.   )r7   r8   r2   )VQModelr#   )kwargsr2   r2   r3   VQ_16i   s   rT   zVQ-16c                    s    fdd}|S )Nc                    s.   t | tjjrt | tst| S tt|  S N)
isinstancecollectionsabcr   strtupler   xnr2   r3   parsex   s   z_ntuple.<locals>.parser2   )r^   r_   r2   r]   r3   _ntuplew   s   r`   c           	      C   s  dd }||d|  k s||d|  krt jddd ||| | }||| | }| d| d d| d  | jtjtjfv rU| j}| tj} | 	  | |} n| 	  | 
|td  | | | jtjkr}| tj} | j||d d S | j||d d S )	Nc                 S   s   dt | t d  d S )N      ?       @)matherfsqrtr[   r2   r2   r3   norm_cdf   s   z _trunc_normal_.<locals>.norm_cdfr0   zjmean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect.)
stacklevelr/   rb   )minmax)r"   warnuniform_dtypetorchfloat16bfloat16tofloat32erfinv_mul_rc   re   add_clamp_)	tensormeanstdabrf   luog_dtyper2   r2   r3   _trunc_normal_   s*    	
r~   r*   ra          rb   rv   rw   rx   ry   rz   c                 C   sN   t   t| dd|| | || W d   dS 1 s w   Y  dS )al  Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(	ext{mean}, 	ext{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq 	ext{mean} \leq b`.
    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.
    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    r   ra   N)rm   no_gradr~   rs   rt   )rv   rw   rx   ry   rz   r2   r2   r3   trunc_normal_tf_   s   
"r   r0   c                   @   s   e Zd ZdZdZdZdZdS )FormatNCHWNHWCNCLNLCN)r<   r=   r>   r   r   r   r   r2   r2   r2   r3   r      
    r   r\   fmtc                 C   sV   |t jkr| dddd} | S |t jkr| ddd} | S |t jkr)| d} | S )Nr   r0      r/   )r   r   permuter   flatten	transposer   )r\   r   r2   r2   r3   nchw_to   s   



r   bicubicnew_sizeinterpolation	antialiasverbosec              	      s.  ddl zddlm} W n ty   ddlm} Y nw t| jdks'J dtdks1J d| jdd }t|tkrB| S |rTt	d	| j d
 d d  fddfdd}||}tj
j|j| jdfdd}	|||	dddd}
| j}|  } |
| } | |} | S )a/  Resample the weights of the patch embedding kernel to target resolution.
    We resample the patch embedding kernel by approximately inverting the effect
    of patch resizing.

    Code based on:
      https://github.com/google-research/big_vision/blob/b00544b81f8694488d5f36295aeb7972f3755ffe/big_vision/models/proj/flexi/vit.py

    With this resizing, we can for example load a B/8 filter into a B/16 model
    and, on 2x larger input image, the result will match.

    Args:
        patch_embed: original parameter to be resized.
        new_size (tuple(int, int): target shape (height, width)-only.
        interpolation (str): interpolation for resize
        antialias (bool): use anti-aliasing filter in resize
        verbose (bool): log operation
    Returns:
        Resized patch embedding kernel.
    r   N)vmapr1   zFour dimensions expectedr0   zNew shape should only be hwzResize patch embedding  to z, w/ z interpolation.c                    s,   t | d }tj|| dd  }|S )N)NN.sizemoder   )r   r   .)rm   r   Finterpolatenumpy)x_np	_new_sizex_tfx_upsampled)r   r   r2   r3   resize  s   z$resample_patch_embed.<locals>.resizec                    sT   g }t  | D ]} | }d| || < |||d q	 |jS )Nra   )rangeprodzerosunravel_indexappendreshapestackT)	_old_sizer   mati	basis_vec)npr   r2   r3   get_resize_mat  s   
z,resample_patch_embed.<locals>.get_resize_matdevicec                    s   |  d }|  S )Nr   )r   )kernelresampled_kernel)r   resize_mat_pinvr2   r3   resample_kernel   s   
z-resample_patch_embed.<locals>.resample_kernelr/   )r   rm   r   ImportError
torch.funclenshaperZ   r"   inforv   linalgpinvr   r   rl   rB   rp   )patch_embedr   r   r   r   r   old_sizer   
resize_matr   v_resample_kernel
orig_dtyper2   )r   r   r   r   r   r   r3   resample_patch_embed   s8   

r   c                       s.  e Zd ZU dZeed< ejje	 ed< 													
d de
e dededede
e de	de
e de	de	de	f fddZdeeeeef f fddZ		d!de
eeeeef f  de
eeeeef f  fddZd"deeeef ef fddZdeeef deeef fddZdd Z  ZS )#
PatchEmbedz2D Image to Patch Embedding
output_fmtdynamic_img_pad      r      NTFimg_size
patch_sizein_chans	embed_dim
norm_layerr   biasstrict_img_sizec                    s   t    tt|| _| |\| _| _| _|d ur$d| _	t
|| _n|| _	t
j| _|	| _|
| _tj|||||d| _|rE||| _d S t | _d S )NFkernel_sizestrider   )super__init__rZ   	to_2tupler   _init_img_sizer   	grid_sizenum_patchesr   r   r   r   r   r   r   Conv2dprojIdentitynorm)selfr   r   r   r   r   r   r   r   r   r   	__class__r2   r3   r   4  s   

 zPatchEmbed.__init__c                 C   sR   | j sJ |d u rdS t|}tdd t|| j D }|d |d  }|||fS )N)NNNc                 S   s   g | ]\}}|| qS r2   r2   ).0spr2   r2   r3   
<listcomp>Y  s    z-PatchEmbed._init_img_size.<locals>.<listcomp>r   r/   )r   r   rZ   zip)r   r   r   r   r2   r2   r3   r   T  s   

zPatchEmbed._init_img_sizec                 C   s   d }|d ur
t |}|d urZ|| jkrZt 8 tj| jj| jj||| jj	d ud}|j
t| jj
|dd | jj	d urE|j	| jj	 || _W d    n1 sRw   Y  || _|p^| j}|| jksh|d uru| |\| _| _| _d S d S )Nr   T)r   )r   r   rm   r   r   r   r   in_channelsout_channelsr   weightcopy_r   r   r   r   r   )r   r   r   new_patch_sizenew_projr2   r2   r3   set_input_size]  s4   


zPatchEmbed.set_input_sizerI   c                 C   s   |rt | jS | jS rU   )ri   r   )r   	as_scalarr2   r2   r3   
feat_ratio{  s   
zPatchEmbed.feat_ratioc                 C   sZ   | j rt|d | jd  t|d | jd  fS |d | jd  |d | jd  fS )zGet grid (feature) size for given image size taking account of dynamic padding.
        NOTE: must be torchscript compatible so using fixed tuple indexing
        r   r/   )r   rc   ceilr   )r   r   r2   r2   r3   dynamic_feat_size  s
   $zPatchEmbed.dynamic_feat_sizec                 C   sv  |j \}}}}| jd urg| jr8t|| jd kd| d| jd  d t|| jd kd| d| jd  d n/| jsgt|| jd  dkd| d| jd  d t|| jd  dkd| d| jd  d | jr| jd || jd   | jd  }| jd || jd   | jd  }t|d|d|f}| |}| j	r|	d
dd}n| jtjkrt|| j}| |}|S )	Nr   zInput height (z) doesn't match model (z).r/   zInput width (z%) should be divisible by patch size (r0   )r   r   r   r   r   r   r   padr   r   r   r   r   r   r   r   )r   r\   BCHWpad_hpad_wr2   r2   r3   forward  s>   
""

zPatchEmbed.forward)
r   r   r   r   NTNTTF)NN)T)r<   r=   r>   __doc__r   r@   rm   jitr   rA   r
   r?   r   rY   r   r   r   r   r   r   r   r   __classcell__r2   r2   r   r3   r   .  s\   
 	
 
 "r   c                       s:   e Zd ZdZddejddddf fdd	Zdd	 Z  ZS )
MlpzMLP as used in Vision Transformer, MLP-Mixer and related networks

    NOTE: When use_conv=True, expects 2D NCHW tensors, otherwise N*C expected.
    NTr*   Fc	                    s   t    |p|}|p|}t|}t|}	|rttjddntj}
|
|||d d| _| | _t	|	d | _
|d ur?||nt | _|
|||d d| _t	|	d | _d S )Nr/   )r   r   r   )r   r   r   r   r   r   Linearfc1actDropoutdrop1r   r   fc2drop2)r   in_featureshidden_featuresout_features	act_layerr   r   dropuse_conv
drop_probslinear_layerr   r2   r3   r     s   
zMlp.__init__c                 C   @   |  |}| |}| |}| |}| |}| |}|S rU   )r   r   r  r   r  r  r   r\   r2   r2   r3   r        





zMlp.forward)	r<   r=   r>   r   r   GELUr   r   r   r2   r2   r   r3   r     s    r   	drop_probtrainingscale_by_keepc                 C   s`   |dks|s| S d| }| j d fd| jd   }| ||}|dkr,|r,|| | | S )a(  Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

    r*   r/   r   r/   )r   ndim	new_empty
bernoulli_div_)r\   r  r  r  	keep_probr   random_tensorr2   r2   r3   	drop_path  s   
r  c                       s<   e Zd ZdZddedef fddZdd	 Zd
d Z  Z	S )DropPathzYDrop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).r*   Tr  r  c                    s   t t|   || _|| _d S rU   )r   r  r   r  r  )r   r  r  r   r2   r3   r     s   
zDropPath.__init__c                 C   s   t || j| j| jS rU   )r  r  r  r  r  r2   r2   r3   r        zDropPath.forwardc                 C   s   dt | jddS )Nz
drop_prob=r   z0.3f)roundr  r   r2   r2   r3   
extra_repr  r  zDropPath.extra_repr)r*   T)
r<   r=   r>   r   rB   rA   r   r   r  r   r2   r2   r   r3   r    s
    r  c                       s   e Zd Zdddddddejejef
dededede	d	e	d
edede
e dedejdejdejddf fddZdejdejfddZ  ZS )VisionTransformerBlock      @Fr*   Ndim	num_heads	mlp_ratioqkv_biasqk_norm	proj_drop	attn_dropinit_valuesr  r  r   	mlp_layerrI   c                    s   t    ||| _t|||d|d| _|rt||dnt | _|	dkr)t	|	nt | _
||| _||t|| |
|d| _|rHt||dnt | _|	dkrXt	|	| _d S t | _d S )NT)r   r#  projection_sizeuse_qkv_paralleldropout)r)  r*   )r  r  r  r  )r   r   norm1r   attn
LayerScaler   r   ls1r  
drop_path1norm2r?   mlpls2
drop_path2)r   r"  r#  r$  r%  r&  r'  r(  r)  r  r  r   r*  r   r2   r3   r     s,   

	

$zVisionTransformerBlock.__init__r\   c              
   C   sD   ||  | | | | }|| | | | | }|S rU   )r2  r1  r/  r.  r6  r5  r4  r3  r  r2   r2   r3   r   +  s     zVisionTransformerBlock.forward)r<   r=   r>   r   r  	LayerNormr   r?   rB   rA   r
   Moduler   rm   r   r   r   r2   r2   r   r3   r      sN    	
*r   c                	       sv   e Zd ZU dZejje ed< 				dde	de
dedef fd	d
Zdeejeejeej f f fddZ  ZS )PatchDropoutzO
    https://arxiv.org/abs/2212.00794 and https://arxiv.org/pdf/2208.07220
    return_indices      ?r/   Fprobnum_prefix_tokensorderedc                    sB   t    d|  krdk sJ  J || _|| _|| _|| _d S )Nr   ra   )r   r   r<  r=  r>  r:  )r   r<  r=  r>  r:  r   r2   r3   r   ;  s   

zPatchDropout.__init__rI   c              	   C   s  | j r| jdkr| jr|d fS |S | jr,|d d d | jf |d d | jd f }}nd }|jd }|jd }tdt|d| j  }tjtj	|||j
dddd d d |f }| jre|jddd }|d|dd|jd	d   }|d urtj||fdd}| jr||fS |S )
Nr*   r   r/   ra   r   r   r"  )r   r   r0   )r  r<  r:  r=  r   ri   r?   rm   argsortrandnr   r>  sortgather	unsqueezeexpandcat)r   r\   prefix_tokensr   Lnum_keepkeep_indicesr2   r2   r3   r   K  s.   

&zPatchDropout.forward)r;  r/   FF)r<   r=   r>   r   rm   r   r   rA   r@   rB   r?   r   r   r   r   r
   r   r   r2   r2   r   r3   r9  4  s&   
 r9  r/   posembr   r=  c                 C   sL  | j d }|d |d  | }||kr|d |d kr| S |d u r.tt|| }	|	|	f}|rF| d d d |f | d d |d f }
} nd | }
} | j d }| j}|  } | d|d |d ddddd} tj	| |||d} | dddddd|} | 
|} |
d urtj|
| gdd} tj s|rtd| d	| d
 | S )Nr/   r   r   r   r0   r   r?  zResized position embedding: r   rL   )r   r?   rc   re   rl   rB   r   r   r   r   rp   rm   rF  r   is_scriptingr"   r   )rK  r   r   r=  r   r   r   num_pos_tokensnum_new_tokenshwposemb_prefixr   r   r2   r2   r3   resample_abs_pos_embedo  s4   



$
rQ  c                 C   <   | j d urt| j | j jd d d t| j| jd d d S Nr/         ࿩rx   )	pos_embedr   r   latent
latent_dimr  r2   r2   r3   init_weights     
rY  rK   c                 C   sT   t | tjrt| jdd | jdurtj| j dS dS t| dr(| 	  dS dS )zCViT weight initialization, original timm impl (for reproducibility){Gz?rU  NrY  )
rV   r   r   r   r   r   initzeros_hasattrrY  rJ   r2   r2   r3   init_weights_vit_timm  s   

r_  c                C       s8  e Zd ZU dZee ed< ddddddd	d	d
dddddddddddddddddeddee	df de
eeeef f de
eeeef f dededed dededededededee deded ed!ed"ee ded#ed$ed%ed&ed'ed(ed)ed*ed+ d,ed-ee d.ee d/eej d0eej d1ed2dfB fd3d4ZdPd5ed6 d2dfd7d8Zejjd2efd9d:ZejjdQd;ed2efd<d=Zejjd2ejfd>d?ZdRded2dfd@dAZdBej d2ej fdCdDZ!	EdSdBej dFe
ee"f d2e#ej  fdGdHZ$dBej d2ej fdIdJZ%dQdBej dKed2ej fdLdMZ&dBej d2ej fdNdOZ'  Z(S )TVisionTransformerzVision Transformer

    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929
    dynamic_img_sizer   r   r   i  tokenr      r!  TFNr   r*   rD   r   r   r   num_classesglobal_poolrD   avgrb  mapr   depthr#  r$  r%  r&  r)  class_tokenno_embed_class
reg_tokenspre_normfc_normr   	drop_ratepos_drop_ratepatch_drop_rateproj_drop_rateattn_drop_ratedrop_path_rateweight_init)skipjaxjax_nlhbmocorD   embed_layer_norm_layer
_act_layerblock_fnr*  ignore_headrI   c!           %         s  t    |dv sJ |s|dksJ |du r|dkn|}!ttjddtj || _|| _ | _| _	|r7dnd| _
|  j
|7  _
|| _|| _|| _|| _d	| _| | _i }"|r`|"td	d
d |d|||| |d|"| _| jj}#|rttddnd| _|rttd|nd| _|r|#n|#| j
 }$ttd|$d | _tj|d| _|dkrt|| j
d| _nt  | _|rǈnt  | _!dd t"d||D tj# 	
fddt$|D  | _%|!snt  | _&|dkrt't(_'t(| j		d| _)nd| _)|!rnt  | _*t|| _+|dkr2t,| j	|nt  | _-|dkrC| '| dS dS )a  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of image input channels.
            num_classes: Number of classes for classification head.
            global_pool: Type of global pooling for final sequence (default: 'token').
            embed_dim: Transformer embedding dimension.
            depth: Depth of transformer.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: Enable bias for qkv projections if True.
            init_values: Layer-scale init values (layer-scale enabled if not None).
            class_token: Use class token.
            no_embed_class: Don't include position embeddings for class (or reg) tokens.
            reg_tokens: Number of register tokens.
            fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'.
            drop_rate: Head dropout rate.
            pos_drop_rate: Position embedding dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            weight_init: Weight initialization scheme.
            embed_layer: Patch embedding layer.
            _norm_layer: Normalization layer.
            _act_layer: MLP activation layer.
            block_fn: Transformer block layer.
        rf  rb  Nrg  ư>)epsr/   r   Fr   )r   r   )r   r   r   r   r   r   r[  )r   r=  c                 S   s   g | ]}|  qS r2   )item)r   r\   r2   r2   r3   r   2  s    z.VisionTransformer.__init__.<locals>.<listcomp>c                    s0   g | ]}	
|  d qS ))r"  r#  r$  r%  r&  r)  r'  r(  r  r   r  r*  r2   )r   r   r  rs  r}  dprr   r)  r*  r$  r   r#  rr  r&  r%  r2   r3   r   6  s"    rh  )r#  r$  r   rv  r2   ).r   r   r   r   r7  r  rd  re  num_featuresr   r=  num_reg_tokenshas_class_tokenrk  ra  grad_checkpointingr~  updatedictr   r   	Parameterrm   r   	cls_token	reg_tokenrA  rV  r   pos_dropr9  
patch_dropr   norm_prelinspace
Sequentialr   blocksr   rY  AttentionPoolLatent	attn_poolrn  	head_dropr   head)%r   r   r   r   rd  re  r   ri  r#  r$  r%  r&  r)  rj  rk  rl  rm  rn  ra  r   ro  rp  rq  rr  rs  rt  ru  rz  r{  r|  r}  r*  r~  use_fc_norm
embed_argsr   	embed_lenr   r  r3   r     s   
>
	

"


zVisionTransformer.__init__r   rw  rx  ry  rD   c                 C   sD   |dv sJ t | jdd | jd urtjj| jdd tt|  d S )Nr  r[  rU  r  )r   rV  r  r   r\  normal_rO   r_  )r   r   r2   r2   r3   rY  ^  s
   
zVisionTransformer.init_weightsc                 C   s   h dS )N>   r  rV  
dist_tokenr2   r  r2   r2   r3   no_weight_decayf     z!VisionTransformer.no_weight_decaycoarsec                 C   s   t dddgdS )Nz ^cls_token|pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr  )r  )r   r  r2   r2   r3   group_matcherj  s   zVisionTransformer.group_matcherc                 C   s   | j S rU   )r  r  r2   r2   r3   get_classifierq  s   z VisionTransformer.get_classifierc                 C   s~   || _ |d ur*|dv sJ |dkr| jd u r	J d|dkr'| jd ur'd | _|| _|dkr8t| j|| _d S t | _d S )Nrf  rh  Fz=Cannot currently add attention pooling in reset_classifier().zmap r   )rd  r  re  r   r   r   r   r  )r   rd  re  r2   r2   r3   reset_classifieru  s   z"VisionTransformer.reset_classifierr\   c                 C   s   | j r"|j\}}}}t| j||g| jrdn| jd}||d|}n| j}g }| jd ur:|| j	|jd dd | j
d urM|| j
	|jd dd | jra|| }|r`tj||g dd}n|rmtj||g dd}|| }| |S )Nr   r  r   r/   r?  )ra  r   rQ  rV  rk  r=  viewr  r   rE  r  rm   rF  r  )r   r\   r   r   r   r   rV  to_catr2   r2   r3   
_pos_embed  s.   


zVisionTransformer._pos_embedr/   r^   c                 C   s   g t | j}}tt|trt|| |n|}| |}| |}| |}| 	|}t
| jD ]\}}||}||v rB|| q1|S rU   )r   r  setrV   r?   r   r   r  r  r  	enumerater   )r   r\   r^   outputs
num_blockstake_indicesr   blkr2   r2   r3   _intermediate_layers  s   




z&VisionTransformer._intermediate_layersc                 C   r  rU   )r   r  r  r  r  r   r  r2   r2   r3   forward_features  r  z"VisionTransformer.forward_features
pre_logitsc                 C   s|   | j d ur|  |}n | jdkr |d d | jd f jdd}n| jr+|d d df }| |}| |}|r9|S | |S )Nrg  r/   r?  r   )r  re  r=  rw   rn  r  r  )r   r\   r  r2   r2   r3   forward_head  s   

 

zVisionTransformer.forward_headc                 C   s   |  |}| js| |}|S rU   )r  r~  r  r  r2   r2   r3   r     s   

zVisionTransformer.forwardrD   FrU   r  ))r<   r=   r>   r   r   rA   r@   r   r   r   r   r?   r   r	   rB   r
   r   	LayerTyper   r   r8  r   rY  rm   r   ignorer   r  Dictr  r  r  r   r  r   rC   r  r  r  r   r   r2   r2   r   r3   r`    s   
 	
 !" '$

	r`  c                 C   sT   d| v rt }|S d| v rt}|S d| v rt|  }|S d| v r"t}|S td|  d)NMlpProjectorCLIPVisionTowerVQvision_headzclass_name z is invalid.)r  r  	VQ_modelsr  
ValueError)cls_nameclsr2   r2   r3   model_name_to_cls  s   
r  c                       $   e Zd Z fddZdd Z  ZS )r  c                    sJ   t    tj|d |d | _tj | _tj|d |d | _d S )Nn_embedimage_token_embedimage_token_size)	r   r   rm   r   r   output_mlp_projectorr  vision_activationr  )r   paramsr   r2   r3   r     s   

zvision_head.__init__c                 C   s"   |  |}| |}| |}|S rU   )r  r  r  r  r2   r2   r3   r     s   


zvision_head.forwardr<   r=   r>   r   r   r   r2   r2   r   r3   r    s    
r  P     i     r   gZӼ@rh  )
image_sizer   widthlayersheadsr$  re  use_checkpointr     i      r1   )siglip_so400m_patch14_384siglip_so400m_patch14_224siglip_large_patch16_384r  r   
model_namer  select_layer	ckpt_pathc           
      K   s   | t  v sJ dt   tdi t |  }|dkr(t|j|j| d }nt|j|}t||j|j||j|j	|j
|j|dd|dddd}|rgtj|d	dd
}|j|dd}	td| d|	 d |S )Nzmodel name should be in r   r/   r~  Tru  rv  )r   r   r   ri  r#  r$  rj  re  r~  ru  rd  cpu)map_locationweights_onlyF)strictzSigLIP-ViT restores from z,
	incompatible_keys:', rL   r2   )SigLIP_MODEL_CONFIGkeysSigLIPVisionCfgrh   r  r`  r   r  r  r$  rj  re  getrm   loadload_state_dictprint)
r  r  r  r  rS   
vision_cfgr  model
state_dictincompatible_keysr2   r2   r3   create_siglip_vit  s:   

r  c                       sB   e Zd ZdZd fdd	ZdedefddZdefd	d
Z  Z	S )	Normalizea  Normalize a tensor image with mean and standard deviation.
    This transform does not support PIL Image.
    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
    channels, this transform will normalize each channel of the input
    ``torch.*Tensor`` i.e.,
    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``

    .. note::
        This transform acts out of place, i.e., it does not mutate the input tensor.

    Args:
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
        inplace(bool,optional): Bool to make this operation in-place.

    Fc                    s    t    || _|| _|| _d S rU   )r   r   rw   rx   inplace)r   rw   rx   r  r   r2   r3   r   Z  s   

zNormalize.__init__rv   rI   c                 C   s   t || j| j| jS )z
        Args:
            tensor (Tensor): Tensor image to be normalized.

        Returns:
            Tensor: Normalized Tensor image.
        )r   	normalizerw   rx   r  )r   rv   r2   r2   r3   r   a  s   zNormalize.forwardc                 C   s   | j j d| j d| j dS )Nz(mean=z, std=))r   r<   rw   rx   r  r2   r2   r3   __repr__k  s   zNormalize.__repr__r  )
r<   r=   r>   r   r   r   r   rY   r  r   r2   r2   r   r3   r  H  s
    
r  c                       s   e Zd Z								ddedeeeef ef d	ed
edededee	e
  dee	e
  f fddZedejfddZedd Zdd Zdd Zdd Z  ZS )r  r  r  patchr   NrD   r  r  select_featurer  select_layersr  
pixel_mean	pixel_stdc	                    sx   t    || _|| _|| _|| _||||d}
|
|	 | |
\| _| _	|d ur5|d ur5t
||d}nd }|| _d S )N)r  r  r  r  )rw   rx   )r   r   r  r  r  r  r  build_vision_towervision_towerforward_kwargsr  
image_norm)r   r  r  r  r  r  r  r  r  rS   vision_tower_paramsr  r   r2   r3   r   p  s$   


zCLIPVisionTower.__init__rI   c                 C      t | j jS rU   )nextr  
parametersr   r  r2   r2   r3   r        zCLIPVisionTower.devicec                 C   r  rU   )r  r  r  rl   r  r2   r2   r3   rl     r  zCLIPVisionTower.dtypec                 C   sv   | j drd| _tdi |}t }||fS | j dr$t }||fS ddlm} |jdi |}tdd}||fS )	Nsiglipsamesamr   )CLIPVisionModelT)output_hidden_statesr2   )r  
startswithr  r  r  transformersr  from_pretrained)r   r  r  r  r  r2   r2   r3   r    s   
z"CLIPVisionTower.build_vision_towerc                 C   st   t |tjr	|}n|j| j }| jdkr |d d dd f }|S | jdkr)|}|S | jdkr2|}|S td| j )Nr  r/   	cls_patchr  zUnexpected select feature: )rV   rm   r   hidden_statesr  r  r  )r   image_forward_outsimage_featuresr2   r2   r3   feature_select  s   


zCLIPVisionTower.feature_selectc                 C   s6   | j dur
|  |}| j|fi | j}| |}|S )z

        Args:
            images (torch.Tensor): [b, 3, H, W]

        Returns:
            image_features (torch.Tensor): [b, n_patch, d]
        N)r  r  r  r  )r   imagesr  r  r2   r2   r3   r     s
   



zCLIPVisionTower.forward)r  r  r  r   NrD   NN)r<   r=   r>   rY   r   r   r?   listr
   rC   rB   r   propertyrm   r   rl   r  r  r   r   r2   r2   r   r3   r  o  sB    

	%
r  c                       s@   e Zd Z fddZdeeejejf ejf fddZ  Z	S )r  c                    sj  t    || _|d dkrt }n|d dkr$t|d |d }n|d dkr]|dd}t|d |d g}td|D ]}|t	  |t|d |d  q@tj
| }nS|d d	kr|dd}t|d |d d
 | _t|d |d d
 | _g }td|D ]}|t	  |t|d |d  qtj
| }n	td|d  || _d S )Nprojector_typeidentitylinear	input_dimr  mlp_geluri  r/   low_high_hybrid_split_mlp_gelur0   zUnknown projector type: )r   r   cfgr   r   r   r  r   r   r  r  high_up_projlow_up_projr  r  )r   r  modules	mlp_depth_r   r2   r3   r     s0   


zMlpProjector.__init__
x_or_tuplec                 C   sH   t |tr|\}}| |}| |}tj||gdd}n|}| |S )ao  

        Args:
            x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:  if it is a tuple of torch.Tensor,
                then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x);
                otherwise it is the feature from the single vision encoder.

        Returns:
            x (torch.Tensor): [b, s, c]
        r   r?  )rV   rZ   r  r  rm   rF  r  )r   r  high_xlow_xr\   r2   r2   r3   r     s   



zMlpProjector.forward)
r<   r=   r>   r   r   r   rm   r   r   r   r2   r2   r   r3   r    s
    #r  c                	       sJ   e Zd Z		ddedededdf fdd	Zd
ejdejfddZ	  Z
S )r0  h㈵>Fr"  r)  r  rI   Nc                    s*   t    || _t|t| | _d S rU   )r   r   r  r   r  rm   onesgamma)r   r"  r)  r  r   r2   r3   r     s   
zLayerScale.__init__r\   c                 C   s   | j r	|| jS || j S rU   )r  rs   r  r  r2   r2   r3   r     s   zLayerScale.forward)r  F)r<   r=   r>   r?   rB   rA   r   rm   r   r   r   r2   r2   r   r3   r0    s    
r0  scaled_dot_product_attentionTIMM_FUSED_ATTNexperimentalc                 C   s    t rtrdS | rtdkS tdkS )NFr/   r   )_HAS_FUSED_ATTN_EXPORTABLE_USE_FUSED_ATTN)r  r2   r2   r3   use_fused_attn.  s
   r#  c                       s   e Zd ZU dZejje ed< 												
		d de	de	de	de	de
e	 dededede	de	dedede
ej def fddZdd Zdd Z  ZS )!r  z!Attention pooling w/ latent query
fused_attnNr%   r!  TFr/   rD   rb  r*   r  r  r   r#  	feat_sizer$  r%  r&  
latent_lenrX  rV  	pool_typer   r  c                    s^  t    |p|}|p|}|| dksJ || _|| | _|| _| jd | _|| _t | _|dkrB|d us7J t	
t||| _nd | _|
pH|| _|	| _t	
td| j|| _t	j|||d| _t	j||d |d| _|rt|| jnt	 | _|r|| jnt	 | _t	||| _t	|| _|d ur||nt	 | _t|t|| | _|   d S )Nr   rT  absr/   r   r0   )r   r   r#  head_dimr%  scalepoolr#  r$  r   r  rm   r   rV  rX  r&  rW  r   qkvr   q_normk_normr   r   r'  r   r   r?   r4  rY  )r   r  r  r   r#  r%  r$  r%  r&  r&  rX  rV  r'  r   r  r   r2   r3   r   <  s6   


zAttentionPoolLatent.__init__c                 C   rR  rS  )rV  r   r   rW  rX  r  r2   r2   r3   rY  p  rZ  z AttentionPoolLatent.init_weightsc                 C   sn  |j \}}}| jd ur|| jd|j }| j|dd}| ||| j	| j
| jdd}| |||d| j
| jddddd}|d\}}	| || |}}| jrct|||	}n|| j }||dd }
|
jdd}
|
|	 }|dd|| j	|}| |}| |}|| | | }| jd	kr|d d df }d S | jd
kr|d}d S d S )Nr   r   r/   r0   r   r1   r   r?  rb  rg  )r   rV  rD  rp   rl   rW  rE  r,  r   r&  r#  r)  r   r-  r   unbindr.  r/  r$  r   r  r*  softmaxr   r'  r4  r   r+  rw   )r   r\   r   Nr   q_latentr,  r-  kvr/  r2   r2   r3   r   u  s:   





zAttentionPoolLatent.forward)NNr%   Nr!  TFr/   NrD   rb  Nr*   )r<   r=   r>   r   rm   r   r   rA   r@   r?   r
   rB   rY   r   r8  r   rY  r   r   r2   r2   r   r3   r  7  s^   
 	
4r  c                       s6   e Zd Z								d fd	d
	Zdd Z  ZS )Encoderr      r.   r0   groupr*   Tr9   c	              
      st  t    t|| _|| _tj||dddd| _dt| }	t	 | _
t| jD ]W}
t }t	 }t	 }||	|
  }|||
  }t| jD ]}|t||||d |}|
| jd kre|t|| qG||_||_|
| jd kryt|||_| j
| q(t	 | _| jt||||d | jt||d | jt||||d t||| _tj||dddd| _d S )Nr   r/   r   r   paddingr  r-  	norm_typer<  )r   r   r   num_resolutionsnum_res_blocksr   r   conv_inrZ   
ModuleListconv_blocksr   r8  r   ResnetBlock	AttnBlockresr/  
Downsample
downsamplemidr  norm_outconv_out)r   r   chch_multr?  r<  r-  resamp_with_convr:   
in_ch_multi_level
conv_block	res_block
attn_blockblock_in	block_outr  r   r2   r3   r     sN   




zEncoder.__init__c                 C   s   |  |}t| jD ]-\}}t| jD ]}|j| |}t|jdkr*|j| |}q|| jd kr7|	|}q
| j
D ]}||}q;| |}t|}| |}|S )Nr   r/   )r@  r  rB  r   r?  rE  r   r/  r>  rG  rH  rI  nonlinearityrJ  )r   r\   hrO  blocki_block	mid_blockr2   r2   r3   r     s    





zEncoder.forward)r   r7  r.   r0   r8  r*   Tr9   r  r2   r2   r   r3   r6    s    :r6  c                       sB   e Zd Z								d fd	d
	Zedd Zdd Z  ZS )Decoderr9   r7  r.   r0   r8  r*   Tr   c	              
      sp  t    t|| _|| _||| jd   }	tj||	dddd| _t | _	| j	
t|	|	||d | j	
t|	|d | j	
t|	|	||d t | _tt| jD ]P}
t }t }t }|||
  }t| jd D ]}|
t|	|||d |}	|
| jd kr|
t|	| qo||_||_|
dkrt|	||_| j
| qTt|	|| _tj|	|dddd| _d S )Nr/   r   r9  r;  r=  r   )r   r   r   r>  r?  r   r   r@  rA  rH  r   rC  rD  rB  reversedr   r8  rE  r/  Upsampleupsampler  rI  rJ  )r   r:   rK  rL  r?  r<  r-  rM  r   rS  rO  rP  rQ  rR  rT  r  r   r2   r3   r     sP   





zDecoder.__init__c                 C   s   | j jS rU   )rJ  r   r  r2   r2   r3   
last_layer/  r  zDecoder.last_layerc                 C   s   |  |}| jD ]}||}qt| jD ]/\}}t| jd D ]}|j| |}t|jdkr6|j| |}q|| j	d krC|
|}q| |}t|}| |}|S )Nr/   r   )r@  rH  r  rB  r   r?  rE  r   r/  r>  r]  rI  rU  rJ  )r   zrV  rY  rO  rW  rX  r2   r2   r3   r   3  s    





zDecoder.forward)r9   r7  r.   r0   r8  r*   Tr   )r<   r=   r>   r   r
  r^  r   r   r2   r2   r   r3   rZ    s    =
rZ  c                       s.   e Zd Z fddZdd Zd	ddZ  ZS )
VectorQuantizerc                    s   t    || _|| _|| _|| _|| _|| _t	| j| j| _
| j
jjd| j d| j  | jr@tj| j
jjddd| j
j_| jrNttd| _d S d S )Ng      ra   r0   r   r   r"  i   )r   r   n_ee_dimbetar+   l2_norm
show_usager   	Embedding	embeddingr   datark   r   r  r  rm   r   codebook_used)r   rb  rc  rd  r+   re  rf  r   r2   r3   r   L  s    
zVectorQuantizer.__init__c              
   C   sN  t d| }|d| j}| jr-tj|ddd}tj|ddd}tj| jj	ddd}n| jj	}t j
|d dddt j
|d dd dt d	|t d
|  }t j|dd}|| |j}d }d }d }	d }
d }| jrt ||  d }	| jt | | d  }
| jt|  }|||   }t d|}||	|
|f|||ffS )Nzb c h w -> b h w cr   r0   ra  r/   T)r"  keepdimr?  z	bd,dn->bnz
n d -> d nzb h w c -> b c h w)rm   einsum
contiguousr  rc  re  r   r  rh  r   sumargminr   r  rw   detachrd  r+   compute_entropy_loss)r   r_  z_flattenedrh  dmin_encoding_indicesz_q
perplexitymin_encodingsvq_losscommit_lossentropy_lossr2   r2   r3   r   _  sD   	zVectorQuantizer.forwardNTc                 C   s~   | j rtj| jjddd}n| jj}|| }|d ur=|r8||d |d |d |d }|dddd }|S ||}|S )Nr0   r   ra  r   r   r/   )	re  r   r  rh  r   r   r   rm  r  )r   indicesr   channel_firstrh  ru  r2   r2   r3   get_codebook_entry  s    
z"VectorQuantizer.get_codebook_entryNT)r<   r=   r>   r   r   r}  r   r2   r2   r   r3   r`  K  s    0r`  c                       s.   e Zd Z				d	 fdd	Zdd Z  ZS )
rC  NFr*   r8  c                    s   t    || _|d u r|n|}|| _|| _t||| _tj||dddd| _	t||| _
t|| _tj||dddd| _| j| jkra| jrTtj||dddd| _d S tj||dddd| _d S d S )Nr   r/   r9  r   )r   r   r   r   use_conv_shortcutr  r.  r   r   conv1r3  r   r-  conv2conv_shortcutnin_shortcut)r   r   r   r  r-  r<  r   r2   r3   r     s.   




zResnetBlock.__init__c                 C   s|   |}|  |}t|}| |}| |}t|}| |}| |}| j| jkr:| jr5| 	|}|| S | 
|}|| S rU   )r.  rU  r  r3  r-  r  r   r   r  r  r  )r   r\   rV  r2   r2   r3   r     s   






zResnetBlock.forward)NFr*   r8  r  r2   r2   r   r3   rC    s    "rC  c                       s&   e Zd Zd fdd	Zdd Z  ZS )rD  r8  c                    sr   t    t||| _tj||dddd| _tj||dddd| _tj||dddd| _tj||dddd| _	d S )Nr/   r   r9  )
r   r   r  r   r   r   r,  r4  r5  proj_out)r   r   r<  r   r2   r3   r     s   

zAttnBlock.__init__c                 C   s   |}|  |}| |}| |}| |}|j\}}}}	|||||	 }|ddd}|||||	 }t||}
|
t	|d  }
t
j|
dd}
|||||	 }|
ddd}
t||
}|||||	}| |}|| S )Nr   r0   r/   rT  r?  )r   r,  r4  r5  r   r   r   rm   bmmr?   r   r1  r  )r   r\   h_r,  r4  r5  rz   crV  ww_r2   r2   r3   r     s$   




zAttnBlock.forwardr8  r  r2   r2   r   r3   rD    s    
rD  c                 C   s   | t |  S rU   )rm   sigmoidr[   r2   r2   r3   rU    s   rU  r8  c                 C   s<   |dv sJ |dkrt jd| dddS |dkrt | S d S )N)r8  batchr8      r  T)
num_groupsnum_channelsr  affiner  )r   	GroupNormSyncBatchNorm)r   r<  r2   r2   r3   r    s   
c                       r  )r\  c                    s4   t    || _| jrtj||dddd| _d S d S )Nr   r/   r9  r   r   	with_convr   r   convr   r   r  r   r2   r3   r     s   

zUpsample.__init__c                 C   sR   |j tjkrtj|tjdddtj}ntj|ddd}| jr'| 	|}|S )Nrb   nearest)scale_factorr   )
rl   rm   rq   r   r   rp   rB   ro   r  r  r  r2   r2   r3   r     s   
zUpsample.forwardr  r2   r2   r   r3   r\    s    r\  c                       r  )rF  c                    s4   t    || _| jrtj||dddd| _d S d S )Nr   r0   r   r9  r  r  r   r2   r3   r   "  s   

zDownsample.__init__c                 C   s>   | j rd}tj||ddd}| |}|S tj|ddd}|S )N)r   r/   r   r/   constantr   )r   valuer0   )r   r   )r  r   r   r  
avg_pool2d)r   r\   r   r2   r2   r3   r   +  s   
zDownsample.forwardr  r2   r2   r   r3   rF  !  s    	rF  r1  {Gz?c                 C   s   |  d| jd }|| }tj|dd}tj|d dd}|dkr$|}ntd|tj|dd}t	|t
|d   }ttj	|| dd }	|	| }
|
S )Nr   r?  r  r1  zEntropy loss {} not supportedr   )r   r   r   r1  log_softmaxr  formatrm   rw   rn  log)affinity	loss_typetemperatureflat_affinityprobs	log_probstarget_probs	avg_probsavg_entropysample_entropylossr2   r2   r3   rq  5  s   rq  c                       sD   e Zd Zdef fddZdd Zdd Zdd
dZdd Z  Z	S )rR   configc                    s   t    || _t|j|j|jd| _t|j	|j|jd| _
t|j|j|j|j|j|j| _t|j|jd| _t|j|jd| _d S )N)rL  r:   r-  r/   )r   r   r  r6  r7   r:   r;   encoderrZ  r8   decoderr`  r$   r&   r)   r+   r'   r(   quantizer   r   
quant_convpost_quant_conv)r   r  r   r2   r3   r   F  s0   


zVQModel.__init__c                 C   s.   |  |}| |}| |\}}}|||fS rU   )r  r  r  )r   r\   rV  quantemb_lossr   r2   r2   r3   encodea  s   


zVQModel.encodec                 C   s   |  |}| |}|S rU   )r  r  )r   r  decr2   r2   r3   decodeg  s   

zVQModel.decodeNTc                 C   s   | j |||}| |}|S rU   )r  r}  r  )r   code_br   r|  quant_br  r2   r2   r3   decode_codel  s   
zVQModel.decode_codec                 C   s"   |  |\}}}| |}||fS rU   )r  r  )r   inputr  diffr  r  r2   r2   r3   r   q  s   
zVQModel.forwardr~  )
r<   r=   r>   r#   r   r  r  r  r   r   r2   r2   r   r3   rR   E  s    
rR   c                   @   s   e Zd ZeZdZg ZdZdS )MultiModalityPreTrainedModelmulti_modalitypast_key_valuesN)r<   r=   r>   MultiModalityConfigconfig_classbase_model_prefix_no_split_modules_skip_keys_device_placementr2   r2   r2   r3   r  w  r   r  c                       s   e Zd Z	ddedee f fddZdee de	j
fdd	Zdejfd
dZe	 	dde	jde	j
dedede	j
f
ddZde	jfddZdee defddZdeeee	j
f  fddZ  ZS )MultiModalityCausalLMNr  quant_configc                    s   t  | |j}t|j}|di |j| _|j}t|j}||j| _|j	}t|j}| | _
|j}	t|	j}
|
|	j| _|j}t|j}||j| _tj|jd |jd | _|j}t||d| _t|| _d S )Nr  r  )r  r2   )r   r   vision_configr  r  r  vision_modelaligner_configalignergen_vision_configgen_vision_modelgen_aligner_configgen_alignergen_head_configgen_headrm   r   rg  	gen_embedlanguage_configr!   language_modelr   logits_processor)r   r  r  r  
vision_clsr  aligner_clsr  gen_vision_clsr  gen_aligner_clsr  gen_head_clsr  r   r2   r3   r     s2   




zMultiModalityCausalLM.__init__itemsrI   c                 C   sn   t jdd |D dd}|jdd \}}|j| jj| jjd}t|d}| | |}t|d||d	}|S )
Nc                 S   s   g | ]}|j qS r2   )feature)r   r  r2   r2   r3   r     s    z;MultiModalityCausalLM.get_image_feature.<locals>.<listcomp>r   r?  r0   )r   rl   zb n c h w -> (b n) c h wz(b n) t d -> b (n t) d)rz   r^   )	rm   concatr   rp   r  r   rl   r   r  )r   r  pixel_valuesbsr^   r  images_embedsr2   r2   r3   get_image_feature  s   
z'MultiModalityCausalLM.get_image_featurec                 C   s
   | j  S rU   )r  get_input_embeddingsr  r2   r2   r3   r    s   
z*MultiModalityCausalLM.get_input_embeddingsF	input_ids	positionsforward_batchget_embeddingc                 C   s   t ||| | j|d}|S )N)r  r  multimodal_modelr  r  )r   r  )r   r  r  r  r  r  r2   r2   r3   r     s   zMultiModalityCausalLM.forward	image_idsc                 C   s   |  | |S rU   )r  r  )r   r  r2   r2   r3   prepare_gen_img_embeds  s   z,MultiModalityCausalLM.prepare_gen_img_embedsimage_inputsc                 C   s*   |j }|j}||fg}t|}|||S rU   )im_start_id	im_end_idr   pad_input_tokens)r   r  r  r  r  media_token_pairshelperr2   r2   r3   pad_input_ids  s
   
z#MultiModalityCausalLM.pad_input_idsweightsc                 C   s  g d}t |  }|D ]|\}}d|v sd|v rqd|v s!d|v r"q|dr,||vr,qd|v r1q|dd	}d
|v rA|dd}|D ]+\}}}||vrMqC|||}|dr]||vr]qC|| }	t|	dd }
|
|	||  n|dry||vryq|| }	t|	dt}
|
|	| qd S )N))	.qkv_projz.q_projr,  )r  z.k_projr4  )r  z.v_projr5  )gate_up_proj	gate_projr   )r  up_projr/   zrotary_emb.inv_freq~	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towergenzself_attn.out_projzself_attn.projzvision_model.vision_towerzattn.qkvzattn.qkv_projz.biasweight_loader)r  named_parametersr   replaceendswithgetattrr    )r   r  stacked_params_mappingparams_dictrK   loaded_weight
param_nameweight_nameshard_idparamr  r2   r2   r3   load_weights  s>   	
z"MultiModalityCausalLM.load_weightsrU   r  )r<   r=   r>   r  r
   r   r   rC   r   rm   r   r  r   rg  r  r   
LongTensorr   rA   r   r  r?   r   r  r   r   rY   r  r   r2   r2   r   r3   r    s2    &$	r  )r  model_class)rD   TF)r*   ra   r   rb   )r   TF)r*   FT)Nr/   r   TFr  )r  r  r   rD   r  r  )r1  r  )urW   rc   osdataclassesr   enumr   	functoolsr   	itertoolsr   typingr   r   r   r	   r
   r   r   r   r   r   rm   torch.nn.functionalr   
functionalr   einopsr   r   r   torch.nn.initr   r  r   r   sglang.srt.configs.janus_pro"sglang.srt.layers.attention.visionr   "sglang.srt.layers.logits_processorr   sglang.srt.layers.quantizationr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr    sglang.srt.models.llamar!   sglang.utilsr"   	dataclassr#   r8  rA   rO   rT   r  collections.abcr`   r~   rB   r   r   rY   r   r   rC   r?   r   r   r   r  r  r   r  r9  rQ  rY  r_  r`  r  r  r  r  r  r  r  r0  r^  r   environr"  r!  r#  r  r6  rZ  r`  rC  rD  rU  r\  rF  rq  rR   r  r  registerr  
EntryClassr2   r2   r2   r3   <module>   st  0	
	2

N ,
1>

0
  )#
,'e>	iQZV5&


2	 
