o
    i                     @   s  d dl Z d dlmZ d dlmZmZ d dlZd dlmZ d dl	m  m
Z ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z* e+e,Z-G dd de Z.G dd de!Z/G dd dej0Z1G dd deZ2G dd dej0Z3G dd dej0Z4G dd dej0Z5G d d! d!ej0Z6G d"d# d#ej0Z7G d$d% d%ej0Z8G d&d' d'ej0Z9G d(d) d)e&Z:G d*d+ d+ej;Z<G d,d- d-ej0Z=G d.d/ d/ej0Z>G d0d1 d1ej0Z?G d2d3 d3ej0Z@G d4d5 d5ej0ZAed6d7G d8d9 d9eZBG d:d; d;ZCG d<d= d=eeBZDG d>d? d?e#eDZEG d@dA dAe"eDeZFG dBdC dCeDZGG dDdE dEeDeZHg dFZIdS )G    N)cached_property)OptionalUnion   )Cache)GenerationMixin)CausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging)deprecate_kwarg   )ChameleonPreTrainedModel#ChameleonVQVAEEncoderConvDownsample)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelTransformersKwargs)SiglipAttention   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   @      e Zd ZdS )Emu3AttentionN__name__
__module____qualname__ r"   r"   b/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/emu3/modular_emu3.pyr   ,       r   c                       s   e Zd Zdedef fddZedddd							
				ddejde	ej de	ej
 de	e de	e de	ej
 de	eejejf  dee dejfddZ  ZS )Emu3DecoderLayerconfig	layer_idxc                    s    t  || t|j| _d S N)super__init__nnDropoutattention_dropoutdropout)selfr&   r'   	__class__r"   r#   r*   2   s   zEmu3DecoderLayer.__init__past_key_valuepast_key_valuesz4.58)new_nameversionNFhidden_statesattention_maskposition_ids	use_cachecache_positionposition_embeddingskwargsreturnc              
   K   sj   |}	|  |}| jd|||||||d|\}}
|	| | }|}	| |}| |}|	| | }|S )N)r6   r7   r8   r3   r9   r:   r;   r"   )input_layernorm	self_attnr.   post_attention_layernormmlp)r/   r6   r7   r8   r3   r9   r:   r;   r<   residual_r"   r"   r#   forward6   s&   




zEmu3DecoderLayer.forward)NNNFNN)r   r    r!   r   intr*   r   torchTensorr   
LongTensorr   booltupler
   r   rD   __classcell__r"   r"   r0   r#   r%   1   s8    	
r%   c                       s6   e Zd ZdZdef fddZdejfddZ  Z	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r&   c                    s>   t    t|j|j| _| jjj	d|j d|j  d S )Ng            ?)
r)   r*   r+   	Embeddingcodebook_size	embed_dim	embeddingweightdatauniform_r/   r&   r0   r"   r#   r*   c   s   
"z!Emu3VQVAEVectorQuantizer.__init__hidden_statec                 C   s   |j \}}}}}|ddddd }|d|}tj|d ddd}tj| jjd dd	}	dt|| jj	dd }
||	 |
 }
tj
|
dd	}|||||}|S )
Nr   r   r      r   T)dimkeepdimrY   )shapepermute
contiguousviewrF   sumrQ   rR   matmul	transposeargmin)r/   rV   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicesr"   r"   r#   rD   h   s   z Emu3VQVAEVectorQuantizer.forward)
r   r    r!   __doc__r   r*   rF   rG   rD   rK   r"   r"   r0   r#   rL   X   s    
rL   c                   @   r   )Emu3VQVAEEncoderConvDownsampleNr   r"   r"   r"   r#   ro   z   r$   ro   c                       s$   e Zd Z fddZdd Z  ZS )Emu3VQVAEEncoderConvUpsamplec                    s$   t    tj||dddd| _d S )Nr   r   kernel_sizestridepadding)r)   r*   r+   Conv2dconv)r/   in_channelsr0   r"   r#   r*      s   
z%Emu3VQVAEEncoderConvUpsample.__init__c                 C   s   t j|ddd}| |}|S )N       @nearestscale_factormode)Finterpolaterv   r/   r6   r"   r"   r#   rD      s   
z$Emu3VQVAEEncoderConvUpsample.forwardr   r    r!   r*   rD   rK   r"   r"   r0   r#   rp   ~   s    rp   c                	       sF   e Zd Zdededee dee f fddZdejfdd	Z  Z	S )
Emu3VQVAEConv3d
in_channelout_channelrr   rs   c                    s   t    dd t|dd  |dd  D }d| _|d d d D ]}|  j|d |d  |d f7  _q!|  jd7  _tj||||d| _d S )	Nc                 S   s   g | ]\}}|| qS r"   r"   ).0
one_kernel
one_strider"   r"   r#   
<listcomp>   s    z,Emu3VQVAEConv3d.__init__.<locals>.<listcomp>r   r"   rX   r   )r   r   )rs   )r)   r*   ziprt   r+   Conv3drv   )r/   r   r   rr   rs   padding_sizespad_sizer0   r"   r#   r*      s   
$$zEmu3VQVAEConv3d.__init__r6   c                 C   s   t || j}| |}|S r(   )r}   padrt   rv   r   r"   r"   r#   rD      s   
zEmu3VQVAEConv3d.forward)
r   r    r!   rE   rJ   r*   rF   rG   rD   rK   r"   r"   r0   r#   r      s    r   c                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	Emu3VQVAESpatialNormrw   out_channelsc                    sN   t    tj|dddd| _tj||dddd| _tj||dddd| _d S )N    ư>Tnum_channels
num_groupsepsaffiner   r   rq   )r)   r*   r+   	GroupNorm
norm_layerru   conv_yconv_br/   rw   r   r0   r"   r#   r*      s*   
zEmu3VQVAESpatialNorm.__init__r6   quant_statesc                 C   s@   t j||jdd  dd}| |}|| | | | }|S )Nry   )sizer|   )r}   r~   r\   r   r   r   )r/   r6   r   r"   r"   r#   rD      s   
zEmu3VQVAESpatialNorm.forward	r   r    r!   rE   r*   rF   rG   rD   rK   r"   r"   r0   r#   r      s    r   c                       6   e Zd Zdedef fddZdejfddZ  ZS )Emu3VQVAETemporalUpsampler   r   c                        t    t||ddd| _d S )Nr   r   r   r   r   r   rr   rs   r)   r*   r   rv   r/   r   r   r0   r"   r#   r*         
z"Emu3VQVAETemporalUpsample.__init__r6   c                 C   sr   |j \}}}}}|ddddd |d|}tj|ddd	}|||||dddddd }| |}|S )
Nr   r   r   rW   r   rX   rx   ry   rz   )r\   r]   r^   r_   r}   r~   rv   )r/   r6   rd   rf   re   rg   rh   r"   r"   r#   rD      s    $
z!Emu3VQVAETemporalUpsample.forwardr   r"   r"   r0   r#   r          r   c                       r   )Emu3VQVAETemporalDownsampler   r   c                    r   )N)rW   r   r   )r   r   r   r   r   r   r0   r"   r#   r*      r   z$Emu3VQVAETemporalDownsample.__init__r6   c                 C   s   |  |}|S r(   )rv   r   r"   r"   r#   rD      s   
z#Emu3VQVAETemporalDownsample.forwardr   r"   r"   r0   r#   r      r   r   c                       s(   e Zd Z	d fdd	Zdd Z  ZS )Emu3VQVAETemporalResnetBlockNc                    s   t    || _|d u r|n|| _t|| _t||ddd| _t|| _	t||ddd| _
| j| jkrBtj||dddd| _d S d S )Nr   r   r   r   r   rq   )r)   r*   rw   r   r+   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   r0   r"   r#   r*      s4   
z%Emu3VQVAETemporalResnetBlock.__init__c                 C   sf   |}|  |}|t|9 }| |}| |}|t|9 }| |}| j| jkr/| |}|| S r(   )	r   rF   sigmoidr   r   r   rw   r   r   )r/   r6   rB   r"   r"   r#   rD     s   




z$Emu3VQVAETemporalResnetBlock.forwardr(   r   r"   r"   r0   r#   r      s     r   c                       sT   e Zd Z		d
dedee dee f fddZddejdeej fdd	Z  Z	S )Emu3VQVAEResnetBlockNrw   r   quant_channelsc                    s   t    || _|d u r|n|}|| _|| _|d u r/tj|dddd| _tj|dddd| _nt	||| _t	||| _tj
||dddd| _tj
||dddd| _| j| jkrdtj
||dddd| _d S d S )	Nr   r   Tr   r   r   rq   r   )r)   r*   rw   r   r   r+   r   r   r   r   ru   r   r   r   )r/   rw   r   r   r0   r"   r#   r*   &  sB   
zEmu3VQVAEResnetBlock.__init__r6   c                 C   s   | j d u rdn|f}|}| j|g|R  }|t|9 }| |}| j|g|R  }|t|9 }| |}| j| jkrA| 	|}|| S Nr"   )
r   r   rF   r   r   r   r   rw   r   r   )r/   r6   r   	norm_argsrB   r"   r"   r#   rD   R  s   


zEmu3VQVAEResnetBlock.forward)NNr(   )
r   r    r!   rE   r   r*   rF   rG   rD   rK   r"   r"   r0   r#   r   %  s    $,r   c                       s"   e Zd Zdef fddZ  ZS )Emu3VQVAEAttentionBlockr&   c                    s   t  | d| _d S )Nr   )r)   r*   num_key_value_groupsrU   r0   r"   r#   r*   e  s   
z Emu3VQVAEAttentionBlock.__init__)r   r    r!   r   r*   rK   r"   r"   r0   r#   r   d  s    r   c                       s*   e Zd ZdZ fddZdddZ  ZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                    s   t  jdi | d S r   )r)   r*   r/   r<   r0   r"   r#   r*   s  s   zEmu3VQVAEGroupNorm.__init__Nc                 C   s   t || j| j| j| jS r(   )r}   
group_normr   rR   biasr   )r/   inputr   r"   r"   r#   rD   v  s   zEmu3VQVAEGroupNorm.forwardr(   )r   r    r!   rn   r*   rD   rK   r"   r"   r0   r#   r   l  s    r   c                       s:   e Zd Zd fdd	Zddejdeej fddZ  ZS )	Emu3VQVAEMiddleBlockNc                    s`   t    t|||d| _t|| _|d u r t|dddd| _nt||| _t|||d| _	d S )Nrw   r   r   r   r   Tr   )
r)   r*   r   block_1r   attn_1r   	attn_normr   block_2)r/   r&   rw   r   r0   r"   r#   r*   {  s   

zEmu3VQVAEMiddleBlock.__init__r6   r   c                 C   s   |  ||}|}| ||}|j\}}}}||||| dd}| |d }|||||dddd}|| }| ||}|S )Nr   r   r   r   )	r   r   r\   r_   rb   r   reshaper]   r   )r/   r6   r   rB   rd   rf   rg   rh   r"   r"   r#   rD     s   zEmu3VQVAEMiddleBlock.forwardr(   )	r   r    r!   r*   rF   FloatTensorr   rD   rK   r"   r"   r0   r#   r   z  s    $r   c                       ,   e Zd Z fddZdejfddZ  ZS )Emu3VQVAEDownBlockc              
      s(  t    t|j| _|j| _|j}|j}dt| }|| _t	
 | _t| jD ]i}t	
 }t	
 }t	
 }|||  }	|||  }
t| jD ]*}|t|	|
d |
}	|jd urq||jv rq|t| |t	j|	dddd qGt	 }||_||_||_|| jd krt|	|_| j| q(d S )Nr   rw   r   r   r   Tr   r   )r)   r*   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrJ   in_channel_multiplierr+   
ModuleListdownrangeappendr   attn_resolutionsr   r   Moduleblockattn
attn_normsro   
downsample)r/   r&   r   r   r   i_levelr   r   r   block_in	block_outi_blockr   r0   r"   r#   r*     sD   


zEmu3VQVAEDownBlock.__init__r6   c           
      C   s   t | jD ]^\}}t| jD ]H}|j| |}t|jdkrV|}|j| |}|j\}}}}	|	||||	 
dd}|j| |d }||||	|dddd}|| }q|| jd krc||}q|S )Nr   r   r   r   )	enumerater   r   r   r   r   r   r   r\   r_   rb   r   r]   r   r   )
r/   r6   r   blocksr   rB   rd   rf   rg   rh   r"   r"   r#   rD     s    
zEmu3VQVAEDownBlock.forwardr   r    r!   r*   rF   r   rD   rK   r"   r"   r0   r#   r     s    %r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Emu3VQVAEUpBlockc              	      s  t    t|j| _|j| _|j}|j|jd  }t	 | _
tt| jD ]]}t	 }t	 }t	 }|j|j|  }t| jd D ]"}	|t|||d |}||jv re|t| |t|| qCt }
||
_||
_||
_|dkr|t||
_| j
d|
 q&d S )NrX   r   r   r   )r)   r*   r   r   r   r   rP   r   r+   r   upreversedr   r   r   r   r   r   r   r   r   r   rp   upsampleinsert)r/   r&   r   r   r   r   r   r   r   r   r   r0   r"   r#   r*     s@   



zEmu3VQVAEUpBlock.__init__r6   r   c                 C   s   t | jd d d D ]d\}}t| jd D ]J}|j| ||}t|jdkr_|}|j| ||}|j\}}}	}
|	|||	|
 
dd}|j| |d }|||	|
|dddd}|| }q|t| jd krn||}q
|S )NrX   r   r   r   r   )r   r   r   r   r   r   r   r   r\   r_   rb   r   r]   r   )r/   r6   r   r   r   r   rB   rd   rf   rg   rh   r"   r"   r#   rD     s    
zEmu3VQVAEUpBlock.forwardr   r"   r"   r0   r#   r     s    %r   c                       r   )Emu3VQVAEEncoderc                    s  t    |j}|j}|j}|j}|j}|rd| n|}||d  }tjj	||dddd| _
t|| _t||| _tjjd|ddd	| _tjj	||dddd| _tt|j}	t | _t | _t|	D ]}
t||}| j| qft|jD ]}t||d
}| j| qyd S )Nr   rX   r   r   rq   r   r   T)r   r   r   r   r   )r)   r*   r   rw   double_latentlatent_channelsr   rF   r+   ru   conv_inr   
down_blockr   middle_blockr   norm_outconv_outrE   mathlog2temporal_downsample_factorr   	time_convtime_res_stackr   r   r   r   r   )r/   r&   r   rw   r   r   r   r   r   temporal_down_blocksirv   rC   time_res_convr0   r"   r#   r*     s@   




zEmu3VQVAEEncoder.__init__pixel_valuesc                 C   s   |j d }|jdg|j dd  R  }| |}| |}| |}| |}|t|9 }| |}|jd|g|j dd  R  }|	ddddd}| j
D ]}||}|t|9 }qN| jD ]}||}q_|	ddddd}|S )Nr   rX   r   r   r   rW   )r\   r   r   r   r   r   rF   r   r   r]   r   r   )r/   r   temporal_dimr6   rv   layerr"   r"   r#   rD   9  s"   








zEmu3VQVAEEncoder.forward)r   r    r!   r*   rF   rH   rD   rK   r"   r"   r0   r#   r     s    'r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Emu3VQVAEDecoderr&   c           	         s  t    |j}|j|jd  }t | _t|j	D ]}t
|j|jd}| j| qtt|j}t | _t|D ]}t|j|j}| j| q<tj|j|dddd| _t|||d| _t|| _|j|jd  }t||| _tj||jdddd| _d S )NrX   r   r   r   rq   )r   r   )r)   r*   rP   r   r   r+   r   r   r   r   r   r   r   rE   r   r   r   r   r   ru   r   r   r   r   up_blockr   r   r   r   )	r/   r&   r   r   rC   r   temp_upsample_block_numr   rv   r0   r"   r#   r*   X  s@   



zEmu3VQVAEDecoder.__init__r6   r   c                 C   s  t j||fdd}|ddddd}| jD ]}||}q| jD ]}||}|t |9 }q|ddddd}t j|ddd\}}|jdg|jdd  R  }|jdg|jdd  R  }| 	|}| 
||}| ||}| ||}|t |9 }| |}|S )Nr   r[   r   r   r   rW   rX   )rF   catr]   r   r   r   chunkr   r\   r   r   r   r   r   )r/   r6   r   hidden_quant_statesr   r"   r"   r#   rD     s$   




zEmu3VQVAEDecoder.forward)	r   r    r!   r   r*   rF   rG   rD   rK   r"   r"   r0   r#   r   W  s    'r   aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    )custom_introc                       sz   e Zd ZU eed< dZdZdZdZdZ	dZ
g dZdd Zdef fdd	Zdejd
ejfddZdejfddZ  ZS )	Emu3VQVAEr&   
emuvideovqr   T)r   r   r   rL   c                 C   s\  t |tjtjfr6tjj|jddd |jd ur4tj|j\}}dt	
| }tj|j| | d S d S t |tjrqtjj|jt	
dd |jd urotj|j\}}|dkradt	
| nd}tj|j| | d S d S t |tjtjtjfrtj|jd tj|jd	 d S t |tjr|jj  |jd ur|jj|j   d S d S d S )
Nfan_outrelu)r|   nonlinearityr      )ar   rM   g        )
isinstancer+   ru   r   initkaiming_normal_rR   r   _calculate_fan_in_and_fan_outr   sqrtrT   Linearkaiming_uniform_BatchNorm2dr   r   	constant_rN   rS   normal_padding_idxzero_)r/   modulefan_inrC   boundr"   r"   r#   _init_weights  s.   


zEmu3VQVAE._init_weightsc                    s   t  | || _t|| _t|| _t|| _dt	|j
d  | _t|j|jddd| _t|j|jddd| _dt	|j
d  | _|   |   d S )Nr   r   )r   r   r   r   r   )r)   r*   r&   r   encoderr   decoderrL   quantizer   r   vision_spatial_factorr   r   rP   
quant_convpost_quant_convspatial_scale_factoreval	post_initrU   r0   r"   r#   r*     s   


zEmu3VQVAE.__init__image_sizesc                    s   |j dk}|r jj}|j\}}}}|dd|ddd}n|j\}}}}} |}	|	ddddd}	 |	}	|	ddddd}	 	|	}
|rO|

dn|
} fddt||D }|S )NrW   r   r   r   r   c                    s@   g | ]\}}|d t |d  j d t |d  j f qS )Nr   r   )rE   r  )r   single_imager   r/   r"   r#   r     s    .z$Emu3VQVAE.encode.<locals>.<listcomp>)ndimr&   r   r\   	unsqueezerepeatr  r]   r  r  squeezer   )r/   r   r  is_imagere   rd   rf   rg   rh   r6   codesimage_tokensr"   r!  r#   encode  s    




zEmu3VQVAE.encoder6   c                 C   s   |j dk}|r|d}|j\}}}}| j| }|jd }||||||ddddd }| 	|}	|ddddd}|	ddddd}	| 
|	|}
|
||| jj | jj|| j || j }
|rn|
d d df S |
S )Nr   r   rX   r   rW   r   )r"  r#  r\   r  rQ   flattenr_   r]   r^   r  r  r   r&   r   r   r  )r/   r6   r&  rd   re   rg   rh   quantrf   
post_quantvideor"   r"   r#   decode  s&   


$

zEmu3VQVAE.decode)r   r    r!   r   __annotations__base_model_prefixmain_input_name_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr  r*   rF   rG   r)  r.  rK   r"   r"   r0   r#   r     s   
 	r   c                   @   s   e Zd ZdZdd Zedd Zedd Zedd	 Zed
d Z	edd Z
edd Zdeej dejfddZdejdejfddZdS )Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 C   s"   || _ |d| _|d| _d S )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r/   r8  r"   r"   r#   r*     s   z#Emu3ImageVocabularyMapping.__init__c                 C      t dd | j D S )Nc                 S   s   g | ]\}}| d r|qS z<|visual token
startswithr   namevalr"   r"   r#   r          z;Emu3ImageVocabularyMapping.image_tokens.<locals>.<listcomp>sortedr8  itemsr!  r"   r"   r#   r(       z'Emu3ImageVocabularyMapping.image_tokensc                 C   r<  )Nc                 S   s   g | ]\}}| d r|qS r=  r>  r@  r"   r"   r#   r   $  rC  z?Emu3ImageVocabularyMapping.image_tokens_str.<locals>.<listcomp>rD  r!  r"   r"   r#   image_tokens_str"  rG  z+Emu3ImageVocabularyMapping.image_tokens_strc                    s    fdd j D S )Nc                    s$   i | ]}t |d d  j| qS )ir   )rE   r8  )r   tokenr!  r"   r#   
<dictcomp>(  s   $ z6Emu3ImageVocabularyMapping.img2bpe.<locals>.<dictcomp>)rH  r!  r"   r!  r#   img2bpe&     z"Emu3ImageVocabularyMapping.img2bpec                 C   s   dd | j  D S )Nc                 S   s   i | ]\}}||qS r"   r"   )r   kvr"   r"   r#   rJ  ,      z6Emu3ImageVocabularyMapping.bpe2img.<locals>.<dictcomp>)rK  rF  r!  r"   r"   r#   bpe2img*  rL  z"Emu3ImageVocabularyMapping.bpe2imgc                 C   >   t jt| j d t jd}| j D ]\}}|||< q|S Nr   dtype)rF   zerosmaxrP  keysrE   rF  r/   mappingrM  rN  r"   r"   r#   bpe2img_mapping_tensor.     
z1Emu3ImageVocabularyMapping.bpe2img_mapping_tensorc                 C   rQ  rR  )rF   rU  rV  rK  rW  rE   rF  rX  r"   r"   r#   img2bpe_mapping_tensor5  r[  z1Emu3ImageVocabularyMapping.img2bpe_mapping_tensor	img_batchr=   c                 C   sR   |j }tj|jd dftjd| j }| j|d }tj||gdd}||S )Nr   r   rS  cpurX   r[   )	devicerF   onesr\   rE   r:  r\  tor   )r/   r]  r_  eol_row
img_tokensr"   r"   r#   convert_img2bpe<  s
    
z*Emu3ImageVocabularyMapping.convert_img2bpec                 C   s0   |j }|dd df }| j|d }||S )N.rX   r^  )r_  rZ  ra  )r/   r]  r_  rc  r"   r"   r#   convert_bpe2imgC  s   
z*Emu3ImageVocabularyMapping.convert_bpe2imgN)r   r    r!   rn   r*   r   r(  rH  rK  rP  rZ  r\  listrF   rG   rd  re  r"   r"   r"   r#   r7    s"    





r7  c                   @   s   e Zd ZdgZdZdZdS )Emu3PreTrainedModelr%   TN)r   r    r!   r6  r4  r5  r"   r"   r"   r#   rg  J  s
    rg  c                       s,   e Zd ZeedZdef fddZ  ZS )Emu3TextModel)r6   
attentionsr&   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |qS r"   )r%   )r   r'   r&   r"   r#   r   [  rO  z*Emu3TextModel.__init__.<locals>.<listcomp>)r)   r*   r+   r   r   num_hidden_layerslayersrU   r0   rj  r#   r*   X  s   
zEmu3TextModel.__init__)	r   r    r!   r%   r   _can_record_outputsr   r*   rK   r"   r"   r0   r#   rh  R  s
    rh  c                       s2   e Zd ZU eed<  fddZ fddZ  ZS )Emu3ForCausalLMr&   c                    s   t  | t|| _d S r(   )r)   r*   rh  modelrU   r0   r"   r#   r*   b  s   zEmu3ForCausalLM.__init__c                     s   t    dS )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```N)r)   rD   )super_kwargsr0   r"   r#   rD   f  s   zEmu3ForCausalLM.forward)r   r    r!   r   r/  r*   rD   rK   r"   r"   r0   r#   rn  _  s   
 rn  c                       s:  e Zd ZddiZ fddZdd Zdd Zd	d
 Zdd Zde	j
de	jfddZde	j
de	jfddZe	jde	jdedefddZde	jde	j
de	j
fddZee									d'dee	j dee	j
 dee	j dee	j dee	j d ee dee	j
 d!ee d"ee	j d#ee d$eeef fd%d&Z  ZS )(	Emu3Modelztext_model.model
text_modelc                    s>   t  | t|j| _t|j| _t	|j
| _|   d S r(   )r)   r*   rh  _from_configtext_configrr  r   	vq_configvqmodelr7  vocabulary_mapvocabulary_mappingr  rU   r0   r"   r#   r*   ~  s
   zEmu3Model.__init__c                 C   
   | j  S r(   )rr  get_input_embeddingsr!  r"   r"   r#   rz       
zEmu3Model.get_input_embeddingsc                 C      | j | d S r(   )rr  set_input_embeddingsr/   valuer"   r"   r#   r}       zEmu3Model.set_input_embeddingsc                 C   s
   || _ d S r(   rr  r/   r  r"   r"   r#   set_decoder  r{  zEmu3Model.set_decoderc                 C      | j S r(   r  r!  r"   r"   r#   get_decoder     zEmu3Model.get_decoderr   r  c                    s.    j ||} fdd|D }t|}|S )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        c                    s   g | ]
} j | qS r"   )rx  rd  r*  )r   tokensr!  r"   r#   r     s    z.Emu3Model.get_image_tokens.<locals>.<listcomp>)rv  r)  rF   r   )r/   r   r  image_tokens_listbpe_tokens_list
bpe_tokensr"   r!  r#   get_image_tokens  s   
zEmu3Model.get_image_tokensc                    s:     ||} fdd|D }  |}t||}|S )a7  
        Tokenizes images into discrete tokens with VQGAN module and embeds
        them with text embeddings layer

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
                The tensors corresponding to the input images.
        c                    s,   g | ]\}}| j j | j j d   qS r   )rv  r  )r   rg   rh   r!  r"   r#   r     s    z0Emu3Model.get_image_features.<locals>.<listcomp>)r  rz  rF   split)r/   r   r  r(  split_sizesimage_featuresr"   r!  r#   get_image_features  s   	
zEmu3Model.get_image_featuresr(  rg   rh   c                 C   s>   |ddddf  d||d }| j|}| j|}|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        NrX   r   )r_   rx  re  rv  r.  )r/   r(  rg   rh   	sequencesimager"   r"   r#   decode_image_tokens  s   "zEmu3Model.decode_image_tokens	input_idsinputs_embedsr  c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}|jd |jd  }||  | krPtd| d| |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)rT  r_  rX   r   r   z6Image features and image tokens do not match: tokens: z, features )rz  rF   tensorrx  r;  longr_  allr`   r#  	expand_asra  r\   numel
ValueError)r/   r  r  r  special_image_maskn_image_tokensn_image_featuresr"   r"   r#   get_placeholder_mask  s   zEmu3Model.get_placeholder_maskNr7   r8   r3   r9   r:   r<   r=   c
              	   K   s   |du |duA rt d|du r|  |}|dur5| ||}tj|dd}| j|||d}|||}| jd||||||	d|
}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   r[   )r  r  )r7   r8   r3   r  r9   r:   r"   )r  rz  r  rF   r   r  masked_scatterrr  )r/   r  r   r  r7   r8   r3   r  r9   r:   r<   image_embedsr  outputsr"   r"   r#   rD     s0   
zEmu3Model.forward)	NNNNNNNNN)r   r    r!   _checkpoint_conversion_mappingr*   rz  r}  r  r  rF   r   rH   r  r  no_gradrE   r  r  r   r   r   rG   r   rI   r
   r   r   rJ   r   rD   rK   r"   r"   r0   r#   rq  {  sh    	
	

rq  c                       sV  e Zd ZdZdgZddddZ fddZd	d
 Zdd Zde	j
fddZdd Zdd Zedd Zedd Zedd Zdd Zee											d/deej deej d eej d!eej d"eej d#ee d$eej d%ee d&eej d'eej d(eeejf d)ee dee e!f fd*d+Z"						,	d0 fd-d.	Z#  Z$S )1Emu3ForConditionalGeneration zlm_head.weightzmodel.text_modelzmodel.vqmodellm_head)z^text_model.modelz^vqmodelz^text_model.lm_headc                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NF)r   )r)   r*   rq  ro  r+   r  rt  hidden_size
vocab_sizer  r  rU   r0   r"   r#   r*     s   
z%Emu3ForConditionalGeneration.__init__c                 C   ry  r(   )ro  rz  r!  r"   r"   r#   rz  $  r{  z1Emu3ForConditionalGeneration.get_input_embeddingsc                 C   r|  r(   )ro  r}  r~  r"   r"   r#   r}  '  r  z1Emu3ForConditionalGeneration.set_input_embeddingsr=   c                 C   r  r(   )r  r!  r"   r"   r#   get_output_embeddings*  r  z2Emu3ForConditionalGeneration.get_output_embeddingsc                 C   r|  r(   )ro  r  r  r"   r"   r#   r  -  r  z(Emu3ForConditionalGeneration.set_decoderc                 C   ry  r(   )ro  r  r!  r"   r"   r#   r  0  r{  z(Emu3ForConditionalGeneration.get_decoderc                 C      | j jS r(   )ro  rr  r!  r"   r"   r#   rr  4     z'Emu3ForConditionalGeneration.text_modelc                 C   r  r(   )ro  rv  r!  r"   r"   r#   rv  8  r  z$Emu3ForConditionalGeneration.vqmodelc                 C   r  r(   )ro  rx  r!  r"   r"   r#   rx  <  r  z/Emu3ForConditionalGeneration.vocabulary_mappingc                 K   s   | j jdi |S r   )ro  r  r   r"   r"   r#   r  @  s   z0Emu3ForConditionalGeneration.decode_image_tokensNr   r  r   r  r7   r8   r3   r  r9   r:   labelslogits_to_keepr<   c              
   K   s   | j d|||||||	d|}|d }t|trt| dn|}| |dd|ddf }d}|
durD| jd||
| jjjd|}t	|||j
|j|jdS )an  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import requests
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```)r  r7   r8   r3   r  r9   r:   r   N)logitsr  r  )lossr  r3   r6   ri  r"   )ro  r  rE   slicer  loss_functionr&   rt  r  r   r3   r6   ri  )r/   r  r   r  r7   r8   r3   r  r9   r:   r  r  r<   r  r6   slice_indicesr  r  r"   r"   r#   rD   C  s8   >z$Emu3ForConditionalGeneration.forwardTc	              
      s<   t  j|f|||||||d|	}
|d dkrd |
d< |
S )N)r3   r7   r  r:   r8   r   r9   r   r   )r)   prepare_inputs_for_generation)r/   r  r3   r7   r  r:   r8   r9   r   r<   model_inputsr0   r"   r#   r    s    	z:Emu3ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNTN)%r   r    r!   r0  _tied_weights_keysr  r*   rz  r}  r+   r   r  r  r  propertyrr  rv  rx  r  r   r   r   rF   rH   r   rG   r   rI   r   rE   r
   r   rJ   r   rD   r  rK   r"   r"   r0   r#   r    s    


	

]r  )r  rn  rh  rg  r   rq  )Jr   	functoolsr   typingr   r   rF   torch.nnr+   torch.nn.functional
functionalr}   cache_utilsr   
generationr   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   r   r   utils.deprecationr   chameleon.modeling_chameleonr   r   llama.modeling_llamar   r   r   r   r   siglip.modeling_siglipr   configuration_emu3r   r   r   
get_loggerr   loggerr   r%   r   rL   ro   rp   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r7  rg  rh  rn  rq  r  __all__r"   r"   r"   r#   <module>   sb   
'"$1?";:FFo6  ,