import math
from dataclasses import dataclass
from functools import cached_property

import torch
import torch.nn as nn
import torch.nn.functional as F

from ... import initialization as init
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPooling, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging, torch_compilable_check
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from ..chameleon.modeling_chameleon import ChameleonPreTrainedModel, ChameleonVQVAEEncoderConvDownsample
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaModel,
    TransformersKwargs,
)
from ..siglip.modeling_siglip import SiglipAttention
from .configuration_emu3 import Emu3Config, Emu3TextConfig, Emu3VQVAEConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring
class Emu3VQVAEModelOutput(BaseModelOutputWithPooling):
    r"""
    image_tokens (`torch.LongTensor` of shape `(batch_size, config.vocab_size)`):
        Indices of the image tokens predicted by the VQ-VAE model.
    """

    image_tokens: torch.LongTensor | None = None


class Emu3Attention(LlamaAttention):
    pass

class Emu3DecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: Emu3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.dropout = nn.Dropout(config.attention_dropout)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = False,
        cache_position: torch.LongTensor | None = None,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + self.dropout(hidden_states)

        # Fully connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.dropout(hidden_states)

        return hidden_states

class Emu3VQVAEVectorQuantizer(nn.Module):
    """
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to the one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    """

    def __init__(self, config: Emu3VQVAEConfig):
        super().__init__()
        self.embedding = nn.Embedding(config.codebook_size, config.embed_dim)
        self.embedding.weight.data.uniform_(-1.0 / config.codebook_size, 1.0 / config.codebook_size)

    def forward(self, hidden_state: torch.Tensor):
        batch_size, temporal, channels, height, width = hidden_state.shape
        hidden_state = hidden_state.permute(0, 1, 3, 4, 2).contiguous()
        hidden_state_flattened = hidden_state.view(-1, channels)

        # distances from the flattened states to the codebook entries:
        # (z - e)^2 = z^2 + e^2 - 2 * <z, e>
        hidden_state_sum = torch.sum(hidden_state_flattened**2, dim=1, keepdim=True)
        embedding_sum = torch.sum(self.embedding.weight**2, dim=1)

        distances = 2 * torch.matmul(hidden_state_flattened, self.embedding.weight.transpose(0, 1))
        distances = hidden_state_sum + embedding_sum - distances

        min_encoding_indices = torch.argmin(distances, dim=1)
        min_encoding_indices = min_encoding_indices.view(batch_size, temporal, height, width)
        return min_encoding_indices

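# NOTE: the helper below is an illustrative sketch added for exposition and is not part of the
# original module. It demonstrates, on standalone tensors, the nearest-codebook trick used in
# `Emu3VQVAEVectorQuantizer.forward`: ||z - e||^2 is expanded into ||z||^2 + ||e||^2 - 2<z, e>,
# so the search needs one matmul instead of materializing pairwise differences.
# `_demo_nearest_codebook_indices` is a hypothetical name, not a library API.
def _demo_nearest_codebook_indices(flat_states: torch.Tensor, codebook: torch.Tensor) -> torch.Tensor:
    # flat_states: (num_vectors, embed_dim), codebook: (codebook_size, embed_dim)
    distances = (
        torch.sum(flat_states**2, dim=1, keepdim=True)  # ||z||^2, shape (num_vectors, 1)
        + torch.sum(codebook**2, dim=1)  # ||e||^2, broadcast over shape (codebook_size,)
        - 2 * torch.matmul(flat_states, codebook.transpose(0, 1))  # 2 <z, e>
    )
    # index of the closest codebook entry for every input vector
    return torch.argmin(distances, dim=1)
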
class Emu3VQVAEEncoderConvDownsample(ChameleonVQVAEEncoderConvDownsample):
    pass


class Emu3VQVAEEncoderConvUpsample(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, hidden_states):
        hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
        hidden_states = self.conv(hidden_states)
        return hidden_states

class Emu3VQVAEConv3d(nn.Module):
    def __init__(
        self,
        in_channel: int,
        out_channel: int,
        kernel_size: tuple[int],
        stride: tuple[int],
    ):
        super().__init__()

        padding_sizes = [one_kernel - one_stride for one_kernel, one_stride in zip(kernel_size[1:], stride[1:])]
        self.padding = ()
        for pad_size in padding_sizes[::-1]:
            self.padding += (pad_size // 2 + pad_size % 2, pad_size // 2)
        self.padding += (2, 0)

        self.conv = nn.Conv3d(
            in_channel,
            out_channel,
            kernel_size,
            stride=stride,
        )

    def forward(self, hidden_states: torch.Tensor):
        hidden_states = F.pad(hidden_states, self.padding)
        hidden_states = self.conv(hidden_states)
        return hidden_states

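# NOTE: illustrative sketch, not part of the original module. `Emu3VQVAEConv3d` pads manually
# with `F.pad` because it needs asymmetric spatial padding plus a causal, left-only temporal pad,
# which `nn.Conv3d`'s symmetric `padding=` argument cannot express. The hypothetical helper below
# reproduces the padding tuple, ordered (W_left, W_right, H_left, H_right, T_left, T_right) as
# `F.pad` expects for (N, C, T, H, W) inputs. The module itself hardcodes the temporal pad `(2, 0)`,
# which equals `kernel - stride` for both kernel/stride pairs used in this file: (3,3,3)/(1,1,1)
# and (4,3,3)/(2,1,1).
def _demo_conv3d_padding(kernel_size: tuple[int, int, int], stride: tuple[int, int, int]) -> tuple[int, ...]:
    padding_sizes = [k - s for k, s in zip(kernel_size[1:], stride[1:])]
    padding: tuple[int, ...] = ()
    for pad_size in padding_sizes[::-1]:  # F.pad lists dimensions from last to first
        padding += (pad_size // 2 + pad_size % 2, pad_size // 2)
    padding += (kernel_size[0] - stride[0], 0)  # temporal axis: pad only on the left (causal)
    return padding
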
class Emu3VQVAESpatialNorm(nn.Module):
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.norm_layer = nn.GroupNorm(
            num_channels=out_channels,
            num_groups=32,
            eps=1e-6,
            affine=True,
        )
        self.conv_y = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.conv_b = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_states: torch.Tensor, quant_states: torch.Tensor):
        quant_states = F.interpolate(quant_states, size=hidden_states.shape[-2:], mode="nearest")
        hidden_states = self.norm_layer(hidden_states)
        hidden_states = hidden_states * self.conv_y(quant_states) + self.conv_b(quant_states)
        return hidden_states


class Emu3VQVAETemporalUpsample(nn.Module):
    def __init__(self, in_channel: int, out_channel: int):
        super().__init__()
        self.conv = Emu3VQVAEConv3d(
            in_channel,
            out_channel,
            kernel_size=(3, 3, 3),
            stride=(1, 1, 1),
        )

    def forward(self, hidden_states: torch.Tensor):
        batch_size, channels, temporal, height, width = hidden_states.shape
        hidden_states = hidden_states.permute(0, 1, 3, 4, 2).contiguous().view(batch_size, -1, temporal)
        hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
        hidden_states = (
            hidden_states.view(batch_size, channels, height, width, -1).permute(0, 1, 4, 2, 3).contiguous()
        )
        hidden_states = self.conv(hidden_states)
        return hidden_states


class Emu3VQVAETemporalDownsample(nn.Module):
    def __init__(self, in_channel: int, out_channel: int):
        super().__init__()
        self.conv = Emu3VQVAEConv3d(
            in_channel,
            out_channel,
            kernel_size=(4, 3, 3),
            stride=(2, 1, 1),
        )

    def forward(self, hidden_states: torch.Tensor):
        hidden_states = self.conv(hidden_states)
        return hidden_states

class Emu3VQVAETemporalResnetBlock(nn.Module):
    def __init__(self, in_channels, out_channels=None):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels

        self.norm1 = nn.BatchNorm3d(in_channels)
        self.conv1 = Emu3VQVAEConv3d(
            in_channels,
            out_channels,
            kernel_size=(3, 3, 3),
            stride=(1, 1, 1),
        )
        self.norm2 = nn.BatchNorm3d(out_channels)
        self.conv2 = Emu3VQVAEConv3d(
            out_channels,
            out_channels,
            kernel_size=(3, 3, 3),
            stride=(1, 1, 1),
        )
        if self.in_channels != self.out_channels:
            self.nin_shortcut = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_states):
        residual = hidden_states
        hidden_states = self.norm1(hidden_states)
        hidden_states *= torch.sigmoid(hidden_states)  # swish activation
        hidden_states = self.conv1(hidden_states)

        hidden_states = self.norm2(hidden_states)
        hidden_states *= torch.sigmoid(hidden_states)
        hidden_states = self.conv2(hidden_states)

        if self.in_channels != self.out_channels:
            residual = self.nin_shortcut(residual)

        return residual + hidden_states

dededB dedB f fddZddejdejdB fdd	Z  ZS )Emu3VQVAEResnetBlockNrz   r   quant_channelsc                    s   t    || _|d u r|n|}|| _|| _|d u r/tj|dddd| _tj|dddd| _nt	||| _t	||| _tj
||dddd| _tj
||dddd| _| j| jkrdtj
||dddd| _d S d S )	Nr   r   Tr   r   r   rt   r   )r2   r3   rz   r   r   r4   r   r   r   r   rx   r   r   r   )r8   rz   r   r   r9   r(   r)   r3   1  sB   
zEmu3VQVAEResnetBlock.__init__r;   c                 C   s   | j d u rdn|f}|}| j|g|R  }|t|9 }| |}| j|g|R  }|t|9 }| |}| j| jkrA| 	|}|| S Nr(   )
r   r   r%   r   r   r   r   rz   r   r   )r8   r;   r   	norm_argsrH   r(   r(   r)   rJ   ]  s   


zEmu3VQVAEResnetBlock.forward)NNr1   r   r(   r(   r9   r)   r   0  s    $,r   c                       "   e Zd Zdef fddZ  ZS )Emu3VQVAEAttentionBlockr/   c                    s   t  | d| _d S )Nr   )r2   r3   num_key_value_groupsrY   r9   r(   r)   r3   p  s   
z Emu3VQVAEAttentionBlock.__init__)r!   r"   r#   r   r3   rO   r(   r(   r9   r)   r   o      r   c                       s*   e Zd ZdZ fddZdddZ  ZS )Emu3VQVAEGroupNormz
class Emu3VQVAEGroupNorm(nn.GroupNorm):
    """
    Same as the torch GroupNorm with the only difference that this one accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, input, quant_states=None):
        return F.group_norm(input, self.num_groups, self.weight, self.bias, self.eps)

class Emu3VQVAEMiddleBlock(nn.Module):
    def __init__(self, config, in_channels, quant_channels=None):
        super().__init__()

        self.block_1 = Emu3VQVAEResnetBlock(
            in_channels=in_channels,
            out_channels=in_channels,
            quant_channels=quant_channels,
        )
        self.attn_1 = Emu3VQVAEAttentionBlock(config)
        if quant_channels is None:
            self.attn_norm = Emu3VQVAEGroupNorm(num_channels=in_channels, num_groups=32, eps=1e-6, affine=True)
        else:
            self.attn_norm = Emu3VQVAESpatialNorm(quant_channels, in_channels)
        self.block_2 = Emu3VQVAEResnetBlock(
            in_channels=in_channels,
            out_channels=in_channels,
            quant_channels=quant_channels,
        )

    def forward(self, hidden_states: torch.FloatTensor, quant_states: torch.FloatTensor | None = None):
        hidden_states = self.block_1(hidden_states, quant_states)
        residual = hidden_states
        hidden_states = self.attn_norm(hidden_states, quant_states)

        batch_size, channels, height, width = hidden_states.shape
        hidden_states = hidden_states.view(batch_size, channels, height * width).transpose(1, 2)
        hidden_states = self.attn_1(hidden_states)[0]
        hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
        hidden_states = residual + hidden_states

        hidden_states = self.block_2(hidden_states, quant_states)
        return hidden_states

class Emu3VQVAEDownBlock(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.num_resolutions = len(config.channel_multiplier)
        self.num_res_blocks = config.num_res_blocks
        base_channels = config.base_channels
        channel_multiplier = config.channel_multiplier

        in_channel_multiplier = (1,) + tuple(channel_multiplier)
        self.in_channel_multiplier = in_channel_multiplier
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            attn_norms = nn.ModuleList()
            block_in = base_channels * in_channel_multiplier[i_level]
            block_out = base_channels * channel_multiplier[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(
                    Emu3VQVAEResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                    )
                )
                block_in = block_out
                if config.attn_resolutions is not None and i_level in config.attn_resolutions:
                    attn.append(Emu3VQVAEAttentionBlock(config))
                    attn_norms.append(nn.GroupNorm(num_channels=block_in, num_groups=32, eps=1e-6, affine=True))

            down = nn.Module()
            down.block = block
            down.attn = attn
            down.attn_norms = attn_norms
            if i_level != self.num_resolutions - 1:
                down.downsample = Emu3VQVAEEncoderConvDownsample(block_in)
            self.down.append(down)

    def forward(self, hidden_states: torch.FloatTensor):
        for i_level, blocks in enumerate(self.down):
            for i_block in range(self.num_res_blocks):
                hidden_states = blocks.block[i_block](hidden_states)
                if len(blocks.attn) > 0:
                    residual = hidden_states
                    hidden_states = blocks.attn_norms[i_block](hidden_states)

                    batch_size, channels, height, width = hidden_states.shape
                    hidden_states = hidden_states.view(batch_size, channels, height * width).transpose(1, 2)
                    hidden_states = blocks.attn[i_block](hidden_states)[0]
                    hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
                    hidden_states = residual + hidden_states
            if i_level != self.num_resolutions - 1:
                hidden_states = blocks.downsample(hidden_states)

        return hidden_states

tt| jD ]]}t	 }t	 }t	 }|j|j|  }t| jd D ]"}	|t|||d |}||jv re|t| |t|| qCt }
||
_||
_||
_|dkr|t||
_| j
d|
 q&d S )Nr\   r   r   r   )r2   r3   r   r   r   r   rT   r   r4   r   upreversedr   r   r   r   r   r   r   r   r   r   rs   upsampleinsert)r8   r/   r   r   r   r   r   r   r   r   r   r9   r(   r)   r3     s@   



zEmu3VQVAEUpBlock.__init__r;   r   c                 C   s   t | jd d d D ]d\}}t| jd D ]J}|j| ||}t|jdkr_|}|j| ||}|j\}}}	}
|	|||	|
 
dd}|j| |d }|||	|
|dddd}|| }q|t| jd krn||}q
|S )Nr\   r   r   r   r   )r   r   r   r   r   r   r   r   r`   rc   rf   r   ra   r   )r8   r;   r   r   r   r   rH   rh   rj   rk   rl   r(   r(   r)   rJ     s    
zEmu3VQVAEUpBlock.forwardr   r(   r(   r9   r)   r     s    %r   c                       r   )Emu3VQVAEEncoderc                    s  t    |j}|j}|j}|j}|j}|rd| n|}||d  }tjj	||dddd| _
t|| _t||| _tjjd|ddd	| _tjj	||dddd| _tt|j}	t | _t | _t|	D ]}
t||}| j| qft|jD ]}t||d
}| j| qyd S )Nr   r\   r   r   rt   r   r   T)r   r   r   r   r   )r2   r3   r   rz   double_latentlatent_channelsr   r%   r4   rx   conv_inr   
down_blockr   middle_blockr   norm_outconv_outrK   mathlog2temporal_downsample_factorr   	time_convtime_res_stackr   r   r   r   r   )r8   r/   r   rz   r   r   r   r   r   temporal_down_blocksiry   rI   time_res_convr9   r(   r)   r3     s@   




zEmu3VQVAEEncoder.__init__pixel_valuesc                 C   s   |j d }|jdg|j dd  R  }| |}| |}| |}| |}|t|9 }| |}|jd|g|j dd  R  }|	ddddd}| j
D ]}||}|t|9 }qN| jD ]}||}q_|	ddddd}|S )Nr   r\   r   r   r   r[   )r`   r   r   r   r   r   r%   r   r   ra   r   r   )r8   r   temporal_dimr;   ry   layerr(   r(   r)   rJ   D  s"   








zEmu3VQVAEEncoder.forward)r!   r"   r#   r3   r%   r&   rJ   rO   r(   r(   r9   r)   r     s    'r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Emu3VQVAEDecoderr/   c           	         s  t    |j}|j|jd  }t | _t|j	D ]}t
class Emu3VQVAEDecoder(nn.Module):
    def __init__(self, config: Emu3VQVAEConfig):
        super().__init__()

        quant_channels = config.embed_dim
        block_in = config.base_channels * config.channel_multiplier[-1]

        self.time_res_stack = nn.ModuleList()
        for _ in range(config.num_res_blocks):
            time_res_conv = Emu3VQVAETemporalResnetBlock(
                in_channels=config.latent_channels,
                out_channels=config.latent_channels,
            )
            self.time_res_stack.append(time_res_conv)

        temp_upsample_block_num = int(math.log2(config.temporal_downsample_factor))
        self.time_conv = nn.ModuleList()
        for i in range(temp_upsample_block_num):
            conv = Emu3VQVAETemporalUpsample(config.latent_channels, config.latent_channels)
            self.time_conv.append(conv)

        self.conv_in = nn.Conv2d(config.latent_channels, block_in, kernel_size=3, stride=1, padding=1)

        self.middle_block = Emu3VQVAEMiddleBlock(config, block_in, quant_channels=quant_channels)
        self.up_block = Emu3VQVAEUpBlock(config)

        block_in = config.base_channels * config.channel_multiplier[0]
        self.norm_out = Emu3VQVAESpatialNorm(quant_channels, block_in)
        self.conv_out = nn.Conv2d(block_in, config.out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, hidden_states: torch.Tensor, quant_states: torch.Tensor):
        hidden_quant_states = torch.cat((hidden_states, quant_states), dim=0)
        hidden_quant_states = hidden_quant_states.permute(0, 2, 1, 3, 4)

        # temporal convs
        for layer in self.time_res_stack:
            hidden_quant_states = layer(hidden_quant_states)

        for layer in self.time_conv:
            hidden_quant_states = layer(hidden_quant_states)
            hidden_quant_states *= torch.sigmoid(hidden_quant_states)

        hidden_quant_states = hidden_quant_states.permute(0, 2, 1, 3, 4)
        hidden_states, quant_states = torch.chunk(hidden_quant_states, 2, dim=0)
        hidden_states = hidden_states.reshape(-1, *hidden_states.shape[2:])
        quant_states = quant_states.reshape(-1, *quant_states.shape[2:])

        hidden_states = self.conv_in(hidden_states)

        # middle & upsampling
        hidden_states = self.middle_block(hidden_states, quant_states)
        hidden_states = self.up_block(hidden_states, quant_states)

        hidden_states = self.norm_out(hidden_states, quant_states)
        hidden_states *= torch.sigmoid(hidden_states)
        hidden_states = self.conv_out(hidden_states)

        return hidden_states

@auto_docstring(
    custom_intro="""
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    """
)
class Emu3VQVAE(PreTrainedModel):
    config: Emu3VQVAEConfig
    base_model_prefix = "emuvideovq"
    main_input_name = "pixel_values"
    input_modalities = ("image",)
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _no_split_modules = [
        "Emu3VQVAETemporalResnetBlock",
        "Emu3VQVAEAttentionBlock",
        "Emu3VQVAEResnetBlock",
        "Emu3VQVAEVectorQuantizer",
    ]
    _can_record_outputs = {
        "hidden_states": Emu3VQVAEResnetBlock,
        "attentions": Emu3VQVAEAttentionBlock,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        if isinstance(module, (nn.Conv2d, nn.Conv3d)):
            init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
                bound = 1 / math.sqrt(fan_in)
                init.uniform_(module.bias, -bound, bound)
        elif isinstance(module, nn.Linear):
            init.kaiming_uniform_(module.weight, a=math.sqrt(5))
            if module.bias is not None:
                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0.0
                init.uniform_(module.bias, -bound, bound)
        elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
            init.constant_(module.weight, 1.0)
            init.constant_(module.bias, 0.0)
            if getattr(module, "running_mean", None) is not None:
                init.zeros_(module.running_mean)
                init.ones_(module.running_var)
                init.zeros_(module.num_batches_tracked)
        elif isinstance(module, nn.Embedding):
            init.normal_(module.weight)
            if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                init.zeros_(module.weight[module.padding_idx])

    def __init__(self, config: Emu3VQVAEConfig):
        super().__init__(config)

        self.config = config

        self.encoder = Emu3VQVAEEncoder(config)
        self.decoder = Emu3VQVAEDecoder(config)
        self.quantize = Emu3VQVAEVectorQuantizer(config)
        self.vision_spatial_factor = 2 ** (len(config.channel_multiplier) - 1)

        self.quant_conv = Emu3VQVAEConv3d(
            config.latent_channels, config.embed_dim, kernel_size=(3, 1, 1), stride=(1, 1, 1)
        )
        self.post_quant_conv = Emu3VQVAEConv3d(
            config.embed_dim, config.latent_channels, kernel_size=(3, 1, 1), stride=(1, 1, 1)
        )
        self.spatial_scale_factor = 2 ** (len(config.channel_multiplier) - 1)

        self.eval()  # Emu3's VQ model is frozen

        self.post_init()

    @capture_outputs
    @can_return_tuple
    def encode(
        self, pixel_values: torch.Tensor, image_sizes: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
    ) -> Emu3VQVAEModelOutput:
        is_image = pixel_values.ndim == 4
        if is_image:
            temporal = self.config.temporal_downsample_factor
            batch_size, channels, height, width = pixel_values.shape
            pixel_values = pixel_values.unsqueeze(1).repeat(1, temporal, 1, 1, 1)
        else:
            batch_size, temporal, channels, height, width = pixel_values.shape

        hidden_states = self.encoder(pixel_values)

        # b t c h w -> b c t h w
        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
        hidden_states = self.quant_conv(hidden_states)

        # b c t h w -> b t c h w
        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
        codes = self.quantize(hidden_states)

        image_tokens = codes.squeeze(1) if is_image else codes
        image_tokens = [
            single_image[: int(size[0] / self.vision_spatial_factor), : int(size[1] / self.vision_spatial_factor)]
            for single_image, size in zip(image_tokens, image_sizes)
        ]

        return Emu3VQVAEModelOutput(last_hidden_state=hidden_states, image_tokens=image_tokens)

    def decode(self, hidden_states: torch.Tensor):
        is_image = hidden_states.ndim == 3
        if is_image:
            hidden_states = hidden_states.unsqueeze(1)

        batch_size, temporal, height, width = hidden_states.shape
        quant = self.quantize.embedding(hidden_states.flatten())

        channels = quant.shape[-1]
        quant = quant.view(batch_size, temporal, height, width, channels).permute(0, 4, 1, 2, 3).contiguous()
        post_quant = self.post_quant_conv(quant)

        quant = quant.permute(0, 2, 1, 3, 4)
        post_quant = post_quant.permute(0, 2, 1, 3, 4)

        video = self.decoder(post_quant, quant)
        video = video.reshape(
            batch_size,
            temporal * self.config.temporal_downsample_factor,
            self.config.out_channels,
            height * self.spatial_scale_factor,
            width * self.spatial_scale_factor,
        )
        return video[:, 0] if is_image else video

class Emu3ImageVocabularyMapping:
    """
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    """

    def __init__(self, vocab_map):
        self.vocab_map = vocab_map
        self.eol_token_id = vocab_map.get("<|extra_200|>")
        self.image_token_id = vocab_map.get("<image>")

    @cached_property
    def image_tokens(self):
        return sorted([val for name, val in self.vocab_map.items() if name.startswith("<|visual token")])

    @cached_property
    def image_tokens_str(self):
        return sorted([name for name, val in self.vocab_map.items() if name.startswith("<|visual token")])

    @cached_property
    def img2bpe(self):
        return {int(token[-8:-2]): self.vocab_map[token] for token in self.image_tokens_str}

    @cached_property
    def bpe2img(self):
        return {v: k for k, v in self.img2bpe.items()}

    @cached_property
    def bpe2img_mapping_tensor(self):
        mapping = torch.zeros(max(self.bpe2img.keys()) + 1, dtype=torch.int)
        for k, v in self.bpe2img.items():
            mapping[k] = v
        return mapping

    @cached_property
    def img2bpe_mapping_tensor(self):
        mapping = torch.zeros(max(self.img2bpe.keys()) + 1, dtype=torch.int)
        for k, v in self.img2bpe.items():
            mapping[k] = v
        return mapping

    def convert_img2bpe(self, img_batch: list[torch.Tensor]) -> torch.Tensor:
        device = img_batch.device
        eol_row = torch.ones((img_batch.shape[0], 1), dtype=torch.int) * self.eol_token_id
        img_tokens = self.img2bpe_mapping_tensor[img_batch.to("cpu")]
        img_batch = torch.cat([img_tokens, eol_row], dim=-1)
        return img_batch.to(device)

    def convert_bpe2img(self, img_batch: torch.Tensor) -> torch.Tensor:
        device = img_batch.device
        img_batch = img_batch[..., :-1]  # remove the last column of EOL tokens
        img_tokens = self.bpe2img_mapping_tensor[img_batch.to("cpu")]
        return img_tokens.to(device)

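# NOTE: illustrative sketch, not part of the original module. It runs a round trip through
# `Emu3ImageVocabularyMapping` on a toy vocabulary. The vocab entries below are hypothetical
# stand-ins shaped like the real ones (`<|visual token 000000|>`, `<|extra_200|>`, `<image>`);
# `convert_img2bpe` appends one EOL token per grid row, which `convert_bpe2img` strips again.
def _demo_vocabulary_round_trip() -> bool:
    vocab = {f"<|visual token {i:06d}|>": 100 + i for i in range(4)}
    vocab["<|extra_200|>"] = 50  # end-of-line marker appended after every token row
    vocab["<image>"] = 51  # image placeholder token
    mapping = Emu3ImageVocabularyMapping(vocab)

    grid = torch.tensor([[0, 1], [2, 3]])  # a 2x2 "image" of VQ codebook indices
    bpe = mapping.convert_img2bpe(grid)  # shape (2, 3): each row gains an EOL column
    return mapping.convert_bpe2img(bpe).tolist() == grid.tolist()
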
class Emu3PreTrainedModel(ChameleonPreTrainedModel):
    _no_split_modules = ["Emu3DecoderLayer"]
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Emu3DecoderLayer,
        "attentions": Emu3Attention,
    }

class Emu3TextModel(LlamaModel, Emu3PreTrainedModel):
    def __init__(self, config: Emu3TextConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [Emu3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

zEmu3TextModel.__init__)r!   r"   r#   r   r3   rO   r(   r(   r9   r)   rz  s  r   rz  c                       s2   e Zd ZU eed<  fddZ fddZ  ZS )Emu3ForCausalLMr/   c                    s   t  | t|| _d S r1   )r2   r3   rz  modelrY   r9   r(   r)   r3   ~  s   zEmu3ForCausalLM.__init__c                     s   t    dS )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```N)r2   rJ   )super_kwargsr9   r(   r)   rJ     s   zEmu3ForCausalLM.forward)r!   r"   r#   r   r'   r3   rJ   rO   r(   r(   r9   r)   r~  {  s   
class Emu3Model(Emu3PreTrainedModel):
    _checkpoint_conversion_mapping = {"text_model.model": "text_model"}

    def __init__(self, config: Emu3Config):
        super().__init__(config)
        self.text_model = Emu3TextModel._from_config(config.text_config)
        self.vqmodel = Emu3VQVAE._from_config(config.vq_config)
        self.vocabulary_mapping = Emu3ImageVocabularyMapping(config.vocabulary_map)

        self.post_init()

    def get_input_embeddings(self):
        return self.text_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.text_model.set_input_embeddings(value)

    def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor) -> torch.LongTensor:
        """
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        """
        vqmodel_outputs = self.vqmodel.encode(pixel_values, image_sizes, return_dict=True)
        bpe_tokens_list = [
            self.vocabulary_mapping.convert_img2bpe(tokens).flatten() for tokens in vqmodel_outputs.image_tokens
        ]
        bpe_tokens = torch.cat(bpe_tokens_list)
        return bpe_tokens

    @can_return_tuple
    @auto_docstring(
        custom_intro="Tokenizes images into discrete tokens with VQGAN module and embeds them with text embeddings layer"
    )
    def get_image_features(
        self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs]
    ) -> tuple | Emu3VQVAEModelOutput:
        r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        """
        image_features = self.vqmodel.encode(pixel_values, image_sizes, return_dict=True, **kwargs)
        # each row of the token grid is closed by one EOL token, hence the `+ 1` column
        split_sizes = [
            (height // self.vqmodel.vision_spatial_factor) * (width // self.vqmodel.vision_spatial_factor + 1)
            for height, width in image_sizes
        ]
        bpe_tokens = [
            self.vocabulary_mapping.convert_img2bpe(tokens).flatten() for tokens in image_features.image_tokens
        ]
        bpe_tokens = torch.cat(bpe_tokens)
        image_embeddings = self.get_input_embeddings()(bpe_tokens)
        image_embeddings = torch.split(image_embeddings, split_sizes)
        image_features.pooler_output = image_embeddings
        return image_features

    @torch.no_grad()
    def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int):
        """
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        """
        sequences = image_tokens[:, :-1].view(-1, height, width + 1)
        image_tokens = self.vocabulary_mapping.convert_bpe2img(sequences)
        image = self.vqmodel.decode(image_tokens)
        return image

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.vocabulary_mapping.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.vocabulary_mapping.image_token_id

        n_image_tokens = special_image_mask.sum()
        n_image_features = image_features.shape[0] * image_features.shape[1]
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        torch_compilable_check(
            inputs_embeds[special_image_mask].numel() == image_features.numel(),
            f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
        )
        return special_image_mask

    @merge_with_config_defaults
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        image_sizes: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        r"""
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values, image_sizes).pooler_output
            image_features = torch.cat(image_features, dim=0)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.text_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        return outputs

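# NOTE: illustrative sketch, not part of the original module. It spells out why
# `Emu3Model.get_image_features` computes split sizes as (height // factor) * (width // factor + 1):
# each image becomes a (height // factor) x (width // factor) grid of discrete tokens, and
# `convert_img2bpe` appends one EOL token at the end of every row, hence the extra column.
# `_demo_image_sequence_length` is a hypothetical helper name.
def _demo_image_sequence_length(height: int, width: int, spatial_factor: int) -> int:
    rows = height // spatial_factor
    cols = width // spatial_factor + 1  # + 1 for the EOL token closing each row
    return rows * cols
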
class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin):
    output_modalities = ["image", "text"]
    _tied_weights_keys = {"lm_head.weight": "model.text_model.embed_tokens.weight"}
    _checkpoint_conversion_mapping = {
        "^text_model.model": "model.text_model",
        "^vqmodel": "model.vqmodel",
        "^text_model.lm_head": "lm_head",
    }

    def __init__(self, config):
        super().__init__(config)
        self.model = Emu3Model(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def decode_image_tokens(self, **kwargs):
        return self.model.decode_image_tokens(**kwargs)

    @merge_with_config_defaults
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        image_sizes: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        cache_position: torch.LongTensor | None = None,
        labels: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | CausalLMOutputWithPast:
        r"""
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            image_sizes=image_sizes,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        # only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        pixel_values=None,
        is_first_iteration=False,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            pixel_values=pixel_values,
            use_cache=use_cache,
            is_first_iteration=is_first_iteration,
            **kwargs,
        )

        if not is_first_iteration and use_cache:
            # pixel values are only needed on the first generation step; afterwards the image
            # embeddings are already merged into the cached sequence
            model_inputs["pixel_values"] = None

        return model_inputs


__all__ = [
    "Emu3ForConditionalGeneration",
    "Emu3ForCausalLM",
    "Emu3TextModel",
    "Emu3PreTrainedModel",
    "Emu3VQVAE",
    "Emu3Model",
]