o
    eil                    @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	Z	d dl
mZ d dlm  mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9m:Z: ee/G dd de#Z;dd Z<eddjddZ=de	j>de?d e	j>fd!d"Z@	#dkd$ejAd%e	j>d&e	j>d'e	j>d(e	j>dB d)eBd*eBd+e,e. fd,d-ZCee=G d.d/ d/ejAZDed0G d1d2 d2ejAZEG d3d4 d4ejAZFG d5d6 d6e ZGG d7d8 d8ejAZHG d9d: d:ejAZIG d;d< d<ejAZJG d=d> d>ejAZKG d?d@ d@ejAZLG dAdB dBejAZMG dCdD dDejAZNG dEdF dFejAZOG dGdH dHejAZPG dIdJ dJejAZQG dKdL dLejRZSG dMdN dNejAZTG dOdP dPejAZUG dQdR dRejAZVG dSdT dTejAZWG dUdV dVejAZXe/dWdXG dYdZ dZe*ZYG d[d\ d\ZZe/G d]d^ d^e*Z[G d_d` d`ejAZ\e/G dadb dbe[Z]e/G dcdd dde[eZ^G dedf dfe[Z_G dgdh dhe[eZ`g diZadS )l    N)Callable)	dataclass)cached_property)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   @   s$   e Zd ZU dZdZejdB ed< dS )Emu3VQVAEModelOutputz
    image_tokens (`torch.LongTensor` of shape `(batch_size, config.vocab_size`):
        Indices of the image tokens predicted by the VQ-VAE model.
    Nimage_tokens)__name__
__module____qualname____doc__r%   torch
LongTensor__annotations__ r-   r-   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/emu3/modeling_emu3.pyr$   1   s   
 r$   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..N   dim)shaper*   cat)xx1x2r-   r-   r.   rotate_half<   s   r8   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer8   )qkcossinunsqueeze_dimq_embedk_embedr-   r-   r.   apply_rotary_pos_embC   s
   

rB   hidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)r3   expandreshape)rC   rD   batchnum_key_value_headsslenhead_dimr-   r-   r.   	repeat_kv]   s
   0rL           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr0   r   r/   )r2   dtype)ptrainingr    )rL   num_key_value_groupsr*   matmul	transposenn
functionalsoftmaxfloat32torV   rT   rX   
contiguous)rN   rO   rP   rQ   rR   rS   rT   rU   
key_statesvalue_statesattn_weightsattn_outputr-   r-   r.   eager_attention_forwardi   s   
rf   c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejf fddZ  ZS )Emu3Attention=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )NrK         Tbias)super__init__ri   rj   getattrhidden_sizenum_attention_headsrK   rI   rY   rS   attention_dropout	is_causalr\   Linearattention_biasq_projk_projv_projo_projselfri   rj   	__class__r-   r.   ro      s(   
zEmu3Attention.__init__NrC   position_embeddingsrR   past_key_valuescache_positionrU   rE   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jd|\}}|jg |dR   }| |}||fS )Nr/   r    r0   )r>   r=   r   rM   )rT   rS   )r3   rK   rw   viewr[   rx   ry   rB   updaterj   r   get_interfaceri   _attn_implementationrf   rX   rs   rS   rG   ra   rz   )r|   rC   r   rR   r   r   rU   input_shapehidden_shapequery_statesrb   rc   r=   r>   cache_kwargsattention_interfacere   rd   r-   r-   r.   forward   s8   	

zEmu3Attention.forward)NNNN)r&   r'   r(   r)   r!   intro   r*   Tensortupler	   r+   r   r   r   __classcell__r-   r-   r}   r.   rg      s,    rg   RMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )Emu3RMSNormư>epsrE   Nc                    s&   t    tt|| _|| _dS )z:
        Emu3RMSNorm is equivalent to T5LayerNorm
        N)rn   ro   r\   	Parameterr*   onesweightvariance_epsilon)r|   rq   r   r}   r-   r.   ro      s   

zEmu3RMSNorm.__init__rC   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr0   r/   T)keepdim)	rV   r`   r*   r_   powmeanrsqrtr   r   )r|   rC   input_dtypevariancer-   r-   r.   r      s
   zEmu3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r   r3   r   r|   r-   r-   r.   
extra_repr   s   zEmu3RMSNorm.extra_repr)r   )
r&   r'   r(   floatro   r*   r   r   r   r   r-   r-   r}   r.   r      s    r   c                       $   e Zd Z fddZdd Z  ZS )Emu3MLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )Nrl   )rn   ro   ri   rq   intermediate_sizer\   ru   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr|   ri   r}   r-   r.   ro      s   
zEmu3MLP.__init__c                 C   s$   |  | | || | }|S N)r   r   r   r   )r|   r5   r   r-   r-   r.   r      s    zEmu3MLP.forwardr&   r'   r(   ro   r   r   r-   r-   r}   r.   r      s    
r   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfddZ  ZS )Emu3DecoderLayerri   rj   c                    s`   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
t|j| _d S )N)ri   rj   r   )rn   ro   rq   rg   	self_attnr   mlpr   rms_norm_epsinput_layernormpost_attention_layernormr\   Dropoutrs   rT   r{   r}   r-   r.   ro      s   

zEmu3DecoderLayer.__init__NFrC   rR   position_idsr   	use_cacher   r   rU   rE   c              
   K   sj   |}	|  |}| jd|||||||d|\}}
|	| | }|}	| |}| |}|	| | }|S )N)rC   rR   r   r   r   r   r   r-   )r   r   rT   r   r   )r|   rC   rR   r   r   r   r   r   rU   residual_r-   r-   r.   r      s&   




zEmu3DecoderLayer.forward)NNNFNN)r&   r'   r(   r!   r   ro   r*   r   r+   r	   boolr   r   r   r   r   r-   r-   r}   r.   r      s6    	
r   c                       s6   e Zd ZdZdef fddZdejfddZ  Z	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    ri   c                    s>   t    t|j|j| _| jjj	d|j d|j  d S )Ng            ?)
rn   ro   r\   	Embeddingcodebook_size	embed_dim	embeddingr   datauniform_r   r}   r-   r.   ro   &  s   
"z!Emu3VQVAEVectorQuantizer.__init__hidden_statec                 C   s   |j \}}}}}|ddddd }|d|}tj|d ddd}tj| jjd dd	}	dt|| jj	dd }
||	 |
 }
tj
|
dd	}|||||}|S )
Nr   r    r      r0   r/   T)r2   r   r1   )r3   permutera   r   r*   sumr   r   rZ   r[   argmin)r|   r   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicesr-   r-   r.   r   +  s   z Emu3VQVAEVectorQuantizer.forward)
r&   r'   r(   r)   r#   ro   r*   r   r   r   r-   r-   r}   r.   r     s    
r   c                       r   )Emu3VQVAEEncoderConvDownsamplec                    s$   t    tj||dddd| _d S )Nr   r0   r   kernel_sizestridepaddingrn   ro   r\   Conv2dconvr|   in_channelsr}   r-   r.   ro   >     
z'Emu3VQVAEEncoderConvDownsample.__init__c                 C   s    t j|dddd}| |}|S )N)r   r    r   r    constantr   )padmoderQ   )Fr   r   r|   rC   r-   r-   r.   r   B  s   
z&Emu3VQVAEEncoderConvDownsample.forwardr   r-   r-   r}   r.   r   =      r   c                       r   )Emu3VQVAEEncoderConvUpsamplec                    s$   t    tj||dddd| _d S )Nr   r    r   r   r   r}   r-   r.   ro   J  r   z%Emu3VQVAEEncoderConvUpsample.__init__c                 C   s   t j|ddd}| |}|S )N       @nearestscale_factorr   )r   interpolater   r   r-   r-   r.   r   N  s   
z$Emu3VQVAEEncoderConvUpsample.forwardr   r-   r-   r}   r.   r   I  r   r   c                	       sF   e Zd Zdededee dee f fddZdejfdd	Z  Z	S )
Emu3VQVAEConv3d
in_channelout_channelr   r   c                    s   t    dd t|dd  |dd  D }d| _|d d d D ]}|  j|d |d  |d f7  _q!|  jd7  _tj||||d| _d S )	Nc                 S   s   g | ]\}}|| qS r-   r-   ).0
one_kernel
one_strider-   r-   r.   
<listcomp>^  s    z,Emu3VQVAEConv3d.__init__.<locals>.<listcomp>r    r-   r/   r0   )r0   r   )r   )rn   ro   zipr   r\   Conv3dr   )r|   r   r   r   r   padding_sizespad_sizer}   r-   r.   ro   U  s   
$$zEmu3VQVAEConv3d.__init__rC   c                 C   s   t || j}| |}|S r   )r   r   r   r   r   r-   r-   r.   r   k  s   
zEmu3VQVAEConv3d.forward)
r&   r'   r(   r   r   ro   r*   r   r   r   r-   r-   r}   r.   r   T  s    r   c                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	Emu3VQVAESpatialNormr   out_channelsc                    sN   t    tj|dddd| _tj||dddd| _tj||dddd| _d S )N    r   Tnum_channels
num_groupsr   affiner    r   r   )rn   ro   r\   	GroupNorm
norm_layerr   conv_yconv_br|   r   r   r}   r-   r.   ro   r  s*   
zEmu3VQVAESpatialNorm.__init__rC   quant_statesc                 C   s@   t j||jdd  dd}| |}|| | | | }|S )Nr   )sizer   )r   r   r3   r   r   r   )r|   rC   r   r-   r-   r.   r     s   
zEmu3VQVAESpatialNorm.forward	r&   r'   r(   r   ro   r*   r   r   r   r-   r-   r}   r.   r   q  s    r   c                       6   e Zd Zdedef fddZdejfddZ  ZS )Emu3VQVAETemporalUpsampler   r   c                        t    t||ddd| _d S )Nr   r   r   r    r    r    r   r   rn   ro   r   r   r|   r   r   r}   r-   r.   ro        
z"Emu3VQVAETemporalUpsample.__init__rC   c                 C   sr   |j \}}}}}|ddddd |d|}tj|ddd	}|||||dddddd }| |}|S )
Nr   r    r   r   r0   r/   r   r   r   )r3   r   ra   r   r   r   r   )r|   rC   r   r   r   r   r   r-   r-   r.   r     s    $
z!Emu3VQVAETemporalUpsample.forwardr  r-   r-   r}   r.   r        r  c                       r  )Emu3VQVAETemporalDownsampler   r   c                    r  )N)r   r   r   )r0   r    r    r  r	  r
  r}   r-   r.   ro     r  z$Emu3VQVAETemporalDownsample.__init__rC   c                 C   s   |  |}|S r   )r   r   r-   r-   r.   r     s   
z#Emu3VQVAETemporalDownsample.forwardr  r-   r-   r}   r.   r    r  r  c                       s(   e Zd Z	d fdd	Zdd Z  ZS )Emu3VQVAETemporalResnetBlockNc                    s   t    || _|d u r|n|| _t|| _t||ddd| _t|| _	t||ddd| _
| j| jkrBtj||dddd| _d S d S )Nr  r  r  r    r   r   )rn   ro   r   r   r\   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   r}   r-   r.   ro     s4   
z%Emu3VQVAETemporalResnetBlock.__init__c                 C   sf   |}|  |}|t|9 }| |}| |}|t|9 }| |}| j| jkr/| |}|| S r   )	r  r*   sigmoidr  r  r  r   r   r  )r|   rC   r   r-   r-   r.   r     s   




z$Emu3VQVAETemporalResnetBlock.forwardr   r   r-   r-   r}   r.   r    s     r  c                       sT   e Zd Z		d
dededB dedB f fddZddejdejdB fdd	Z  ZS )Emu3VQVAEResnetBlockNr   r   quant_channelsc                    s   t    || _|d u r|n|}|| _|| _|d u r/tj|dddd| _tj|dddd| _nt	||| _t	||| _tj
||dddd| _tj
||dddd| _| j| jkrdtj
||dddd| _d S d S )	Nr   r   Tr   r   r    r   r   )rn   ro   r   r   r  r\   r   r  r  r   r   r  r  r  )r|   r   r   r  r}   r-   r.   ro     sB   
zEmu3VQVAEResnetBlock.__init__rC   c                 C   s   | j d u rdn|f}|}| j|g|R  }|t|9 }| |}| j|g|R  }|t|9 }| |}| j| jkrA| 	|}|| S Nr-   )
r  r  r*   r  r  r  r  r   r   r  )r|   rC   r  	norm_argsr   r-   r-   r.   r     s   


zEmu3VQVAEResnetBlock.forward)NNr   r  r-   r-   r}   r.   r    s    $,r  c                
       sX   e Zd ZdZdef fddZ	ddejdejdB deejejdB f fd	d
Z	  Z
S )Emu3VQVAEAttentionBlockrh   ri   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).rk   Fr    )rn   ro   ri   rq   r   rr   	num_headsrK   
ValueErrorscalers   rT   rt   r\   ru   rx   ry   rw   out_projrY   r   r}   r-   r.   ro   2  s&   


z Emu3VQVAEAttentionBlock.__init__NrC   rR   rE   c              
   K   s   |j \}}}| |}| |}| |}	|||| j| jdd}|||| j| jdd}|	||| j| jdd}	t	| j
jt}
|
| |||	|| j| j| jsVdn| jd\}}|||| }| |}||fS )z#Input shape: Batch x Time x Channelr    r0   rM   )rt   rS   rT   )r3   rw   rx   ry   r   r  rK   r[   r   r   ri   r   rf   rt   r  rX   rT   rG   ra   r  )r|   rC   rR   rU   r   
seq_lengthr   querieskeysvaluesr   re   rd   r-   r-   r.   r   I  s.   




zEmu3VQVAEAttentionBlock.forwardr   )r&   r'   r(   r)   r#   ro   r*   r   r   r   r   r-   r-   r}   r.   r  /  s    r  c                       s*   e Zd ZdZ fddZdddZ  ZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                    s   t  jdi | d S r  )rn   ro   r|   rU   r}   r-   r.   ro   w  s   zEmu3VQVAEGroupNorm.__init__Nc                 C   s   t || j| j| j| jS r   )r   
group_normr   r   rm   r   )r|   inputr   r-   r-   r.   r   z  s   zEmu3VQVAEGroupNorm.forwardr   )r&   r'   r(   r)   ro   r   r   r-   r-   r}   r.   r#  p  s    r#  c                       s:   e Zd Zd fdd	ZddejdejdB fddZ  ZS )	Emu3VQVAEMiddleBlockNc                    s`   t    t|||d| _t|| _|d u r t|dddd| _nt||| _t|||d| _	d S )Nr   r   r  r   r   Tr   )
rn   ro   r  block_1r  attn_1r#  	attn_normr   block_2)r|   ri   r   r  r}   r-   r.   ro     s   

zEmu3VQVAEMiddleBlock.__init__rC   r   c                 C   s   |  ||}|}| ||}|j\}}}}||||| dd}| |d }|||||dddd}|| }| ||}|S )Nr    r0   r   r   )	r)  r+  r3   r   r[   r*  rG   r   r,  )r|   rC   r   r   r   r   r   r   r-   r-   r.   r     s   zEmu3VQVAEMiddleBlock.forwardr   r&   r'   r(   ro   r*   FloatTensorr   r   r-   r-   r}   r.   r'  ~  s    $r'  c                       ,   e Zd Z fddZdejfddZ  ZS )Emu3VQVAEDownBlockc              
      s(  t    t|j| _|j| _|j}|j}dt| }|| _t	
 | _t| jD ]i}t	
 }t	
 }t	
 }|||  }	|||  }
t| jD ]*}|t|	|
d |
}	|jd urq||jv rq|t| |t	j|	dddd qGt	 }||_||_||_|| jd krt|	|_| j| q(d S )Nr    r   r   r   r   Tr   r    )rn   ro   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsr   in_channel_multiplierr\   
ModuleListdownrangeappendr  attn_resolutionsr  r   Moduleblockattn
attn_normsr   
downsample)r|   ri   r7  r4  r8  i_levelr?  r@  rA  block_in	block_outi_blockr:  r}   r-   r.   ro     sD   


zEmu3VQVAEDownBlock.__init__rC   c           
      C   s   t | jD ]^\}}t| jD ]H}|j| |}t|jdkrV|}|j| |}|j\}}}}	|	||||	 
dd}|j| |d }||||	|dddd}|| }q|| jd krc||}q|S )Nr   r    r0   r   )	enumerater:  r;  r6  r?  r3  r@  rA  r3   r   r[   rG   r   r5  rB  )
r|   rC   rC  blocksrF  r   r   r   r   r   r-   r-   r.   r     s    
zEmu3VQVAEDownBlock.forwardr-  r-   r-   r}   r.   r0    s    %r0  c                       s2   e Zd Z fddZdejdejfddZ  ZS )Emu3VQVAEUpBlockc              	      s  t    t|j| _|j| _|j}|j|jd  }t	 | _
tt| jD ]]}t	 }t	 }t	 }|j|j|  }t| jd D ]"}	|t|||d |}||jv re|t| |t|| qCt }
||
_||
_||
_|dkr|t||
_| j
d|
 q&d S )Nr/   r    r(  r   )rn   ro   r3  r4  r5  r6  r   r7  r\   r9  upreversedr;  r<  r  r=  r  r   r>  r?  r@  rA  r   upsampleinsert)r|   ri   r  rD  rC  r?  r@  rA  rE  rF  rJ  r}   r-   r.   ro     s@   



zEmu3VQVAEUpBlock.__init__rC   r   c                 C   s   t | jd d d D ]d\}}t| jd D ]J}|j| ||}t|jdkr_|}|j| ||}|j\}}}	}
|	|||	|
 
dd}|j| |d }|||	|
|dddd}|| }q|t| jd krn||}q
|S )Nr/   r    r   r0   r   )rG  rJ  r;  r6  r?  r3  r@  rA  r3   r   r[   rG   r   rL  )r|   rC   r   rC  rH  rF  r   r   r   r   r   r-   r-   r.   r     s    
zEmu3VQVAEUpBlock.forwardr-  r-   r-   r}   r.   rI    s    %rI  c                       r/  )Emu3VQVAEEncoderc                    s  t    |j}|j}|j}|j}|j}|rd| n|}||d  }tjj	||dddd| _
t|| _t||| _tjjd|ddd	| _tjj	||dddd| _tt|j}	t | _t | _t|	D ]}
t||}| j| qft|jD ]}t||d
}| j| qyd S )Nr0   r/   r   r    r   r   r   T)r   r   r   r   r2  )rn   ro   r7  r   double_latentlatent_channelsr4  r*   r\   r   conv_inr0  
down_blockr'  middle_blockr   norm_outconv_outr   mathlog2temporal_downsample_factorr9  	time_convtime_res_stackr;  r  r<  r6  r  )r|   ri   r7  r   rO  rP  r4  r   rD  temporal_down_blocksir   r   time_res_convr}   r-   r.   ro     s@   




zEmu3VQVAEEncoder.__init__pixel_valuesc                 C   s   |j d }|jdg|j dd  R  }| |}| |}| |}| |}|t|9 }| |}|jd|g|j dd  R  }|	ddddd}| j
D ]}||}|t|9 }qN| jD ]}||}q_|	ddddd}|S )Nr    r/   r0   r   r   r   )r3   rG   rQ  rR  rS  rT  r*   r  rU  r   rY  rZ  )r|   r^  temporal_dimrC   r   layerr-   r-   r.   r   =  s"   








zEmu3VQVAEEncoder.forward)r&   r'   r(   ro   r*   r+   r   r   r-   r-   r}   r.   rN    s    'rN  c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Emu3VQVAEDecoderri   c           	         s  t    |j}|j|jd  }t | _t|j	D ]}t
|j|jd}| j| qtt|j}t | _t|D ]}t|j|j}| j| q<tj|j|dddd| _t|||d| _t|| _|j|jd  }t||| _tj||jdddd| _d S )Nr/   r2  r   r    r   )r  r   )rn   ro   r   r7  r4  r\   r9  rZ  r;  r6  r  rP  r<  r   rV  rW  rX  rY  r  r   rQ  r'  rS  rI  up_blockr   rT  r   rU  )	r|   ri   r  rD  r   r]  temp_upsample_block_numr\  r   r}   r-   r.   ro   \  s@   



zEmu3VQVAEDecoder.__init__rC   r   c                 C   s  t j||fdd}|ddddd}| jD ]}||}q| jD ]}||}|t |9 }q|ddddd}t j|ddd\}}|jdg|jdd  R  }|jdg|jdd  R  }| 	|}| 
||}| ||}| ||}|t |9 }| |}|S )Nr   r1   r0   r    r   r   r/   )r*   r4   r   rZ  rY  r  chunkrG   r3   rQ  rS  rb  rT  rU  )r|   rC   r   hidden_quant_statesr`  r-   r-   r.   r     s$   




zEmu3VQVAEDecoder.forward)	r&   r'   r(   r#   ro   r*   r   r   r   r-   r-   r}   r.   ra  [  s    'ra  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    custom_introc                
       s   e Zd ZU eed< dZdZdZdZdZ	dZ
dZg dZeegedZe dd	 Zdef fd
dZeedejdejdee defddZdejfddZ  ZS )	Emu3VQVAEri   
emuvideovqr^  )imageT)r  r  r  r   rC   
attentionsc                 C   s  t |tjtjfr5tj|jddd |jd ur3tjj	|j\}}dt
| }t|j| | d S d S t |tjrotj|jt
dd |jd urmtjj	|j\}}|dkr`dt
| nd}t|j| | d S d S t |tjtjtjfrt|jd t|jd	 t|d
d d urt|j t|j t|j d S d S t |tjrt|j |jd urt|jddst|j|j  d S d S d S d S )Nfan_outrelu)r   nonlinearityr       )ar   r   rM   running_mean_is_hf_initializedF)
isinstancer\   r   r   initkaiming_normal_r   rm   r*   _calculate_fan_in_and_fan_outrV  sqrtr   ru   kaiming_uniform_BatchNorm2dr  r   	constant_rp   zeros_rr  ones_running_varnum_batches_trackedr   normal_padding_idx)r|   rN   fan_inr   boundr-   r-   r.   _init_weights  s8   

zEmu3VQVAE._init_weightsc                    s   t  | || _t|| _t|| _t|| _dt	|j
d  | _t|j|jddd| _t|j|jddd| _dt	|j
d  | _|   |   d S )Nr0   r    )r   r    r    r  r  )rn   ro   ri   rN  encoderra  decoderr   quantizer3  r4  vision_spatial_factorr   rP  r   
quant_convpost_quant_convspatial_scale_factoreval	post_initr   r}   r-   r.   ro     s   


zEmu3VQVAE.__init__image_sizesrU   rE   c                    s   |j dk}|r jj}|j\}}}}	|dd|ddd}n|j\}}}}}	 |}
|
ddddd} |}|ddddd} 	|}|rO|
dn|} fddt||D }t|
|dS )	Nr   r    r   r0   r   c                    s@   g | ]\}}|d t |d  j d t |d  j f qS )Nr   r    )r   r  )r   single_imager  r   r-   r.   r     s    .z$Emu3VQVAE.encode.<locals>.<listcomp>)last_hidden_stater%   )ndimri   rX  r3   r:   repeatr  r   r  r  squeezer   r$   )r|   r^  r  rU   is_imager   r   r   r   r   rC   conv_hidden_statescodesr%   r-   r   r.   encode  s&   




zEmu3VQVAE.encoderC   c                 C   s   |j dk}|r|d}|j\}}}}| j| }|jd }||||||ddddd }| 	|}	|ddddd}|	ddddd}	| 
|	|}
|
||| jj | jj|| j || j }
|rn|
d d df S |
S )Nr   r    r/   r   r   r0   )r  r:   r3   r  r   flattenr   r   ra   r  r  rG   ri   rX  r   r  )r|   rC   r  r   r   r   r   quantr   
post_quantvideor-   r-   r.   decode  s&   


$

zEmu3VQVAE.decode)r&   r'   r(   r#   r,   base_model_prefixmain_input_nameinput_modalities_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr  r  r  _can_record_outputsr*   no_gradr  ro   r   r   r   r   r   r$   r  r  r   r-   r-   r}   r.   rh    s8   
 	
!rh  c                   @   s   e Zd ZdZdd Zedd Zedd Zedd	 Zed
d Z	edd Z
edd Zdeej dejfddZdejdejfddZdS )Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 C   s"   || _ |d| _|d| _d S )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r|   r  r-   r-   r.   ro   /  s   z#Emu3ImageVocabularyMapping.__init__c                 C      t dd | j D S )Nc                 S   s   g | ]\}}| d r|qS z<|visual token
startswithr   namevalr-   r-   r.   r   6      z;Emu3ImageVocabularyMapping.image_tokens.<locals>.<listcomp>sortedr  itemsr   r-   r-   r.   r%   4     z'Emu3ImageVocabularyMapping.image_tokensc                 C   r  )Nc                 S   s   g | ]\}}| d r|qS r  r  r  r-   r-   r.   r   :  r  z?Emu3ImageVocabularyMapping.image_tokens_str.<locals>.<listcomp>r  r   r-   r-   r.   image_tokens_str8  r  z+Emu3ImageVocabularyMapping.image_tokens_strc                    s    fdd j D S )Nc                    s$   i | ]}t |d d  j| qS )ir   )r   r  )r   tokenr   r-   r.   
<dictcomp>>  s   $ z6Emu3ImageVocabularyMapping.img2bpe.<locals>.<dictcomp>)r  r   r-   r   r.   img2bpe<     z"Emu3ImageVocabularyMapping.img2bpec                 C   s   dd | j  D S )Nc                 S   s   i | ]\}}||qS r-   r-   )r   r<   vr-   r-   r.   r  B      z6Emu3ImageVocabularyMapping.bpe2img.<locals>.<dictcomp>)r  r  r   r-   r-   r.   bpe2img@  r  z"Emu3ImageVocabularyMapping.bpe2imgc                 C   >   t jt| j d t jd}| j D ]\}}|||< q|S Nr    rV   )r*   zerosmaxr  r!  r   r  r|   mappingr<   r  r-   r-   r.   bpe2img_mapping_tensorD     
z1Emu3ImageVocabularyMapping.bpe2img_mapping_tensorc                 C   r  r  )r*   r  r  r  r!  r   r  r  r-   r-   r.   img2bpe_mapping_tensorK  r  z1Emu3ImageVocabularyMapping.img2bpe_mapping_tensor	img_batchrE   c                 C   sR   |j }tj|jd dftjd| j }| j|d }tj||gdd}||S )Nr   r    r  cpur/   r1   )	devicer*   r   r3   r   r  r  r`   r4   )r|   r  r  eol_row
img_tokensr-   r-   r.   convert_img2bpeR  s
    
z*Emu3ImageVocabularyMapping.convert_img2bpec                 C   s0   |j }|dd df }| j|d }||S )N.r/   r  )r  r  r`   )r|   r  r  r  r-   r-   r.   convert_bpe2imgY  s   
z*Emu3ImageVocabularyMapping.convert_bpe2imgN)r&   r'   r(   r)   ro   r   r%   r  r  r  r  r  listr*   r   r  r  r-   r-   r-   r.   r  *  s"    





r  c                   @   sN   e Zd ZU eed< dZdZdZdgZddgZ	dZ
dZdZdZdZeedZd	S )
Emu3PreTrainedModelri   modelrj  textTr   r   causal_maskrk  N)r&   r'   r(   r!   r,   r  r  supports_gradient_checkpointingr  _skip_keys_device_placementr  r  _can_compile_fullgraphr  r  r   rg   r  r-   r-   r-   r.   r  `  s    
 
r  c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Emu3RotaryEmbeddinginv_freqNri   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr  F)
persistentoriginal_inv_freq)rn   ro   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenri   rope_parametersr  compute_default_rope_parametersr   attention_scalingregister_bufferclone)r|   ri   r  rope_init_fnr  r}   r-   r.   ro   y  s   


zEmu3RotaryEmbedding.__init__r  ztorch.deviceseq_lenrE   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetarK   Nr   r   r0   r  )r  rV   )	r  rp   rq   rr   r*   arangeint64r`   r   )ri   r  r  baser2   attention_factorr  r-   r-   r.   r    s   
&z3Emu3RotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r/   r    mpsr  F)device_typeenabledr0   r1   r  )r  r   rF   r3   r`   r  rt  typestrr   r[   r*   r4   r=   r  r>   rV   )
r|   r5   r   inv_freq_expandedposition_ids_expandedr  freqsembr=   r>   r-   r-   r.   r     s   0&zEmu3RotaryEmbedding.forwardr   )NNN)r&   r'   r(   r*   r   r,   r!   ro   staticmethodr   r   r   r   r  r  r   r   r   r-   r-   r}   r.   r  v  s&   
 

r  c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
ej	dB dedB dee defddZ  ZS )Emu3TextModelri   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r-   )r   )r   rj   ri   r-   r.   r     r  z*Emu3TextModel.__init__.<locals>.<listcomp>r   r  F)rn   ro   pad_token_idr  
vocab_sizer\   r   rq   embed_tokensr9  r;  num_hidden_layerslayersr   r   normr  
rotary_embgradient_checkpointingr  r   r}   r  r.   ro     s   zEmu3TextModel.__init__N	input_idsrR   r   r   inputs_embedsr   r   rU   rE   c              
   K   s   |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r<|d ur-| nd}	tj|jd |jd|	 }|d u rE|	d}t
| j|||||d}
|}| j||d}| jd | jj D ]}||f|
|||||d|}qb| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embedsr  r   r    )r  )ri   r  rR   r   r   r   )r   )rR   r   r   r   r   r   )r  r   )r  r  r
   ri   get_seq_lengthr*   r  r3   r  r:   r   r  r  r  r   r   )r|   r  rR   r   r   r  r   r   rU   past_seen_tokensr  rC   r   decoder_layerr-   r-   r.   r     sP   

	
zEmu3TextModel.forward)NNNNNNN)r&   r'   r(   r!   ro   r   r   r   r*   r+   r   r	   r.  r   r   r   r   r   r   r-   r-   r}   r.   r    s>    	
r  c                       s   e Zd ZU ddiZddiZddgdgfiZeed<  fdd	Ze	e
	
	
	
	
	
	
	
	
	ddejd
B dejd
B dejd
B ded
B dejd
B dejd
B ded
B dejd
B deejB dee defddZ  ZS )Emu3ForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrC   logitsri   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S NFrl   )
rn   ro   r  r  r  r\   ru   rq   r
  r  r   r}   r-   r.   ro     s
   
zEmu3ForCausalLM.__init__Nr   r  rR   r   r   r  labelsr   r   logits_to_keeprU   rE   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }d}|durB| jd||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```r  rR   r   r   r  r   r   Nr  r  r  lossr  r   rC   rl  r-   )r  r  rt  r   slicer
  loss_functionri   r  r   r   rC   rl  )r|   r  rR   r   r   r  r  r   r   r  rU   outputsrC   slice_indicesr  r  r-   r-   r.   r     s0   !zEmu3ForCausalLM.forward)	NNNNNNNNr   )r&   r'   r(   _tied_weights_keys_tp_plan_pp_planr"   r,   ro   r   r   r*   r+   r   r	   r.  r   r   r   r   r   r   r   r-   r-   r}   r.   r    sP   
 		
r  c                       sL  e Zd ZddiZ fddZdd Zdd Zd	ejd
ej	dej	fddZ
eeddd	ejd
ej	dee deeB fddZe dej	dedefddZdej	dejdejfddZee									d%dej	dB d	ejdB d
ejdB dejdB dej	dB d edB dejdB d!edB d"ej	dB dee deeB fd#d$Z  ZS )&	Emu3Modelztext_model.model
text_modelc                    s>   t  | t|j| _t|j| _t	|j
| _|   d S r   )rn   ro   r  _from_configtext_configr  rh  	vq_configvqmodelr  vocabulary_mapvocabulary_mappingr  r   r}   r-   r.   ro   Y  s
   zEmu3Model.__init__c                 C   
   | j  S r   )r  get_input_embeddingsr   r-   r-   r.   r$  b     
zEmu3Model.get_input_embeddingsc                 C      | j | d S r   )r  set_input_embeddingsr|   rQ   r-   r-   r.   r'  e     zEmu3Model.set_input_embeddingsr^  r  rE   c                    s4    j j||dd} fdd|jD }t|}|S )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        T)return_dictc                       g | ]
} j | qS r-   r"  r  r  r   tokensr   r-   r.   r   u      z.Emu3Model.get_image_tokens.<locals>.<listcomp>)r   r  r%   r*   r4   )r|   r^  r  vqmodel_outputsbpe_tokens_list
bpe_tokensr-   r   r.   get_image_tokensh  s   

zEmu3Model.get_image_tokenszbTokenizes images into discrete tokens with VQGAN module and embeds them with text embeddings layerrf  rU   c           
         sl    j j||fddi|} fdd|D } fdd|jD }t|}  |}t||}	|	|_|S )z
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
            The tensors corresponding to the input images.
        r*  Tc                    s,   g | ]\}}| j j | j j d   qS r1  )r   r  )r   r   r   r   r-   r.   r     s    z0Emu3Model.get_image_features.<locals>.<listcomp>c                    r+  r-   r,  r-  r   r-   r.   r     r/  )r   r  r%   r*   r4   r$  splitpooler_output)
r|   r^  r  rU   r0  split_sizesr1  r2  image_embeddingsimage_featuresr-   r   r.   get_image_features{  s$   


zEmu3Model.get_image_featuresr%   r   r   c                 C   s>   |ddddf  d||d }| j|}| j|}|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nr/   r    )r   r"  r  r   r  )r|   r%   r   r   	sequencesrj  r-   r-   r.   decode_image_tokens  s   "zEmu3Model.decode_image_tokensr  r  r8  c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)rV   r  r/   r   r    z6Image features and image tokens do not match, tokens: z, features: )r$  r*   tensorr"  r  longr  allr   r3   r:   	expand_asr`   r   numel)r|   r  r  r8  special_image_maskn_image_tokensn_image_featuresr-   r-   r.   get_placeholder_mask  s   zEmu3Model.get_placeholder_maskNrR   r   r   r   r   c
              	   K   s   |du |duA rt d|du r|  |}|dur6| ||j}tj|dd}| j|||d}|||}| jd||||||	d|
}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   r1   )r  r8  )rR   r   r   r  r   r   r-   )	r  r$  r9  r5  r*   r4   rE  masked_scatterr  )r|   r  r^  r  rR   r   r   r  r   r   rU   r8  rB  r  r-   r-   r.   r     s0   
zEmu3Model.forward)	NNNNNNNNN)r&   r'   r(   _checkpoint_conversion_mappingro   r$  r'  r*   r.  r+   r3  r   r   r   r   r   r$   r9  r  r   r<  rE  r   r	   r   r   r   r   r-   r-   r}   r.   r  V  s|    	
	
r  c                       s  e Zd ZdZddiZddddZ fdd	Zd
d Zdd Zde	j
fddZdd Zee											d'dejdB dejdB dejdB dejdB dejdB dedB dejdB dedB dejdB dejdB deejB d ee deeB fd!d"Z						#		$d( fd%d&	Z  ZS ))Emu3ForConditionalGenerationr  r	  z$model.text_model.embed_tokens.weightzmodel.text_modelzmodel.vqmodelr
  )z^text_model.modelz^vqmodelz^text_model.lm_headc                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S r  )rn   ro   r  r  r\   ru   r  rq   r  r
  r  r   r}   r-   r.   ro     s   
z%Emu3ForConditionalGeneration.__init__c                 C   r#  r   )r  r$  r   r-   r-   r.   r$    r%  z1Emu3ForConditionalGeneration.get_input_embeddingsc                 C   r&  r   )r  r'  r(  r-   r-   r.   r'    r)  z1Emu3ForConditionalGeneration.set_input_embeddingsrE   c                 C   s   | j S r   )r
  r   r-   r-   r.   get_output_embeddings  s   z2Emu3ForConditionalGeneration.get_output_embeddingsc                 K   s   | j jdi |S r  )r  r<  r$  r-   r-   r.   r<    s   z0Emu3ForConditionalGeneration.decode_image_tokensNr   r  r^  r  rR   r   r   r  r   r   r  r  rU   c              
   K   s   | j d|||||||	d|}|d }t|trt| dn|}| |dd|ddf }d}|
durD| jd||
| jjjd|}t	|||j
|j|jdS )a  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```r  r   Nr  r  r-   )r  rt  r   r  r
  r  ri   r  r  r   r   rC   rl  )r|   r  r^  r  rR   r   r   r  r   r   r  r  rU   r  rC   r  r  r  r-   r-   r.   r     s8   Az$Emu3ForConditionalGeneration.forwardTFc
                    s:   t  j|f||||||||	d|
}|	s|rd |d< |S )N)r   rR   r  r   r   r^  r   is_first_iterationr^  )rn   prepare_inputs_for_generation)r|   r  r   rR   r  r   r   r   r^  rJ  rU   model_inputsr}   r-   r.   rK  p  s"   
z:Emu3ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNTNF)r&   r'   r(   output_modalitiesr  rG  ro   r$  r'  r\   r>  rI  r<  r   r   r*   r+   r.  r   r	   r   r   r   r   r   r   r   rK  r   r-   r-   r}   r.   rH    sz    	
`rH  )rH  r  r  r  rh  r  r1  )rM   )brV  collections.abcr   dataclassesr   	functoolsr   typingr   r*   torch.nnr\   torch.nn.functionalr]   r    r   ru  activationsr   cache_utilsr	   r
   
generationr   integrationsr   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_emu3r!   r"   r#   r$   r8   rB   r   r   rL   r>  r   rf   rg   r   r   r   r   r   r   r   r   r  r  r  r  r  r   r#  r'  r0  rI  rN  ra  rh  r  r  r  r  r  r  rH  __all__r-   r-   r-   r.   <module>   s   	
F-"$1?A";:FF 6APM   