o
    ei                     @   sX  d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 edG dd dej8Z9G dd dej8Z:G dd  d ej8Z;G d!d" d"ej8Z<G d#d$ d$ej8Z=d%d& Z>G d'd( d(ej8Z?G d)d* d*ej8Z@G d+d, d,ej8ZAd-d. ZBed/d^d0d1ZCd2ejDd3eEd4ejDfd5d6ZF	7d_d8ej8d9ejDd:ejDd;ejDd<ejDdB d=eGd>eGd?e(e* fd@dAZHeeCG dBdC dCej8ZIG dDdE dEeZJe+G dFdG dGe&ZKe+G dHdI dIe&ZLG dJdK dKej8ZMe+G dLdM dMeKZNe+G dNdO dOeKeZOee+dPdQG dRdS dSe ZPee+dTdQG dUdV dVeZQe+dWdQG dXdY dYeLZRe+dZdQG d[d\ d\eLeZSg d]ZTdS )`    )Callable)	dataclass)OptionalN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )
AriaConfigAriaTextConfigRMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )AriaTextRMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )z>
        AriaTextRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer*   	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/aria/modeling_aria.pyr-   5   s   

zAriaTextRMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr"   T)keepdim)	dtypetor/   float32powmeanrsqrtr2   r1   )r3   r9   input_dtypevariancer7   r7   r8   forward=   s
   zAriaTextRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler1   shaper2   r3   r7   r7   r8   
extra_reprD   s   zAriaTextRMSNorm.extra_repr)r)   )
__name__
__module____qualname__floatr-   r/   TensorrD   rH   __classcell__r7   r7   r5   r8   r(   3   s    r(   c                       (   e Zd ZdZ fddZdd Z  ZS )AriaProjectorMLPa!  
    Feed-Forward Network module for the Aria Projector.

    Args:
        in_features (`int`):
            Input embedding dimension.
        hidden_features (`int`):
            Hidden dimension of the feed-forward network.
        output_dim (`int`):
            Output dimension.
    c                    s<   t    tj||dd| _tj||dd| _td | _d S )NFbiasgelu_new)r,   r-   r   Linear	linear_in
linear_outr   act)r3   in_featureshidden_features
output_dimr5   r7   r8   r-   U   s   
zAriaProjectorMLP.__init__c                 C   s   |  | |}| |}|S N)rW   rU   rV   )r3   r9   r7   r7   r8   rD   [   s   
zAriaProjectorMLP.forwardrI   rJ   rK   __doc__r-   rD   rN   r7   r7   r5   r8   rP   H   s    rP   c                       s6   e Zd ZdZd
dedef fddZddd	Z  ZS )AriaCrossAttentionzv
    Aria Cross-Attention module.

    Args:
        config (`AriaConfig`):
            The configuration to use.
    r   configdropout_ratec                    s   t    |jj}|jj}|| _tj||dd| _tj||dd| _	tj||dd| _
tj||dd| _t||| _t|| _t|| _t|| _d S )NFrQ   T)batch_first)r,   r-   vision_configr4   num_attention_heads	num_headsr   rT   q_projk_projv_projMultiheadAttentionmultihead_attnlinearDropoutdropout	LayerNorm
layer_normlayer_norm_kv)r3   r_   r`   r4   rd   r5   r7   r8   r-   j   s   
zAriaCrossAttention.__init__Nc           	      C   sX   |  | |}| |}| |}| |}| j||||d\}}| | |}|S )a  
        Forward pass of the AriaCrossAttention module.

        Args:
            key_value_states (`torch.Tensor`):
                Input tensor for key and value.
            hidden_states (`torch.Tensor`):
                Input tensor for query.
            attn_mask (`torch.Tensor`, *optional*, defaults to None):
                Attention mask.

        Returns:
            torch.Tensor:
                Output tensor after cross-attention.
        	attn_mask)re   rn   ro   rf   rg   ri   rl   rj   )	r3   key_value_statesr9   rq   querykeyvalueattn_output_r7   r7   r8   rD   {   s   


zAriaCrossAttention.forward)r   r[   )	rI   rJ   rK   r]   r%   rL   r-   rD   rN   r7   r7   r5   r8   r^   a   s    r^   c                       sB   e Zd ZdZdef fddZd
dejdejdB fdd	Z  Z	S )AriaProjectora  
    Aria Projector module.

    This module projects vision features into the language model's embedding space, enabling interaction between vision and language components.

    Args:
        config (`AriaConfig`):
            Configuration object for the model.
    r_   c                    s   t    |j| _|jj| _|jj| _|jj| _	|j
j| _|j
j| _tt|j| j| _t|| _t| j| _t| j| j| j| _d S r[   )r,   r-   projector_patch_to_query_dictpatch_to_query_dictrb   r4   rX   rc   rd   kv_dimtext_configrY   rZ   r   r.   r/   zeros'max_value_projector_patch_to_query_dictrs   r^   
cross_attnrm   rn   rP   feed_forwardr3   r_   r5   r7   r8   r-      s   






zAriaProjector.__init__Nrr   rq   c           	      C   s   |j d |j d }}|| jvrtd| d| j  d| j| }| jd| d|dd}|durJ|| jd}|d	d|
dd}| j|||d}| | |}|S )	a  
        Forward pass of the Projector module.

        Args:
            key_value_states (`torch.Tensor`):
                Input tensor of shape (batch_size, num_patches, kv_dim).
            attn_mask (`torch.Tensor`, *optional*, default is None):
                Attention mask.

        Returns:
            `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim).
        r   r$   zNumber of patches z: not found in patch_to_query_dict amongst possible values .Nr:   rp   )rF   rz   KeyErrorkeysrs   	unsqueezerepeatrepeat_interleaverd   expandsizer   r   rn   )	r3   rr   rq   
batch_sizenum_patches	query_numqueriesattention_outoutr7   r7   r8   rD      s   

zAriaProjector.forwardr[   )
rI   rJ   rK   r]   r%   r-   r/   rM   rD   rN   r7   r7   r5   r8   rx      s    
$rx   c                       s.   e Zd ZdZdef fddZdd Z  ZS )AriaSharedExpertsMLPa/  
    Shared Expert MLP for shared experts.

    Unlike routed experts, shared experts process all tokens without routing.
    This class reconfigures the intermediate size in comparison to the LlamaMLP.

    Args:
        config (`AriaTextConfig`): Configuration object for the Aria language model.
    r_   c                    s~   t    || _|j| _|j|j | _tj| j| j|jd| _	tj| j| j|jd| _
tj| j| j|jd| _t|j | _d S )NrQ   )r,   r-   r_   r4   intermediate_sizemoe_num_shared_expertsr   rT   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr   r5   r7   r8   r-      s   
zAriaSharedExpertsMLP.__init__c                 C   s$   |  | | || | }|S r[   )r   r   r   r   )r3   xr   r7   r7   r8   rD      s    zAriaSharedExpertsMLP.forward)rI   rJ   rK   r]   r&   r-   rD   rN   r7   r7   r5   r8   r      s    

r   c                 C   s   | j d }|j d }tj||| j| jd}tj|dd}tjdtj|jd}t||f}t|j d D ] }|| }	||d  }
| |	|
 }t	||| }|||	|
< q4|S )a*  
    Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts.

    Args:
        token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features).
        expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features).
        tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert.

    Returns:
        torch.Tensor: Output tensor of shape (num_tokens, out_features).
    r   r:   r<   devicedimr$   )
rF   r/   r}   r<   r   cumsumlongcatrangematmul)token_statesexpert_weightstokens_per_expert
num_tokensout_featuresoutputcumsum_num_tokenszero_tensor
expert_numstartendtokensr   r7   r7   r8   sequential_experts_gemm   s   

r   c                       rO   )AriaGroupedExpertsGemmaP  
    Grouped GEMM (General Matrix Multiplication) module for efficient expert computation.
    This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm)
    for optimized performance. If the grouped_gemm library is not installed, it gracefully
    falls back to a sequential GEMM implementation, which may be slower but ensures
    functionality.

    Args:
        in_features (`int`):
            Number of input features.
        out_features (`int`):
            Number of output features.
        groups (`int`):
            Number of expert groups.
    c                    s6   t    || _|| _|| _tt|||| _	d S r[   )
r,   r-   rX   r   groupsr   r.   r/   emptyr1   )r3   rX   r   r   r5   r7   r8   r-   #  s
   
zAriaGroupedExpertsGemm.__init__c                 C   s   t || j| S )au  
        Perform grouped matrix multiplication.

        Args:
            input (`torch.Tensor`):
                Input tensor of shape (num_tokens, in_features).
            tokens_per_expert (`torch.Tensor`):
                Number of tokens assigned to each expert.

        Returns:
            torch.Tensor: Output tensor of shape (num_tokens, out_features).
        )r   r1   cpu)r3   inputr   r7   r7   r8   rD   *  s
   zAriaGroupedExpertsGemm.forwardr\   r7   r7   r5   r8   r     s    r   c                       s>   e Zd Zdeddf fddZdd Zdejfdd	Z  Z	S )
AriaExpertsr_   r+   Nc                    s@   t    || _t|j|jd |j| _t|j|j|j| _d S )Nr"   )	r,   r-   r_   r   r4   r   moe_num_expertsfc1fc2r   r5   r7   r8   r-   ?  s   
zAriaExperts.__init__c                 C   s0   t j|| jjdd\}}tjj|dd}||fS )Nr$   )kr   r:   r   )r/   topkr_   moe_topkr   
functionalsoftmax)r3   router_logits
top_logitstop_indicesscoresr7   r7   r8   route_tokens_to_expertsE  s   z#AriaExperts.route_tokens_to_expertsc                 C   s  |  |\}}|j}tj| tj| jjd| jjd d|}|}|	d}t
|}	|d|	| jj }
| |
|}tj|ddd\}}tj|| }| ||}tj|jd | jj |df|j|jd}|d|	| |	d| jj|d}||d jdd}|S )Nr   r$   )binsminmaxr:   r"   r   r   )r   r<   r/   histcflattenr=   r>   r_   r   viewargsortindex_selectr   r   chunkr   r   silur   r}   rF   r   r   index_copy_r   sum)r3   r9   r   top_k_indextop_k_weightsoriginal_dtyper   indicesflatten_indicessorted_indicespermuted_tokens
fc1_output
projectiongateexpert_outputunpermuted_tokensr   r7   r7   r8   rD   J  s6   


zAriaExperts.forward)
rI   rJ   rK   r&   r-   r   r/   rM   rD   rN   r7   r7   r5   r8   r   >  s    r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )AriaTextMoELayerr_   c                    s>   t    tj|j|jdd| _t|| _t	|| _
|| _d S NFrQ   )r,   r-   r   rT   r4   r   routerr   expertsr   shared_expertsr_   r   r5   r7   r8   r-   k  s
   



zAriaTextMoELayer.__init__r9   r+   c                 C   sL   |j }|d|d}| |}| |||}| ||}|| S Nr:   )rF   r   r   r   r   r   )r3   r9   original_shaper   r   shared_expert_outputr7   r7   r8   rD   r  s   
zAriaTextMoELayer.forward)	rI   rJ   rK   r&   r-   r/   rM   rD   rN   r7   r7   r5   r8   r   j  s    r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr:   r"   r   )rF   r/   r   )r   x1x2r7   r7   r8   rotate_half{  s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r   )qr   cossinunsqueeze_dimq_embedk_embedr7   r7   r8   apply_rotary_pos_emb  s
   

r   r9   n_repr+   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r$   N)rF   r   reshape)r9   r   batchnum_key_value_headsslenhead_dimr7   r7   r8   	repeat_kv  s
   0r           modulers   rt   ru   attention_maskscalingrl   kwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr"   r   r:   )r   r<   )ptrainingr$   )r   num_key_value_groupsr/   r   	transposer   r   r   r>   r=   r<   rl   r   
contiguous)r   rs   rt   ru   r   r   rl   r   
key_statesvalue_statesattn_weightsrv   r7   r7   r8   eager_attention_forward  s   
r  c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejf fddZ  ZS )AriaTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr_   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr   g      TrQ   )r,   r-   r_   r  getattrr4   rc   r   r   r   r   attention_dropout	is_causalr   rT   attention_biasre   rf   rg   o_projr3   r_   r  r5   r7   r8   r-     s(   
zAriaTextAttention.__init__Nr9   position_embeddingsr   past_key_valuescache_positionr   r+   c                 K   s  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jd|\}}|jg |dR   }| |}||fS )Nr:   r$   r"   )r   r   r  r   )rl   r   )rF   r   re   r   r   rf   rg   r   updater  r   get_interfacer_   _attn_implementationr  r   r  r   r   r   r	  )r3   r9   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacerv   r  r7   r7   r8   rD     s8   	

zAriaTextAttention.forward)NNNN)rI   rJ   rK   r]   r&   intr-   r/   rM   rE   r	   
LongTensorr   r   rD   rN   r7   r7   r5   r8   r    s,    r  c                       s   e Zd ZdZdedef fddZ						ddejd	ejdB d
ej	dB de
dB dedB dej	dB deejejf dB dee dejfddZ  ZS )AriaTextDecoderLayerag  
    Aria Text Decoder Layer.

    This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network.

    Args:
        config (`AriaTextConfig`):
            Configuration object for the text component of the model.
        layer_idx (`int`):
            Index of the layer.
    r_   r  c                    sR   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
d S )N)r_   r  r*   )r,   r-   r4   r  	self_attnr   mlpr(   rms_norm_epsinput_layernormpost_attention_layernormr
  r5   r7   r8   r-     s   

zAriaTextDecoderLayer.__init__NFr9   r   position_idsr  	use_cacher  r  r   r+   c              
   K   s^   |}	|  |}| jd|||||||d|\}}
|	| }|}	| |}| |}|	| }|S )N)r9   r   r  r  r   r  r  r7   )r  r  r  r  )r3   r9   r   r  r  r   r  r  r   residualrw   r7   r7   r8   rD     s&   




zAriaTextDecoderLayer.forward)NNNFNN)rI   rJ   rK   r]   r&   r  r-   r/   rM   r  r	   boolrE   r   r   rD   rN   r7   r7   r5   r8   r    s8    	
r  c                       s\   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZdZeedZe  fd	d
Z  ZS )AriaTextPreTrainedModelr_   model)imagetextr  r   Tr  r9   
attentionsc                    s4   t  | t|trtj|jd| jjd d S d S )Nr   )r@   std)	r,   _init_weights
isinstancer   initnormal_r1   r_   initializer_ranger3   r   r5   r7   r8   r*  Q  s   
z%AriaTextPreTrainedModel._init_weights)rI   rJ   rK   r&   __annotations__base_model_prefixinput_modalities_no_split_modulessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_attention_backendr  r  _can_record_outputsr/   no_gradr*  rN   r7   r7   r5   r8   r#  @  s   
 r#  c                       s`   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZe  fdd	Z  ZS )
AriaPreTrainedModelr_   r$  TAriaDecoderLayerr  Fr'  c                    s2   t  | t|trtj|j| jjd d S d S )N)r)  )	r,   r*  r+  rx   r,  trunc_normal_rs   r_   r.  r/  r5   r7   r8   r*  i  s   
z!AriaPreTrainedModel._init_weights)rI   rJ   rK   r%   r0  r1  r4  r3  r5  r6  r7  _supports_flex_attn_can_compile_fullgraphr8  r  r  r9  r/   r:  r*  rN   r7   r7   r5   r8   r;  X  s    
 r;  c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )AriaTextRotaryEmbeddinginv_freqNr_   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrA  F)
persistentoriginal_inv_freq)r,   r-   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr_   rope_parametersrB  compute_default_rope_parametersr   attention_scalingregister_bufferclone)r3   r_   r   rope_init_fnrA  r5   r7   r8   r-   s  s   


z AriaTextRotaryEmbedding.__init__r   ztorch.deviceseq_lenr+   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetar   Ng      ?r   r"   r<   )r   r<   )	rI  r  r4   rc   r/   arangeint64r=   rL   )r_   r   rO  baser   attention_factorrA  r7   r7   r8   rJ    s   
&z7AriaTextRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r:   r$   mpsr   F)device_typeenabledr"   r   rQ  )rA  rL   r   rF   r=   r   r+  typestrr   r   r/   r   r   rK  r   r<   )
r3   r   r  inv_freq_expandedposition_ids_expandedrW  freqsembr   r   r7   r7   r8   rD     s   0&zAriaTextRotaryEmbedding.forwardr[   )NNN)rI   rJ   rK   r/   rM   r0  r&   r-   staticmethodr   r  rE   rL   rJ  r:  r   rD   rN   r7   r7   r5   r8   r@  p  s&   
 

r@  c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
ej	dB dedB dee defddZ  ZS )AriaTextModelr_   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r7   )r  ).0r  r_   r7   r8   
<listcomp>  s    z*AriaTextModel.__init__.<locals>.<listcomp>r  rb  F)r,   r-   pad_token_idpadding_idx
vocab_sizer   	Embeddingr4   embed_tokens
ModuleListr   num_hidden_layerslayersr(   r  normr@  
rotary_embgradient_checkpointing	post_initr   r5   rb  r8   r-     s   zAriaTextModel.__init__N	input_idsr   r  r  inputs_embedsr  r   r   r+   c              
   K   s   |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r<|d ur-| nd}	tj|jd |jd|	 }|d u rE|	d}t
| j|||||d}
|}| j||d}| jd | jj D ]}||f|
|||||d|}qb| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embedsrb  r   r$   )r   )r_   rq  r   r  r  r  )r  )r   r  r  r  r   r  )last_hidden_stater  )
ValueErrorrh  r
   r_   get_seq_lengthr/   rR  rF   r   r   r   rm  rk  rj  rl  r   )r3   rp  r   r  r  rq  r  r   r   past_seen_tokenscausal_maskr9   r  decoder_layerr7   r7   r8   rD     sP   

	
zAriaTextModel.forward)NNNNNNN)rI   rJ   rK   r&   r-   r    r!   r   r/   r  rM   r	   FloatTensorr"  r   r   r   rD   rN   r7   r7   r5   r8   r`    s>    	
r`  c                       s   e Zd ZddiZddiZddgdgfiZdef fdd	Ze	
	
	
	
	
	
	
	
	dde	j
d
B de	jd
B de	j
d
B ded
B de	jd
B de	j
d
B ded
B de	j
d
B dee	jB dee defddZ  ZS )AriaTextForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr9   logitsr_   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r   )
r,   r-   r`  r$  rf  r   rT   r4   r{  ro  r   r5   r7   r8   r-     s
   
zAriaTextForCausalLM.__init__Nr   rp  r   r  r  rq  labelsr   r  logits_to_keepr   r+   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }d}|durB| jd||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, AriaTextForCausalLM

        >>> model = AriaTextForCausalLM.from_pretrained("meta-aria_text/AriaText-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-aria_text/AriaText-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rp  r   r  r  rq  r   r  Nr}  r~  rf  lossr}  r  r9   r(  r7   )r$  rr  r+  r  slicer{  loss_functionr_   rf  r   r  r9   r(  )r3   rp  r   r  r  rq  r~  r   r  r  r   outputsr9   slice_indicesr}  r  r7   r7   r8   rD     s0   zAriaTextForCausalLM.forward)	NNNNNNNNr   )rI   rJ   rK   _tied_weights_keys_tp_plan_pp_planr&   r-   r   r/   r  rM   r	   rx  r"  r  r   r   r   rD   rN   r7   r7   r5   r8   ry    sL    		
ry  zP
    Base class for Aria causal language model (or autoregressive) outputs.
    custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dS )	AriaCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r}  r  r9   r(  image_hidden_states)rI   rJ   rK   r]   r  r/   rx  r0  r}  r  r	   r9   rE   r(  r  r7   r7   r7   r8   r  M  s   
 r  zI
    Base class for Aria outputs, with hidden states and attentions.
    c                   @   s$   e Zd ZU dZdZejdB ed< dS )AriaModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  )rI   rJ   rK   r]   r  r/   rx  r0  r7   r7   r7   r8   r  k  s   
 r  zt
    The Aria model which consists of a vision backbone and a language model, without a language modeling head.
    c                       s:  e Zd ZddiZdef fddZdd Zdd	 Zee	e
d
d			d$dejdejdB dededB dee deeB fddZdejdejdejfddZee
									d%dejdB dejdB dejdB dejdB dejdB dedB dejdB dedB dejdB dee deeB fd d!Zd"d# Z  ZS )&	AriaModel^language_model.modellanguage_modelr_   c                    s>   t  | t|j| _t|| _t|j| _	| 
  d S r[   )r,   r-   r#   from_configrb   vision_towerrx   multi_modal_projectorr|   r  ro  r   r5   r7   r8   r-     s
   
zAriaModel.__init__c                 C   
   | j  S r[   )r  get_input_embeddingsrG   r7   r7   r8   r       
zAriaModel.get_input_embeddingsc                 C      | j | d S r[   )r  set_input_embeddingsr3   ru   r7   r7   r8   r       zAriaModel.set_input_embeddingszWObtains image last hidden states from the vision tower and apply multimodal projection.r  Nr:   pixel_values
pixel_maskvision_feature_layeroutput_hidden_statesr   r+   c                 K   sb   |  |}| j|f|ddd|}d }|d ur"|d}	t|	}|j| }
| j|
|d|_|S )NT)patch_attention_maskr  return_dictr$   rp   )_create_patch_attention_maskr  r   r/   logical_notr9   r  pooler_output)r3   r  r  r  r  r   r  image_outputsimage_attn_maskflattened_maskselected_image_featurer7   r7   r8   get_image_features  s"   



zAriaModel.get_image_featuresrp  rq  image_featuresc                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr   r:   r   r$   z6Image features and image tokens do not match, tokens: z, features: )r  r/   tensorr_   image_token_idr   r   allr   rF   r   	expand_asr=   r   numel)r3   rp  rq  r  special_image_maskn_image_tokensn_image_featuresr7   r7   r8   get_placeholder_mask  s   zAriaModel.get_placeholder_maskr   r  r  r   r  c
              	   K   s   |d u r
|   |}|d ur7|jd dkr7| j||| jjddj}||j|j}| j	|||d}|
||}| jd||||||	d|
}t|j|rN|jnd |j|j|d ur[|dS d dS )Nr$   T)r  r  r  r  )rq  r  )r   r  r  rq  r   r  )rr  r  r9   r(  r  r7   )r  rF   r  r_   r  r  r=   r   r<   r  masked_scatterr  r  rr  r  r9   r(  )r3   rp  r  r  r   r  r  rq  r   r  r   r  r  r  r7   r7   r8   rD     sF   

zAriaModel.forwardc                 C   sX   |d u rd S |j d| jjj| jjjd}|j d| jjj| jjjd}|jdddk S )Nr$   )	dimensionr   stepr"   )r:   r   r   )unfoldr  r_   
patch_sizer   r"  )r3   r  patches_subgridr7   r7   r8   r     s   z&AriaModel._create_patch_attention_mask)Nr:   N)	NNNNNNNNN)rI   rJ   rK   _checkpoint_conversion_mappingr%   r-   r  r  r   r    r   r/   rx  r  r"  r   r   rE   r   r  r  r  rM   r	   r   r  rD   r  rN   r7   r7   r5   r8   r    s    
	
0r  z
    Aria model for conditional generation tasks.

    This model combines a vision tower, a multi-modal projector, and a language model
    to perform tasks that involve both image and text inputs.
    c                       sR  e Zd ZdddddZddiZdef fd	d
Zdd Zdd Zde	j
fddZe		d)dejdejdB dedee deeB f
ddZee											d*dejdB dejdB dejdB dejdB dejdB dedB dejdB d ejdB d!edB d"eejB d#ejdB dee deeB fd$d%Z								&d+ fd'd(	Z  ZS ),AriaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorr{  )r  z^vision_towerz^multi_modal_projectorz^language_model.lm_headrz  z(model.language_model.embed_tokens.weightr_   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S r   )r,   r-   r  r$  r   rT   r|   r4   rf  r{  ro  r   r5   r7   r8   r-   "  s   
z%AriaForConditionalGeneration.__init__c                 C   r  r[   )r$  r  rG   r7   r7   r8   r  (  r  z1AriaForConditionalGeneration.get_input_embeddingsc                 C   r  r[   )r$  r  r  r7   r7   r8   r  +  r  z1AriaForConditionalGeneration.set_input_embeddingsr+   c                 C   s   | j S r[   )r{  rG   r7   r7   r8   get_output_embeddings.  s   z2AriaForConditionalGeneration.get_output_embeddingsNr:   r  r  r  r   c                 K   s   | j jd|||d|S )N)r  r  r  r7   )r$  r  )r3   r  r  r  r   r7   r7   r8   r  1  s   z/AriaForConditionalGeneration.get_image_featuresr   rp  r   r  r  rq  r~  r   r  r  c                 K   s   | j d||||||||	|d	|}|d }t|
tr!t|
 dn|
}| |dd|ddf }d}|durF| jd||| jjjd|}t	|||j
|j|jdS )a{  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `AriaForConditionalGeneration`).
            Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria")
        >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "image"},
        ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
        ...             {"type": "image"},
        ...             {"type": "text", "text": "What can we see in this image?"},
        ...         ]
        ...     },
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "image"},
        ...             {"type": "text", "text": "In which city is that bridge located?"},
        ...         ]
        ...     }
        ... ]

        >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
        >>> images = [[image1, image2], [image3]]
        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts[0])
        Assistant: There are buildings, trees, lights, and water visible in this image.

        >>> print(generated_texts[1])
        Assistant: The bridge is in San Francisco.
        ```)	rp  r  r  r   r  r  rq  r   r  r   Nr  r  r7   )r$  r+  r  r  r{  r  r_   r|   rf  r  r  r9   r(  )r3   rp  r  r  r   r  r  rq  r~  r   r  r  r   r  r9   r  r}  r  r7   r7   r8   rD   @  s<   N
z$AriaForConditionalGeneration.forwardFc
              	      sF   t  j|f||||||	d|
}|	s|
dds!||d< ||d< |S )N)r  rq  r   r  r  is_first_iterationr   Tr  r  )r,   prepare_inputs_for_generationget)r3   rp  r  rq  r  r  r   r  r  r  r   model_inputsr5   r7   r8   r    s    z:AriaForConditionalGeneration.prepare_inputs_for_generationr   )NNNNNNNNNr   N)NNNNNNNF)rI   rJ   rK   r  r  r%   r-   r  r  r   Moduler  r   r/   rx  r  r   r   rE   r   r  r   r  rM   r	   r"  r  rD   r  rN   r7   r7   r5   r8   r    s    
	
or  )r  r;  r#  r`  r  ry  )r$   )r   )Ucollections.abcr   dataclassesr   typingr   r/   r    r   r,  activationsr   cache_utilsr	   r
   
generationr   integrationsr   r   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r    utils.output_capturingr!   autor#   configuration_ariar%   r&   r  r(   rP   r^   rx   r   r   r   r   r   r   r   rM   r  r   rL   r  r  r  r#  r;  r@  r`  ry  r  r  r  r  __all__r7   r7   r7   r8   <module>   s   7A,,
F8APJ  9