o
    i                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	  mZ d dlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 e08e9Z:G dd de	j;Z<G dd de	j;Z=G dd dej	j;Z>G dd de	j;Z?G dd de	j@ZAed G d!d  d e	j;ZBG d"d# d#e	j;ZCd$ejDd%ejDd&ejDd'eEejDejDf fd(d)ZFd*ejDd+eGd'ejDfd,d-ZH	.dfd/e	j;d0ejDd1ejDd2ejDd3eejD d4eId5eIfd6d7ZJ	.dfd/e	j;d0ejDd1ejDd2ejDd3eejD d4eId5eIfd8d9ZKG d:d; d;e	j;ZLG d<d= d=eZMe.G d>d? d?e)ZNe.G d@dA dAeNZOG dBdC dCeNeZPee.dDdEG dFdG dGe#ZQG dHdI dIej	j;ZRG dJdK dKe	j;ZSdLdM ZTG dNdO dOe	j;ZUdPejDd0ejDfdQdRZVd0ejDd1ejDdPejDd'eEejDejDf fdSdTZWG dUdV dVe	j;ZXG dWdX dXe	j;ZYG dYdZ dZeZZG d[d\ d\e	j;Z[G d]d^ d^e	j;Z\G d_d` d`e	j;Z]G dadb dbeNZ^G dcdd ddeNeZ_g deZ`dS )g    N)	dataclass)CallableOptionalUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )Llama4ConfigLlama4TextConfigc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Llama4TextExpertsconfigc                    sx   t    |j| _|j| _|j| _| j| _tt	
| j| jd| j | _tt	
| j| j| jf| _t|j | _d S N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr   
hidden_actact_fnselfr$   	__class__ ^/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/llama4/modeling_llama4.pyr(   /   s   
 zLlama4TextExperts.__init__hidden_statesreturnc                 C   sb   | | jjd d| j}t|| j}|jddd\}}t|| | | j}| d| j}|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r   r&   dim)	viewr2   shaper,   r0   bmmchunkr5   r3   )r7   r<   gate_upgateupnext_statesr:   r:   r;   forward9   s   zLlama4TextExperts.forward)	__name__
__module____qualname__r"   r(   r0   TensorrI   __classcell__r:   r:   r8   r;   r#   .   s    
r#   c                       s&   e Zd Zd fdd	Zdd Z  ZS )Llama4TextMLPNc                    sj   t    |d u r|j}|| _tj|j|dd| _tj|j|dd| _tj||jdd| _	t
|j | _d S NFbias)r'   r(   r+   r$   r.   Linearr,   	gate_projup_projr3   r   r4   activation_fn)r7   r$   r+   r8   r:   r;   r(   P   s   
zLlama4TextMLP.__init__c                 C   s$   |  | || | }| |S N)rV   rT   rU   r3   )r7   xr3   r:   r:   r;   rI   \   s   
zLlama4TextMLP.forwardrW   rJ   rK   rL   r(   rI   rN   r:   r:   r8   r;   rO   O   s    rO   c                       s<   e Zd Zddef fddZdd Zdd Zd	d
 Z  ZS )Llama4TextL2Normư>epsc                    s   t    || _d S rW   )r'   r(   r\   )r7   r\   r8   r:   r;   r(   b   s   

zLlama4TextL2Norm.__init__c                 C   $   |t |djddd| j  S Nr&   r>   T)keepdimr0   rsqrtpowmeanr\   r7   rX   r:   r:   r;   _normf      $zLlama4TextL2Norm._normc                 C   s   |  | |S rW   )re   floattype_asrd   r:   r:   r;   rI   i   s   zLlama4TextL2Norm.forwardc                 C   s   d| j  S )Nzeps=r\   r7   r:   r:   r;   
extra_reprl   s   zLlama4TextL2Norm.extra_repr)r[   )	rJ   rK   rL   rg   r(   re   rI   rk   rN   r:   r:   r8   r;   rZ   a   s
    rZ   c                       s6   e Zd Zd
 fdd	Zdd Zdd Zdd	 Z  ZS )Llama4TextRMSNormh㈵>c                    s&   t    || _tt|| _dS )z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r'   r(   r\   r.   r/   r0   onesweight)r7   r,   r\   r8   r:   r;   r(   q   s   
zLlama4TextRMSNorm.__init__c                 C   r]   r^   r`   rd   r:   r:   r;   re   y   rf   zLlama4TextRMSNorm._normc                 C   s   |  | |}|| j S rW   )re   rg   rh   ro   )r7   rX   outputr:   r:   r;   rI   |   s   
zLlama4TextRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplero   rB   r\   rj   r:   r:   r;   rk      s   zLlama4TextRMSNorm.extra_repr)rm   )rJ   rK   rL   r(   re   rI   rk   rN   r:   r:   r8   r;   rl   p   s
    rl   c                       s(   e Zd Z fddZ fddZ  ZS )Llama4Routerc                    s*   t  j|j|jdd |j| _|j| _d S rP   )r'   r(   r,   r)   r*   num_experts_per_toktop_kr6   r8   r:   r;   r(      s   zLlama4Router.__init__c                    s^   t  |}tj|| jdd\}}t|tdd||}tjj	
| |j}||fS )Nr    r?   z-inf)r'   rI   r0   topkrt   	full_likerg   scatter_r.   
functionalsigmoidtodtype)r7   r<   router_logitsrouter_top_valuerouter_indicesrouter_scoresr8   r:   r;   rI      s
   zLlama4Router.forwardrY   r:   r:   r8   r;   rr      s    rr   Llama4TextMoec                       $   e Zd Z fddZdd Z  ZS )r   c                    sD   t    |j| _|j| _|j| _t|| _	t
|| _t|| _d S rW   )r'   r(   rs   rt   r,   
hidden_dimr)   r*   r#   expertsrr   routerrO   shared_expertr6   r8   r:   r;   r(      s   


zLlama4TextMoe.__init__c                 C   s   | d| j}| |\}}||jd d}||dd dd }| |}| |}|| |jd d|jd j	dd ||fS )Nr>   r    r   r?   )
reshaper   r   repeatrB   	transposer   r   add_sum)r7   r<   r   r|   	routed_in
routed_outoutr:   r:   r;   rI      s   

(zLlama4TextMoe.forwardrY   r:   r:   r8   r;   r      s    	c                       sD   e Zd ZU ejed< ddef fddZe e	dd Z
  ZS )	Llama4TextRotaryEmbeddinginv_freqNr$   c                    sp   t    |jd urdnd| _|j| _|j| _|| _t| j | _	| 	| j|\}| _
| jd|dd | j| _d S )Nllama3defaultr   F)
persistent)r'   r(   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr$   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r7   r$   devicer   r8   r:   r;   r(      s   
z"Llama4TextRotaryEmbedding.__init__c                 C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}tj	|dd# |
|j| dd}tt||}|| j }W d    |S 1 saw   Y  |S )	Nr   r>   r    mpscpuF)device_typeenabledr&   )r   rg   expandrB   
isinstancer   typestrr0   autocastrz   r   polar	ones_liker   )r7   rX   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_cisr:   r:   r;   rI      s   (&
z!Llama4TextRotaryEmbedding.forwardrW   )rJ   rK   rL   r0   rM   __annotations__r"   r(   no_gradr   rI   rN   r:   r:   r8   r;   r      s   
 
r   xqxkr   r=   c              	   C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }t ||d d d d d d d f  d}t ||d d d d d d d f  d}|| ||fS )Nr>   r&   r   )r0   view_as_complexrg   r   rB   view_as_realflattenrh   )r   r   r   xq_xk_xq_outxk_outr:   r:   r;   apply_rotary_emb   s
   ,,,,r   r<   n_repc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rB   r   r   )r<   r   batchnum_key_value_headsslenhead_dimr:   r:   r;   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dd}
tjj	|
|| j
d}
t|
|	}|dd }||
fS )Nr&   r   r>   r?   ptrainingr    )r   num_key_value_groupsr0   matmulr   rB   r.   rx   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr:   r:   r;   eager_attention_forward   s   
&r   c                 K   s   t || j}t || j}	t||dd| jd  }
|d ur6|d d d d d d d |jd f }|
| }
tjj	|
dd}
tjj
|
|| jd}
t|
|	}|dd }||
fS )	Nr&   r         r   r>   r?   r   r    )r   r   r0   r   r   r   rB   r.   rx   r   r   r   r   r   r:   r:   r;   vision_eager_attention_forward   s   
&r   c                       s   e Zd ZdZdef fddZedddd				dd
ejde	ejejf de
ej de
e de
ej dee de	eje
ej e
e	ej  f fddZ  ZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr$   c                    s&  t    || _|| _t|d|j|j | _|j| _|j|j | _	|j| _| jd | _
|j| _|j| _|j| _|j| _d| _|j| | _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jjr| jrt|j| _d S d S d S )Nr   r   TrQ   )r'   r(   r$   	layer_idxgetattrr,   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper.   rS   attention_biasq_projk_projv_projo_projuse_qk_normrZ   rms_norm_epsqk_normr7   r$   r   r8   r:   r;   r(     s:   
zLlama4TextAttention.__init__past_key_valuepast_key_values4.58new_nameversionNr<   position_embeddingsr   cache_positionr   r=   c                 K   s  |j d d }g |d| jR }| ||}	| |jg |d| jR  }
| ||dd}| jrDt|	|
|	|	j
\}	}
t| drS| |	}	| |
}
| jr| jstt| d | j | j d }|d|d ddfg |ddR }|	| 	|	j}	|	dd}	|
dd}
|d urd|i}||
|| j|\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd	|\}}|jg |dR    }| !|}||fS )
Nr>   r    r&   r         ?r   eagerr   )r   r   )"rB   r   r   rA   r   r   r   r   r   rz   r   hasattrr   r   r0   log1pfloorrg   r   r   r   r{   updater   r   r$   _attn_implementationr   r   r   r   r   r   r   )r7   r<   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   r:   r:   r;   rI   8  sP   
 


&*

zLlama4TextAttention.forwardNN)rJ   rK   rL   __doc__r"   r(   r   r0   rM   rq   r   r	   
LongTensorr   r   rI   rN   r:   r:   r8   r;   r     s*    r   c                       s   e Zd Z fddZedddd						dd	ejd
eej deej dee	 dee
 deej deeejejf  dee deejeeejejf  f fddZ  ZS )Llama4TextDecoderLayerc                    s   t    |j| _|| _|j| | _t||| _||jv | _	| j	r't
|| _nt||jd| _t|j|jd| _t|j|jd| _d S )N)r+   ri   )r'   r(   r,   r   layer_typesattention_typer   	self_attn
moe_layersis_moe_layerr   feed_forwardrO   intermediate_size_mlprl   r   input_layernormpost_attention_layernormr   r8   r:   r;   r(   v  s   
zLlama4TextDecoderLayer.__init__r   r   r   r   NFr<   r   r   	use_cacher   r   r   r=   c              	   K   sr   |}	|  |}| jd||||||d|\}
}|	|
 }|}	| |}| |}| jr/|\}}|	||	j }|S )N)r<   r   r   r   r  r   r:   )r  r  r  r  r
  rA   rB   )r7   r<   r   r   r   r  r   r   r   residualattention_states_r:   r:   r;   rI     s(   

	

zLlama4TextDecoderLayer.forward)NNNFNN)rJ   rK   rL   r(   r   r0   rM   r   r  r	   boolrq   r   r   FloatTensorrI   rN   r:   r:   r8   r;   r  u  s8    	
r  c                   @   s<   e Zd ZU eed< dZdgZdZdZdZ	dZ
dZdd ZdS )Llama4PreTrainedModelr$   Tr   Fc                 C   sJ  t | jdr
| jjn| jjj}t|tjr-|jjj	d|d |j
d ur+|j
j  d S d S t|tjrN|jjj	d|d |jd urL|jj|j   d S d S t|tjrc|jjd |j
j  d S t|trq|jjd d S t|tr|jjj	d|d |jjj	d|d d S t|tr|jjj	|jd |jjj	|jd d S d S )Ninitializer_ranger   )rc   stdr   )r  )r   r$   r  text_configr   r.   rS   ro   datanormal_rR   zero_	Embeddingpadding_idx	LayerNormfill_rl   r#   r2   r3   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r7   r   r  r:   r:   r;   _init_weights  s4   






z#Llama4PreTrainedModel._init_weightsN)rJ   rK   rL   r!   r   supports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr$  r:   r:   r:   r;   r    s   
 r  c                       s   e Zd ZU dgZdZeed< eee	dZ
def fddZeee							ddeej d	eej d
eej dee deej dee deej dee deeef fddZ  ZS )Llama4TextModelr  modelr$   )
attentionsr<   r|   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r:   )r  ).0r   r$   r:   r;   
<listcomp>  s    z,Llama4TextModel.__init__.<locals>.<listcomp>ri   r0  F)r'   r(   pad_token_idr  
vocab_sizer.   r  r,   embed_tokens
ModuleListrangenum_hidden_layerslayersrl   r   normr   
rotary_embgradient_checkpointing	post_initr6   r8   r0  r;   r(     s   zLlama4TextModel.__init__N	input_idsr   r   r   inputs_embedsr  r   r   r=   c              
   K   sF  |d u |d uA rt d|d u r| || jjj}|r'|d u r't| jd}|d u rC|d ur3| nd}	tj	|	|	|j
d  |jd}|d u rL|d}t| }
tsl| j|||||d}td
i |td
i |d}
|}| ||}| jd | jj D ]}||f|
|j |||||d|}q}| |}t||r|d	S d d	S )N:You must specify exactly one of input_ids or inputs_embedsr0  r   r    )r   )r$   input_embedsr   r   r   r   )full_attentionchunked_attention)r   r   r   r  r   r   )last_hidden_stater   r:   )
ValueErrorr4  rz   ro   r   r
   r$   get_seq_lengthr0   arangerB   	unsqueezer   dictr   r   r:  r8  r7  r  r9  r   )r7   r=  r   r   r   r>  r  r   r   past_seen_tokenscausal_mask_mappingmask_kwargsr<   freq_cisdecoder_layerr:   r:   r;   rI     s\   



zLlama4TextModel.forward)NNNNNNN)rJ   rK   rL   _no_split_modulesbase_model_prefixr"   r   r   r  r   _can_record_outputsr(   r   r   r   r   r0   r  rM   r	   r  r  r   r   r   rq   r   rI   rN   r:   r:   r8   r;   r,    sL   
 	

r,  c                       s   e Zd ZU dgZdZdgZddiZeed< def fddZ	e
e																	
ddeej deej deej deeeeej f  deej deej dee deej deeejf dee deeef fddZ  ZS )Llama4ForCausalLMr  language_modelzlm_head.weightlm_headcolwise_repr$   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rP   )
r'   r(   r,  r-  r3  r.   rS   r,   rS  r<  r6   r8   r:   r;   r(   =  s
   
zLlama4ForCausalLM.__init__Nr   r=  r   r   r   r>  labelsr  r   logits_to_keepr   r=   c
              
   K   s   | j d|||||||d|
}|d }t|	trt|	 dn|	}| |dd|ddf }d}|durC| jd||| jjd|
}t|||j	|j
|jdS )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r=  r   r   r   r>  r  r   r   N)logitsrU  r3  )lossrW  r   r<   r.  r:   )r-  r   intslicerS  loss_functionr$   r3  r   r   r<   r.  )r7   r=  r   r   r   r>  rU  r  r   rV  r   outputsr<   slice_indicesrW  rX  r:   r:   r;   rI   F  s0   %zLlama4ForCausalLM.forward)	NNNNNNNNr   )rJ   rK   rL   rN  rO  _tied_weights_keys_tp_planr"   r   r(   r   r   r   r0   r  rM   r   r	   listr  r  rY  r   r   rq   r   rI   rN   r:   r:   r8   r;   rQ  6  sR   
 		

rQ  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	Llama4CausalLMOutputWithPasta3  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    NrX  rW  r   r<   r.  image_hidden_states)rJ   rK   rL   r  rX  r   r0   r  r   rW  r   r	   r<   rq   r.  rc  r:   r:   r:   r;   rb    s   
 rb  c                       r   )Llama4VisionMLP2c                    s\   t    |j| _|j| _tj| j|jdd| _tj|j|jdd| _	t
 | _|j| _d S rP   )r'   r(   r,   r+   r.   rS   projector_input_dimfc1projector_output_dimfc2GELUrV   projector_dropoutr   r6   r8   r:   r;   r(     s   

zLlama4VisionMLP2.__init__c                 C   s8   |  |}| |}tj|| j| jd}| | |S )Nr   )rf  rV   Fr   r   rh  r7   r<   r:   r:   r;   rI     s   

zLlama4VisionMLP2.forwardrY   r:   r:   r8   r;   rd    s    	rd  c                       r   )Llama4MultiModalProjectorc                    s(   t    tj|jj|jjdd| _d S rP   )	r'   r(   r.   rS   vision_configvision_output_dimr  r,   linear_1r6   r8   r:   r;   r(     s   
z"Llama4MultiModalProjector.__init__c                 C   s   |  |}|S rW   )rp  )r7   image_featuresr<   r:   r:   r;   rI     s   
z!Llama4MultiModalProjector.forwardrY   r:   r:   r8   r;   rm    s    rm  c           
   	   C   s   | j \}}}tt|}| |||d} |  \}}}}| ||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }||d|j d }	|	S )Nr>   r   r&   r    r   )rB   rY  mathsqrtrA   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensorr:   r:   r;   pixel_shuffle  s    $r  c                       2   e Zd Z fddZdejdejfddZ  ZS )Llama4VisionPixelShuffleMLPc                    s>   t    |j| _t|j| jd  | _|j| _t|| _	d S r%   )
r'   r(   pixel_shuffle_ratiorY  re  	inner_dimrg  
output_dimrd  mlpr6   r8   r:   r;   r(     s
   
z$Llama4VisionPixelShuffleMLP.__init__encoded_patchesr=   c                 C   s   t || j}| |S rW   )r  r  r  )r7   r  r:   r:   r;   rI     s   
z#Llama4VisionPixelShuffleMLP.forwardrJ   rK   rL   r(   r0   rM   rI   rN   r:   r:   r8   r;   r        r  freqs_cic                    s(   |j   fddt|jD }| j| S )Nc                    s,   g | ]\}}|d ks| d  kr|nd qS )r    r:   )r/  idndimr:   r;   r1    s   , z)reshape_for_broadcast.<locals>.<listcomp>)r  	enumeraterB   rA   )r  r   rB   r:   r  r;   reshape_for_broadcast  s   
r  c                 C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }t||d}||j}t || 	d}t || 	d}|
| |
|fS )Nr>   r&   )r  r   r   )r0   r   rg   r   rB   r  rz   r   r   r   rh   )r   r   r  query_key_	query_outkey_outr:   r:   r;   vision_apply_rotary_emb  s   ,,r  c                       sx   e Zd Zdef fddZ		ddejdejdeej dee d	e	e
 d
eejeej eeej  f fddZ  ZS )Llama4VisionAttentionr$   c                    s   t    || _|j| _|j| _|j|j | _d| _|j	| _	| jd | _
tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j | jdd| _d S )Nr    r   TrQ   )r'   r(   r$   r,   	embed_dimr   	num_headsr   r   r   r   r.   rS   r   r   r   r   r6   r8   r:   r;   r(     s   
 zLlama4VisionAttention.__init__Nr<   r  r   r   r   r=   c                 K   s   |j d d }g |d| jR }| ||}| ||}	| ||}
t||	|d\}}	|dd}|	dd}	|
dd}
t}| j	j
dvrRt| j	j
 }|| ||	|
d f| js^dn| jd dd|\}}|jg |dR   }| |}||fS )	Nr>   )r  r    r&   )r   flex_attentionr   F)r   r   r   )rB   r   r   rA   r   r   r  r   r   r$   r   r   r   r   r   r   r   )r7   r<   r  r   r   r   r   r   r   r   r   r  r   r   r:   r:   r;   rI   
  s8   	

zLlama4VisionAttention.forwardr  )rJ   rK   rL   r   r(   r0   rM   r   r	   r   r   rq   rI   rN   r:   r:   r8   r;   r    s"    r  c                       r  )Llama4VisionMLPc                    sJ   t    || _t | _tj|j|jdd| _	tj|j|jdd| _
d S )NTrQ   )r'   r(   r$   r.   ri  rV   rS   r,   r+   rf  rh  r6   r8   r:   r;   r(   6  s
   

zLlama4VisionMLP.__init__r<   r=   c                 C   s"   |  |}| |}| |}|S rW   )rf  rV   rh  rl  r:   r:   r;   rI   =  s   


zLlama4VisionMLP.forwardr  r:   r:   r8   r;   r  5  r  r  c                
       sP   e Zd Zdef fddZ		ddejdejdeej dee fd	d
Z	  Z
S )Llama4VisionEncoderLayerr$   c                    sF   t    |j| _t|| _t|| _t|j| _	t|j| _
d S rW   )r'   r(   r,   r  r  r  r  r.   r  r  r  r6   r8   r:   r;   r(   E  s   


z!Llama4VisionEncoderLayer.__init__Nhidden_stater  r   output_attentionsc                 C   sb   |}|  |}| j|||d\}}|| }|}| |}| |}|| }|f}|r/||f7 }|S )N)r  r   )r  r  r  r  )r7   r  r  r   r  r  r   r\  r:   r:   r;   rI   O  s    




z Llama4VisionEncoderLayer.forwardr  )rJ   rK   rL   r   r(   r0   rM   r   r  rI   rN   r:   r:   r8   r;   r  D  s    r  c                       st   e Zd ZdZdef fddZ				ddejdejdeej d	ee	 d
ee	 dee	 de
eef fddZ  ZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r$   c                    s@   t     | _t fddt jD | _d| _ | _d S )Nc                    s   g | ]}t  qS r:   )r  )r/  r  r0  r:   r;   r1  |  s    z0Llama4VisionEncoder.__init__.<locals>.<listcomp>F)	r'   r(   r$   r.   r5  r6  r7  r8  r;  r6   r8   r0  r;   r(   y  s
   
 
zLlama4VisionEncoder.__init__Nr<   r  r   r  output_hidden_statesreturn_dictr=   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}| jD ]}	|r6||f }|	||||d}
|rG||
d f }|
d }q-|rS||f }|satdd |||fD S t|||dS )	ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr:   )r  r   r  r  r    r   c                 s       | ]	}|d ur|V  qd S rW   r:   r/  vr:   r:   r;   	<genexpr>      z.Llama4VisionEncoder.forward.<locals>.<genexpr>rC  r<   r.  )r$   r  r  use_return_dictr8  rq   r   )r7   r<   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputsr:   r:   r;   rI     s2   



zLlama4VisionEncoder.forwardNNNN)rJ   rK   rL   r  r   r(   r0   rM   r   r  r   rq   r   rI   rN   r:   r:   r8   r;   r  p  s,    
r  c                       r  )Llama4UnfoldConvolutionc                    s`   t    |j}t|tr||f}tjj||jd| _tj	|j
|d  |d  |jdd| _d S )N)kernel_sizestrider   r    FrQ   )r'   r(   r{  r   rY  r0   r.   UnfoldunfoldrS   num_channelsr,   linear)r7   r$   r  r8   r:   r;   r(     s   

z Llama4UnfoldConvolution.__init__r<   r=   c                 C   s&   |  |}|ddd}| |}|S )Nr   r&   r    )r  ru  r  rl  r:   r:   r;   rI     s   

zLlama4UnfoldConvolution.forwardr  r:   r:   r8   r;   r    s    r  c                       r   )Llama4VisionRotaryEmbeddingc                    sd  t    |j|j }tj|d tjd|d d}tj||d d gdd}d|d< || }|| }|j	|j
 d }d|jtd|dd |d   |   }|d d	 |d d d d f  jdd
d}|d d	 |d d d d f  jdd
d}	tj||	gd
d  dd d df }
|
|d
dddk d}
ttjt|
t|
gd
d}|| _d S )Nr&   )r{   r    r   r?   r   )r>   r>   r   ).Nr>   .)r'   r(   
image_sizer{  r0   rF  int32r   catr,   r   
rope_thetarg   repeat_interleaver   masked_fillr   stackcossinr  )r7   r$   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   rL  r8   r:   r;   r(     s   
 ,((($
z$Llama4VisionRotaryEmbedding.__init__c                 C   s   | j |jS rW   )r  rz   r   rl  r:   r:   r;   rI     s   z#Llama4VisionRotaryEmbedding.forwardrY   r:   r:   r8   r;   r    s    r  c                       s   e Zd ZU dZdgZeed< def fddZdd Z				dd	e	j
d
ee	j
 dee dee dee deeee	j
df f fddZ  ZS )r   vision_modelr  r$   c                    s   t  | |j| _|j| _|j| _|j| _| j| j d d | _|jd | _t|| _	t
| jt| j | _t
| jt| j| j | _t|| _t
| j| _t
| j| _t|| _t|| _|   d S )Nr&   r    r   )r'   r(   r  r{  r,   r  ry  r"  r  patch_embeddingr.   r/   r0   randnr!  r#  r  rotary_embeddingr  layernorm_prelayernorm_postr  r-  r  vision_adapterr<  r6   r8   r:   r;   r(     s    



zLlama4VisionModel.__init__c                 C   s   | j S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  rj   r:   r:   r;   get_input_embeddings
  s   z&Llama4VisionModel.get_input_embeddingsNpixel_valuesr   r  r  r  r=   .c                 C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|j\}}}}	d}
d}| |}|j\}}}|||
 | ||}| j|jd d|jd }t	j
||gdd}|d7 }|||
 |||}| jj|j|jd}|| }| |}||d|}| |}| j|d|||d}|j}| |}|ddddddf }| |}|r|jnd}|r|d }nd}|std	d
 |||fD S t|||dS )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr    r   r>   r?   r{   r   )r   r  r  r  r&   c                 s   r  rW   r:   r  r:   r:   r;   r  i  r  z,Llama4VisionModel.forward.<locals>.<genexpr>r  )r$   r  r  r  rB   r  r   r!  r   r0   r  r#  rz   r{   r   r  rA   r  r-  rC  r  r  r<   rq   r   )r7   r  r   r  r  r  batch_size_times_num_tilesr  r|  r}  num_concurrent_media
num_chunksr  r  ry  r   r!  positional_embeddingr  rp   r<   r.  r:   r:   r;   rI     sZ   





zLlama4VisionModel.forwardr  )rJ   rK   rL   rO  rN  r   r   r(   r  r0   rM   r   r  r   r   rq   rI   rN   r:   r:   r8   r;   r     s.   
 	r   c                '       s  e Zd ZU ddgZi ZdZeed< def fddZdd Z	d	d
 Z
dd Zdd Zdd Zdd ZdejdefddZdejdejdejfddZeeddd															 d1deej deej d!eej d"eej d#ee deej deeeee f  dee d$eej d%ee d&ee d'ee d(ee d)eej d*eeejf d+ee d,ee e!f f"d-d.Z"						d2d/d0Z#  Z$S )3Llama4ForConditionalGenerationr  r   r$   c                    s^   t  | t|j| _t|| _t|j| _	|jj
| _
| jjd ur&| jjnd| _|   d S )Nr>   )r'   r(   r   rn  r  rm  multi_modal_projectorrQ  r  rR  r3  r$   r2  r<  r6   r8   r:   r;   r(   x  s   

z'Llama4ForConditionalGeneration.__init__c                 C   
   | j  S rW   )rR  r  rj   r:   r:   r;   r       
z3Llama4ForConditionalGeneration.get_input_embeddingsc                 C      | j | d S rW   )rR  set_input_embeddings)r7   r   r:   r:   r;   r       z3Llama4ForConditionalGeneration.set_input_embeddingsc                 C   r  rW   )rR  get_output_embeddingsrj   r:   r:   r;   r    r  z4Llama4ForConditionalGeneration.get_output_embeddingsc                 C   r  rW   )rR  set_output_embeddings)r7   new_embeddingsr:   r:   r;   r    r  z4Llama4ForConditionalGeneration.set_output_embeddingsc                 C   r  rW   )rR  set_decoder)r7   decoderr:   r:   r;   r    r  z*Llama4ForConditionalGeneration.set_decoderc                 C   r  rW   )rR  get_decoderrj   r:   r:   r;   r    r  z*Llama4ForConditionalGeneration.get_decoderr  vision_feature_select_strategyc                 K   sJ   |dvrt d| j dd | D }| j|fddi|}|j}|S )aj  
        Obtains image last hidden states from the vision tower and apply al projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r   fullz$Unexpected select feature strategy: c                 S   s   i | ]\}}|d ur||qS rW   r:   )r/  kr  r:   r:   r;   
<dictcomp>  s    zELlama4ForConditionalGeneration.get_image_features.<locals>.<dictcomp>r  F)rD  r  itemsr  rC  )r7   r  r  r   image_outputsr  r:   r:   r;   get_image_features  s   z1Llama4ForConditionalGeneration.get_image_featuresr=  r>  rq  c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}||  | krItd| d|jd  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr  r>   z6Image features and image tokens do not match: tokens: z, features r   )r  r0   tensorr$   image_token_idlongr   allr   rG  	expand_asrz   numelrD  rB   )r7   r=  r>  rq  special_image_maskn_image_tokensr:   r:   r;   get_placeholder_mask  s   z3Llama4ForConditionalGeneration.get_placeholder_maskvision_feature_layerr   )r   Nr   r   r   r   rU  r  r  r  r  r   rV  r   r=   c                 K   sd  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j jj}|du |duA r5td|durA|durAtd|du rK|  |}|durx| j||d}|	d|
d}| ||j|j}| j|||d}|||}| jd|||||
|||||d
|}|d }d}|	dur|dur|dd|jd	 d	  df |j}|d
ddddf ||jdk  }|	d
d	df ||	jdk  }n|d
ddddf  }|	d
d	df  }t }||	d|
d|	d|j}|s|f|d	d  }|dur|f| S |S t|||j|j|j|dur.|dS ddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```Nr?  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)r  r  r>   )r>  rq  )
r   r   r   r>  r  r  r  r  r   rV  r   r    .)rX  rW  r   r<   r.  rc  r:   )r$   r  r  r  rn  r  rD  r  r  rA   rt  r  rz   r   r{   r  masked_scatterrR  rB   r   r.   CrossEntropyLossrb  r   r<   r.  )r7   r=  r  r   r   r   r>  r  r  rU  r  r  r  r  r   rV  r   rq  vision_flatprojected_vision_flatr  r\  rW  rX  shift_attention_maskshift_logitsshift_labelsloss_fctrp   r:   r:   r;   rI     s   1

(*& z&Llama4ForConditionalGeneration.forwardc           
      K   s8   | j j|f|||||d|}	|d dkr||	d< |	S )N)r   r>  r   r   rV  r   r  )rR  prepare_inputs_for_generation)
r7   r=  r   r>  r  r   r   rV  r   model_inputsr:   r:   r;   r  J  s   
z<Llama4ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNNNNNr   )NNNNNN)%rJ   rK   rL   rN  r_  rO  r!   r   r(   r  r  r  r  r  r  r0   r  r   r  r  r  r   r   r   rM   r	   r   rY  r`  r  r   r   rq   rb  rI   r  rN   r:   r:   r8   r;   r  r  s   
 


	

 r  )r  r,  r   rQ  r  )r   )arr  dataclassesr   typingr   r   r   r0   torch.nnr.   torch.nn.functionalrx   rk  /transformers.models.llama4.configuration_llama4r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   configuration_llama4r!   r"   
get_loggerrJ   loggerModuler#   rO   rZ   rl   rS   rr   r   r   rM   rq   r   rY  r   rg   r   r   r   r  r  r,  rQ  rb  rd  rm  r  r  r  r  r  r  r  r  r  r  r   r  __all__r:   r:   r:   r;   <module>   s   
!!

"
^6&cQ
;,R  x