o
    ei                     @   s2  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dl
m	  mZ d dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z= e4>e?Z@G dd de	jAZBG dd de	jAZCG dd dej	jAZDG dd de	jAZEG d d! d!e	jFZGed"G d#d" d"e	jAZHG d$d% d%e	jAZId&ejJd'ejJd(ejJd)eKejJejJf fd*d+ZLd,ejJd-eMd)ejJfd.d/ZN	0dhd1e	jAd2ejJd3ejJd4ejJd5ejJdB d6eOd7eOfd8d9ZP	0dhd1e	jAd2ejJd3ejJd4ejJd5ejJdB d6eOd7eOfd:d;ZQG d<d= d=e	jAZRG d>d? d?e!ZSe2G d@dA dAe-ZTe2G dBdC dCeTZUG dDdE dEeTeZVee2dFdGG dHdI dIe'ZWG dJdK dKej	jAZXG dLdM dMe	jAZYdNdO ZZG dPdQ dQe	jAZ[dRejJd2ejJfdSdTZ\d2ejJd3ejJdRejJd)eKejJejJf fdUdVZ]G dWdX dXe	jAZ^G dYdZ dZe	jAZ_G d[d\ d\e!Z`G d]d^ d^e	jAZaG d_d` d`e	jAZbG dadb dbe	jAZcG dcdd ddeTZdG dedf dfeTeZeg dgZfdS )i    N)Callable)	dataclass)Optional)Llama4VisionConfig   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )Llama4ConfigLlama4TextConfigc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Llama4TextExpertsconfigc                    sx   t    |j| _|j| _|j| _| j| _tt	
| j| jd| j | _tt	| j| j| jf| _t|j | _d S N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchzerosgate_up_projempty	down_projr   
hidden_actact_fnselfr'   	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/llama4/modeling_llama4.pyr+   9   s   
 zLlama4TextExperts.__init__hidden_statesreturnc                 C   sb   | | jjd d| j}t|| j}|jddd\}}t|| | | j}| d| j}|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r   r)   dim)	viewr5   shaper/   r3   bmmchunkr9   r7   )r;   r@   gate_upgateupnext_statesr>   r>   r?   forwardC   s   zLlama4TextExperts.forward)	__name__
__module____qualname__r%   r+   r3   TensorrM   __classcell__r>   r>   r<   r?   r&   8   s    
r&   c                       s&   e Zd Zd fdd	Zdd Z  ZS )Llama4TextMLPNc                    sj   t    |d u r|j}|| _tj|j|dd| _tj|j|dd| _tj||jdd| _	t
|j | _d S NFbias)r*   r+   r.   r'   r1   Linearr/   	gate_projup_projr7   r   r8   activation_fn)r;   r'   r.   r<   r>   r?   r+   Z   s   
zLlama4TextMLP.__init__c                 C   s$   |  | || | }| |S N)rZ   rX   rY   r7   )r;   xr7   r>   r>   r?   rM   f   s   
zLlama4TextMLP.forwardr[   rN   rO   rP   r+   rM   rR   r>   r>   r<   r?   rS   Y   s    rS   c                       s<   e Zd Zddef fddZdd Zdd Zd	d
 Z  ZS )Llama4TextL2Normư>epsc                    s   t    || _d S r[   )r*   r+   r`   )r;   r`   r<   r>   r?   r+   l   s   

zLlama4TextL2Norm.__init__c                 C   $   |t |djddd| j  S Nr)   rB   T)keepdimr3   rsqrtpowmeanr`   r;   r\   r>   r>   r?   _normp      $zLlama4TextL2Norm._normc                 C   s   |  | |S r[   )ri   floattype_asrh   r>   r>   r?   rM   s   s   zLlama4TextL2Norm.forwardc                 C   s   d| j  S )Nzeps=r`   r;   r>   r>   r?   
extra_reprv   s   zLlama4TextL2Norm.extra_repr)r_   )	rN   rO   rP   rk   r+   ri   rM   ro   rR   r>   r>   r<   r?   r^   k   s
    r^   c                       s6   e Zd Zd
 fdd	Zdd Zdd Zdd	 Z  ZS )Llama4TextRMSNormh㈵>c                    s&   t    || _tt|| _dS )z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r*   r+   r`   r1   r2   r3   onesweight)r;   r/   r`   r<   r>   r?   r+   {   s   
zLlama4TextRMSNorm.__init__c                 C   ra   rb   rd   rh   r>   r>   r?   ri      rj   zLlama4TextRMSNorm._normc                 C   s   |  | |}|| j S r[   )ri   rk   rl   rs   )r;   r\   outputr>   r>   r?   rM      s   
zLlama4TextRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplers   rF   r`   rn   r>   r>   r?   ro      s   zLlama4TextRMSNorm.extra_repr)rq   )rN   rO   rP   r+   ri   rM   ro   rR   r>   r>   r<   r?   rp   z   s
    rp   c                       s(   e Zd Z fddZ fddZ  ZS )Llama4Routerc                    s*   t  j|j|jdd |j| _|j| _d S rT   )r*   r+   r/   r,   r-   num_experts_per_toktop_kr:   r<   r>   r?   r+      s   zLlama4Router.__init__c                    s^   t  |}tj|| jdd\}}t|tdd||}tjj	
| |j}||fS )Nr#   rC   z-inf)r*   rM   r3   topkrx   	full_likerk   scatter_r1   
functionalsigmoidtodtype)r;   r@   router_logitsrouter_top_valuerouter_indicesrouter_scoresr<   r>   r?   rM      s
   zLlama4Router.forwardr]   r>   r>   r<   r?   rv      s    rv   Llama4TextMoec                       $   e Zd Z fddZdd Z  ZS )r   c                    sD   t    |j| _|j| _|j| _t|| _	t
|| _t|| _d S r[   )r*   r+   rw   rx   r/   
hidden_dimr,   r-   r&   expertsrv   routerrS   shared_expertr:   r<   r>   r?   r+      s   


zLlama4TextMoe.__init__c                 C   s   | d| j}| |\}}||jd d}||dd dd }| |}| |}|| |jd d|jd j	dd ||fS )NrB   r#   r   rC   )
reshaper   r   repeatrF   	transposer   r   add_sum)r;   r@   r   r   	routed_in
routed_outoutr>   r>   r?   rM      s   

(zLlama4TextMoe.forwardr]   r>   r>   r<   r?   r      s    	c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Llama4TextRotaryEmbeddinginv_freqNr'   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r*   r+   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr'   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r;   r'   devicerope_init_fnr   r<   r>   r?   r+      s   


z"Llama4TextRotaryEmbedding.__init__r   ztorch.deviceseq_lenrA   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimN      ?r   r)   r   )r   r   )	r   getattrr/   num_attention_headsr3   arangeint64r~   rk   )r'   r   r   baserD   attention_factorr   r>   r>   r?   r      s   
&z9Llama4TextRotaryEmbedding.compute_default_rope_parametersc                 C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}t|dd# |	|j| 
dd}tt||}|| j }W d    |S 1 s`w   Y  |S )	Nr   rB   r#   mpscpuF)device_typeenabledr)   )r   rk   expandrF   
isinstancer   typestrr    r~   r   r3   polar	ones_liker   )r;   r\   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_cisr>   r>   r?   rM      s   (&
z!Llama4TextRotaryEmbedding.forwardr[   )NNN)rN   rO   rP   r3   rQ   __annotations__r%   r+   staticmethodr   intru   rk   r   no_gradr   rM   rR   r>   r>   r<   r?   r      s&   
 

r   xqxkr   rA   c              	   C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }t ||d d d d d d d f  d}t ||d d d d d d d f  d}|| ||fS )NrB   r)   r   )r3   view_as_complexrk   r   rF   view_as_realflattenrl   )r   r   r   xq_xk_xq_outxk_outr>   r>   r?   apply_rotary_emb   s
   ,,,,r   r@   n_repc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r#   N)rF   r   r   )r@   r   batchnum_key_value_headsslenr   r>   r>   r?   	repeat_kv  s
   0r           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dd}
tjj|
|| j	d}
t|
|	}|dd
 }||
fS )Nr)   r   rB   rC   ptrainingr#   )r   num_key_value_groupsr3   matmulr   r1   r|   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputr>   r>   r?   eager_attention_forward  s   
r   c                 K   s   t || j}t || j}	t||dd| jd  }
|d ur#|
| }
tjj|
dd}
tjj	|
|| j
d}
t|
|	}|dd }||
fS )Nr)   r         rB   rC   r   r#   )r   r   r3   r   r   r   r1   r|   r   r   r   r   r   r>   r>   r?   vision_eager_attention_forward(  s   
r   c                       s   e Zd ZdZdef fddZ		ddejdeejejf dejdB d	e	dB d
ej
dB dee deejejdB eej dB f fddZ  ZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr'   c                    s&  t    || _|| _t|d|j|j | _|j| _|j|j | _	|j| _| jd | _
|j| _|j| _|j| _|j| _d| _|j| | _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jjr| jrt|j| _d S d S d S )Nr   r   TrU   )r*   r+   r'   	layer_idxr   r/   r   r   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper1   rW   attention_biasq_projk_projv_projo_projuse_qk_normr^   rms_norm_epsqk_normr;   r'   r   r<   r>   r?   r+   D  s:   
zLlama4TextAttention.__init__Nr@   position_embeddingsr   past_key_valuescache_positionr   rA   c                 K   s  |j d d }g |d| jR }| ||}	| |jg |d| jR  }
| ||dd}| jrDt|	|
|	|	j
\}	}
t| drS| |	}	| |
}
| jr| jstt| d | j | j d }|d|d ddfg |ddR }|	| 	|	j}	|	dd}	|
dd}
|d urd|i}||
|| j|\}
}t| jjt}|| |	|
||f| jsdn| j| jd|\}}|j g |dR  ! }| "|}||fS )	NrB   r#   r)   r   r   r   r   )r   r   )#rF   r   r   rE   r   r   r   r   r   r~   r   hasattrr   r   r3   log1pfloorrk   r   r   r   r   updater   r   get_interfacer'   _attn_implementationr   r   r   r   r   r   r   )r;   r@   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   r>   r>   r?   rM   b  sP   	 


&*

zLlama4TextAttention.forwardNN)rN   rO   rP   __doc__r%   r+   r3   rQ   ru   r	   
LongTensorr   r   rM   rR   r>   r>   r<   r?   r   A  s(    #r   c                       s   e Zd Z fddZ						ddejdejdB dejdB dedB d	edB d
ejdB de	ejejf dB de
e de	eje	ejejf dB f fddZ  ZS )Llama4TextDecoderLayerc                    s   t    |j| _|| _|j| | _t||| _||jv | _	| j	r't
|| _nt||jd| _t|j|jd| _t|j|jd| _d S )N)r.   rm   )r*   r+   r/   r   layer_typesattention_typer   	self_attn
moe_layersis_moe_layerr   feed_forwardrS   intermediate_size_mlprp   r   input_layernormpost_attention_layernormr   r<   r>   r?   r+     s   
zLlama4TextDecoderLayer.__init__NFr@   r   r   r   	use_cacher   r   r   rA   c              	   K   sr   |}	|  |}| jd||||||d|\}
}|	|
 }|}	| |}| |}| jr/|\}}|	||	j }|S )N)r@   r   r   r   r  r   r>   )r  r  r  r  r  rE   rF   )r;   r@   r   r   r   r  r   r   r   residualattention_states_r>   r>   r?   rM     s(   

	

zLlama4TextDecoderLayer.forward)NNNFNN)rN   rO   rP   r+   r3   rQ   r
  r	   boolru   r   r   FloatTensorrM   rR   r>   r>   r<   r?   r    s6    	
r  c                       sP   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdZe  fddZ  ZS )Llama4PreTrainedModelr'   )imagetextTr   Fc                    s   t  | t| jdr| jjn| jjj}t|tr.tj	|j
d|d tj	|jd|d d S t|trGtj	|j|jd tj	|j|jd d S d S )Ninitializer_ranger   )rg   std)r  )r*   _init_weightsr   r'   r  text_configr   r&   initnormal_r5   r7   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r;   r   r  r<   r>   r?   r     s   



z#Llama4PreTrainedModel._init_weights)rN   rO   rP   r$   r   input_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr3   r   r   rR   r>   r>   r<   r?   r    s   
 r  c                       s   e Zd ZU dgZdZdZeed< ee	e
dZdef fddZeeee							dd	ejdB d
ejdB dejdB dedB dejdB dedB dejdB dee deeB fddZ  ZS )Llama4TextModelr  model)r  r'   )
attentionsr@   r   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r>   )r  ).0r   r'   r>   r?   
<listcomp>  s    z,Llama4TextModel.__init__.<locals>.<listcomp>rm   r4  F)r*   r+   pad_token_idpadding_idx
vocab_sizer1   	Embeddingr/   embed_tokens
ModuleListrangenum_hidden_layerslayersrp   r   normr   
rotary_embgradient_checkpointing	post_initr:   r<   r4  r?   r+     s   zLlama4TextModel.__init__N	input_idsr   r   r   inputs_embedsr  r   r   rA   c              
   K   sF  |d u |d uA rt d|d u r| || jjj}|r'|d u r't| jd}|d u rC|d ur3| nd}	tj	|	|	|j
d  |jd}|d u rL|d}t| }
tsl| j|||||d}td
i |td
i |d}
|}| ||}| jd | jj D ]}||f|
|j |||||d|}q}| |}t||r|d	S d d	S )N:You must specify exactly one of input_ids or inputs_embedsr4  r   r#   )r   )r'   rD  r   r   r   r   )full_attentionchunked_attention)r   r   r   r  r   r   )last_hidden_stater   r>   )
ValueErrorr:  r~   rs   r   r
   r'   get_seq_lengthr3   r   rF   	unsqueezer   dictr   r   r@  r>  r=  r  r?  r   )r;   rC  r   r   r   rD  r  r   r   past_seen_tokenscausal_mask_mappingmask_kwargsr@   freq_cisdecoder_layerr>   r>   r?   rM     s\   



zLlama4TextModel.forward)NNNNNNN)rN   rO   rP   _no_split_modulesbase_model_prefixr(  r%   r   r   r  r   _can_record_outputsr+   r   r!   r"   r   r3   r
  rQ   r	   r  r  r   r   ru   r   rM   rR   r>   r>   r<   r?   r0    sP   
 	
r0  c                       s   e Zd ZU dgZdZddiZddiZeed< def fdd	Z	e
e	
	
	
	
	
	
	
	
	ddejd
B dejd
B dejd
B ded
B dejd
B dejd
B ded
B dejd
B deejB dee deeB fddZ  ZS )Llama4ForCausalLMr  language_modelzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr'   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rT   )
r*   r+   r0  r1  r8  r1   rW   r/   rW  rB  r:   r<   r>   r?   r+   ]  s
   
zLlama4ForCausalLM.__init__Nr   rC  r   r   r   rD  labelsr  r   logits_to_keepr   rA   c
              
   K   s   | j d|||||||d|
}|d }t|	trt|	 dn|	}| |dd|ddf }d}|durC| jd||| jjd|
}t|||j	|j
|jdS )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rC  r   r   r   rD  r  r   r   N)logitsrY  r8  )lossr[  r   r@   r2  r>   )r1  r   r   slicerW  loss_functionr'   r8  r   r   r@   r2  )r;   rC  r   r   r   rD  rY  r  r   rZ  r   outputsr@   slice_indicesr[  r\  r>   r>   r?   rM   f  s0   %zLlama4ForCausalLM.forward)	NNNNNNNNr   )rN   rO   rP   rR  rS  _tied_weights_keys_tp_planr%   r   r+   r   r   r3   r
  rQ   r	   r  r  r   r   r   ru   r   rM   rR   r>   r>   r<   r?   rU  V  sR   
 		
rU  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dS )	Llama4CausalLMOutputWithPasta3  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr\  r[  r   r@   r2  image_hidden_states)rN   rO   rP   r	  r\  r3   r  r   r[  r   r	   r@   ru   r2  rf  r>   r>   r>   r?   re    s   
 re  c                       r   )Llama4VisionMLP2c                    s\   t    |j| _|j| _tj| j|jdd| _tj|j|jdd| _	t
 | _|j| _d S rT   )r*   r+   r/   r.   r1   rW   projector_input_dimfc1projector_output_dimfc2GELUrZ   projector_dropoutr   r:   r<   r>   r?   r+     s   

zLlama4VisionMLP2.__init__c                 C   s8   |  |}| |}tj|| j| jd}| | |S )Nr   )ri  rZ   Fr   r   rk  r;   r@   r>   r>   r?   rM     s   

zLlama4VisionMLP2.forwardr]   r>   r>   r<   r?   rg    s    	rg  c                       r   )Llama4MultiModalProjectorc                    s(   t    tj|jj|jjdd| _d S rT   )	r*   r+   r1   rW   vision_configvision_output_dimr!  r/   linear_1r:   r<   r>   r?   r+     s   
z"Llama4MultiModalProjector.__init__c                 C   s   |  |}|S r[   )rs  )r;   image_featuresr@   r>   r>   r?   rM     s   
z!Llama4MultiModalProjector.forwardr]   r>   r>   r<   r?   rp    s    rp  c           
   	   C   s   | j \}}}tt|}| |||d} |  \}}}}| ||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }||d|j d }	|	S )NrB   r   r)   r#   r   )rF   r   mathsqrtrE   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensorr>   r>   r?   pixel_shuffle  s    $r  c                       2   e Zd Z fddZdejdejfddZ  ZS )Llama4VisionPixelShuffleMLPc                    s>   t    |j| _t|j| jd  | _|j| _t|| _	d S r(   )
r*   r+   pixel_shuffle_ratior   rh  	inner_dimrj  
output_dimrg  mlpr:   r<   r>   r?   r+     s
   
z$Llama4VisionPixelShuffleMLP.__init__encoded_patchesrA   c                 C   s   t || j}| |S r[   )r  r  r  )r;   r  r>   r>   r?   rM      s   
z#Llama4VisionPixelShuffleMLP.forwardrN   rO   rP   r+   r3   rQ   rM   rR   r>   r>   r<   r?   r        r  freqs_cic                    s(   |j   fddt|jD }| j| S )Nc                    s,   g | ]\}}|d ks| d  kr|nd qS )r#   r>   )r3  idndimr>   r?   r5    s   , z)reshape_for_broadcast.<locals>.<listcomp>)r  	enumeraterF   rE   )r  r   rF   r>   r  r?   reshape_for_broadcast  s   
r  c                 C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }t||d}||j}t || 	d}t || 	d}|
| |
|fS )NrB   r)   )r  r   r   )r3   r   rk   r   rF   r  r~   r   r   r   rl   )r   r   r  query_key_	query_outkey_outr>   r>   r?   vision_apply_rotary_emb  s   ,,r  c                       sx   e Zd Zdef fddZ		ddejdejdejdB dedB d	ee	 d
e
ejejdB e
ej dB f fddZ  ZS )Llama4VisionAttentionr'   c                    s   t    || _|j| _|j| _|j|j | _d| _|j	| _	| jd | _
tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j | jdd| _d S )Nr#   r   TrU   )r*   r+   r'   r/   	embed_dimr   	num_headsr   r   r   r   r1   rW   r   r   r   r   r:   r<   r>   r?   r+     s   
 zLlama4VisionAttention.__init__Nr@   r  r   r   r   rA   c                 K   s   |j d d }g |d| jR }| ||}| ||}	| ||}
t||	|d\}}	|dd}|	dd}	|
dd}
t	| j
jt}|| ||	|
d f| jsXdn| jd dd|\}}|jg |dR   }| |}||fS )NrB   )r  r#   r)   r   F)r   r   r   )rF   r   r   rE   r   r   r  r   r   r   r'   r  r   r   r   r   r   r   )r;   r@   r  r   r   r   r  r  r  r   r   r  r   r   r>   r>   r?   rM   *  s8   	

zLlama4VisionAttention.forwardr  )rN   rO   rP   r   r+   r3   rQ   r	   r   r   ru   rM   rR   r>   r>   r<   r?   r    s"    r  c                       r  )Llama4VisionMLPc                    sJ   t    || _t | _tj|j|jdd| _	tj|j|jdd| _
d S )NTrU   )r*   r+   r'   r1   rl  rZ   rW   r/   r.   ri  rk  r:   r<   r>   r?   r+   U  s
   

zLlama4VisionMLP.__init__r@   rA   c                 C   s"   |  |}| |}| |}|S r[   )ri  rZ   rk  ro  r>   r>   r?   rM   \  s   


zLlama4VisionMLP.forwardr  r>   r>   r<   r?   r  T  r  r  c                
       sP   e Zd Zdef fddZ		ddejdejdejdB dedB fd	d
Z  Z	S )Llama4VisionEncoderLayerr'   c                    sF   t    |j| _t|| _t|| _t|j| _	t|j| _
d S r[   )r*   r+   r/   r  r  r  r  r1   	LayerNormr  r  r:   r<   r>   r?   r+   d  s   


z!Llama4VisionEncoderLayer.__init__Nhidden_stater  r   output_attentionsc                 C   sb   |}|  |}| j|||d\}}|| }|}| |}| |}|| }|f}|r/||f7 }|S )N)r  r   )r  r  r  r  )r;   r  r  r   r  r  r   r_  r>   r>   r?   rM   n  s    




z Llama4VisionEncoderLayer.forwardr  )
rN   rO   rP   r   r+   r3   rQ   r  rM   rR   r>   r>   r<   r?   r  c  s    r  c                       sp   e Zd ZdZdef fddZ				ddejdejdejdB d	edB d
edB dedB de	e
B fddZ  ZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r'   c                    s@   t     | _t fddt jD | _d| _ | _d S )Nc                    s   g | ]}t  qS r>   )r  )r3  r  r4  r>   r?   r5    s    z0Llama4VisionEncoder.__init__.<locals>.<listcomp>F)	r*   r+   r'   r1   r;  r<  r=  r>  rA  r:   r<   r4  r?   r+     s
   
 
zLlama4VisionEncoder.__init__Nr@   r  r   r  output_hidden_statesreturn_dictrA   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}| jD ]}	|r6||f }|	||||d}
|rG||
d f }|
d }q-|rS||f }|satdd |||fD S t|||dS )	ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr>   )r  r   r  r  r#   r   c                 s       | ]	}|d ur|V  qd S r[   r>   r3  vr>   r>   r?   	<genexpr>      z.Llama4VisionEncoder.forward.<locals>.<genexpr>rH  r@   r2  )r'   r  r  use_return_dictr>  ru   r   )r;   r@   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputsr>   r>   r?   rM     s2   



zLlama4VisionEncoder.forwardNNNN)rN   rO   rP   r	  r   r+   r3   rQ   r  ru   r   rM   rR   r>   r>   r<   r?   r    s,    r  c                       r  )Llama4UnfoldConvolutionc                    s`   t    |j}t|tr||f}tjj||jd| _tj	|j
|d  |d  |jdd| _d S )N)kernel_sizestrider   r#   FrU   )r*   r+   r~  r   r   r3   r1   UnfoldunfoldrW   num_channelsr/   linear)r;   r'   r  r<   r>   r?   r+     s   

z Llama4UnfoldConvolution.__init__r@   rA   c                 C   s&   |  |}|ddd}| |}|S )Nr   r)   r#   )r  rx  r  ro  r>   r>   r?   rM     s   

zLlama4UnfoldConvolution.forwardr  r>   r>   r<   r?   r    s    r  c                       s*   e Zd Zdef fddZdd Z  ZS )Llama4VisionRotaryEmbeddingr'   c                    sh  t    |j|j }tj|d tjd|d d}tj||d d gdd}d|d< || }|| }|j	|j
 d }d|jd	 td|dd |d   |   }|d d
 |d d d d f  jddd}|d d
 |d d d d f  jddd}	tj||	gdd  dd d df }
|
|ddddk d}
ttjt|
t|
gdd}|| _d S )Nr)   r   r#   r   rC   )rB   rB   r   r   ).NrB   .)r*   r+   
image_sizer~  r3   r   int32r   catr/   r   r   rk   repeat_interleaver   masked_fillr   stackcossinr  )r;   r'   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   rP  r<   r>   r?   r+     s&   
  ((($
z$Llama4VisionRotaryEmbedding.__init__c                 C   s   | j |jS r[   )r  r~   r   ro  r>   r>   r?   rM   
  s   z#Llama4VisionRotaryEmbedding.forward)rN   rO   rP   r   r+   rM   rR   r>   r>   r<   r?   r    s    r  c                       s   e Zd ZU dZdZdgZeed< def fddZdd Z									dd
e
jde
jd	B ded	B ded	B ded	B deee
jdf B fddZ  ZS )r$  vision_model)r  r  r'   c                    s   t  | |j| _|j| _|j| _|j| _| j| j d d | _|jd | _t|| _	t
| jt| j | _t
| jt| j| j | _t|| _t
| j| _t
| j| _t|| _t|| _|   d S )Nr)   r#   r   )r*   r+   r  r~  r/   r  r|  r&  r  patch_embeddingr1   r2   r3   randnr%  r'  r  rotary_embeddingr  layernorm_prelayernorm_postr  r1  r  vision_adapterrB  r:   r<   r>   r?   r+     s    



zLlama4VisionModel.__init__c                 C   s   | j S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  rn   r>   r>   r?   get_input_embeddings-  s   z&Llama4VisionModel.get_input_embeddingsNpixel_valuesr   r  r  r  rA   .c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|j\}}}	}
d}d}| |}|j\}}}||| | ||}| j|jd d|jd }t	j
||gdd}|d7 }||| |||}| jj|j|jd}|| }| |}||d|}| |}| j|d|||d}|j}| |}|ddddddf }| |}|r|jnd}|r|d }nd}|std	d
 |||fD S t|||dS )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr#   r   rB   rC   r   r   )r   r  r  r  r)   c                 s   r  r[   r>   r  r>   r>   r?   r    r  z,Llama4VisionModel.forward.<locals>.<genexpr>r  )r'   r  r  r  rF   r  r   r%  r   r3   r  r'  r~   r   r   r  rE   r  r1  rH  r  r  r@   ru   r   )r;   r  r   r  r  r  r   batch_size_times_num_tilesr  r  r  num_concurrent_media
num_chunksr  r  r|  r   r%  positional_embeddingr  rt   r@   r2  r>   r>   r?   rM   3  sZ   "





zLlama4VisionModel.forwardr  )rN   rO   rP   rS  r(  rR  r   r   r+   r  r3   rQ   r  r   ru   rM   rR   r>   r>   r<   r?   r$    s0   
 	r$  c                %       s  e Zd ZU ddgZi ZdZeed< def fddZdd Z	d	d
 Z
dd Zdd Zdd Zdd Zeeddedddejdedee deeB fddZdejdejdejfd d!Zeedde	"	"	"	"	"	"	"	"	"	"	"	"	"	#d2dejd"B dejd"B d$ejd"B d%ejd"B d&ed"B dejd"B ded"B d'ejd"B d(ed"B d)ed"B d*ed"B d+ed"B d,ejd"B d-eejB dee dee B f d.d/Z!	"	"	"	"	"	"	d3d0d1Z"  Z#S )4Llama4ForConditionalGenerationr  r  r1  r'   c                    sl   t  | t|j| _t|| _t|j| _	|jj
| _
t| jdr(| jj| _n| jjjp.d| _|   d S )Nr6  rB   )r*   r+   r$  rq  r  rp  multi_modal_projectorrU  r!  rV  r8  r   r'   r6  rB  r:   r<   r>   r?   r+     s   

z'Llama4ForConditionalGeneration.__init__c                 C   
   | j  S r[   )rV  r  rn   r>   r>   r?   r       
z3Llama4ForConditionalGeneration.get_input_embeddingsc                 C      | j | d S r[   )rV  set_input_embeddings)r;   r   r>   r>   r?   r       z3Llama4ForConditionalGeneration.set_input_embeddingsc                 C   r  r[   )rV  get_output_embeddingsrn   r>   r>   r?   r    r  z4Llama4ForConditionalGeneration.get_output_embeddingsc                 C   r  r[   )rV  set_output_embeddings)r;   new_embeddingsr>   r>   r?   r    r  z4Llama4ForConditionalGeneration.set_output_embeddingsc                 C   r  r[   )rV  set_decoder)r;   decoderr>   r>   r?   r    r  z*Llama4ForConditionalGeneration.set_decoderc                 C   r  r[   )rV  get_decoderrn   r>   r>   r?   r    r  z*Llama4ForConditionalGeneration.get_decoderF)tie_last_hidden_stateszOObtains image last hidden states from the vision tower and apply al projection.rc  r  vision_feature_select_strategyr   rA   c                 K   s$   dd |  D }| j|fi |S )aj  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
            The tensors corresponding to the input images.
        vision_feature_select_strategy (`str`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`
        c                 S   s   i | ]\}}|d ur||qS r[   r>   )r3  kr  r>   r>   r?   
<dictcomp>  s    zELlama4ForConditionalGeneration.get_image_features.<locals>.<dictcomp>)itemsr  )r;   r  r  r   r>   r>   r?   get_image_features  s   z1Llama4ForConditionalGeneration.get_image_featuresrC  rD  rt  c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}t||  | kd| d|jd   |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nr  rB   z6Image features and image tokens do not match, tokens: z, features: r   )r  r3   tensorr'   image_token_idlongr   allr   rK  	expand_asr~   r   numelrF   )r;   rC  rD  rt  special_image_maskn_image_tokensr>   r>   r?   get_placeholder_mask  s   z3Llama4ForConditionalGeneration.get_placeholder_maskNr   r   r   r   rY  r  r  r  r  r   rZ  c                 K   sP  |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|du |duA r*td|dur6|dur6td|du r@|  |}|duro| j||ddj}|d|	d}| 
||j|j}| j|||d}|||}| jd|||||	|
||||d
|}|d	 }d}|dur|dur|dd|jd
 d
  df |j}|dddddf ||jd	k  }|dd
df ||jd	k  }n|dddddf  }|dd
df  }t }||d|	d|d|j}|s|f|d
d  }|dur|f| S |S t|||j|j|j|dur$|dS ddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```NrE  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneT)r  r  r  rB   )rD  rt  )
r   r   r   rD  r  r  r  r  r   rZ  r   r#   .)r\  r[  r   r@   r2  rf  r>   )r'   r  r  r  rI  r  r  rH  rE   rw  r  r~   r   r   r  masked_scatterrV  rF   r   r1   CrossEntropyLossre  r   r@   r2  )r;   rC  r  r   r   r   rD  r  rY  r  r  r  r  r   rZ  r   rt  vision_flatprojected_vision_flatr  r_  r[  r\  shift_attention_maskshift_logitsshift_labelsloss_fctrt   r>   r>   r?   rM     s   3
(*& z&Llama4ForConditionalGeneration.forwardc	              	   K   s>   | j j|f||||||d|	}
|s|	dds||
d< |
S )N)r   rD  r   r   rZ  is_first_iterationr  Tr  )rV  prepare_inputs_for_generationget)r;   rC  r   rD  r  r   r   rZ  r  r   model_inputsr>   r>   r?   r  k  s   z<Llama4ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNNNNr   )NNNNNNF)$rN   rO   rP   rR  rb  rS  r$   r   r+   r  r  r  r  r  r  r!   r"   r   r3   r  r   r   r   ru   r   r  r
  r  rQ   r	   r  r   re  rM   r  rR   r>   r>   r<   r?   r    s   
 
	
 r  )r  r0  r$  rU  r  )r   )gru  collections.abcr   dataclassesr   typingr   r3   torch.nnr1   torch.nn.functionalr|   rn  /transformers.models.llama4.configuration_llama4r    r   r"  activationsr   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr    r!   utils.output_capturingr"   configuration_llama4r$   r%   
get_loggerrN   loggerModuler&   rS   r^   rp   rW   rv   r   r   rQ   ru   r   r   r   rk   r   r   r   r  r  r0  rU  re  rg  rp  r  r  r  r  r  r  r  r  r  r  r$  r  __all__r>   r>   r>   r?   <module>   s   
!B

!
]5eQ
:,R  w