o
    wi                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	  mZ d dlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 e04e5Z6G dd de	j7Z8G dd de	j7Z9G dd dej	j7Z:G dd de	j7Z;edG dd de	j7Z<G dd de	j7Z=d ej>d!ej>d"ej>d#e?ej>ej>f fd$d%Z@d&ej>d'eAd#ej>fd(d)ZB	*ddd+e	j7d,ej>d-ej>d.ej>d/eej> d0eCd1eCfd2d3ZD	*ddd+e	j7d,ej>d-ej>d.ej>d/eej> d0eCd1eCfd4d5ZEG d6d7 d7e	j7ZFG d8d9 d9eZGe.G d:d; d;e)ZHe.G d<d= d=eHZIG d>d? d?ee-ZJG d@dA dAeHeZKee.dBdCG dDdE dEe#ZLG dFdG dGej	j7ZMG dHdI dIe	j7ZNdJdK ZOG dLdM dMe	j7ZPdNej>d,ej>fdOdPZQd,ej>d-ej>dNej>d#e?ej>ej>f fdQdRZRG dSdT dTe	j7ZSG dUdV dVe	j7ZTG dWdX dXeZUG dYdZ dZe	j7ZVG d[d\ d\e	j7ZWG d]d^ d^e	j7ZXG d_d` d`eHZYG dadb dbeHeZZg dcZ[dS )e    N)	dataclass)CallableOptionalUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tuplelogging   )Llama4ConfigLlama4TextConfigc                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Llama4TextExpertsconfigc                    sx   t    |j| _|j| _|j| _| j| _tt	
| j| jd| j | _tt	
| j| j| jf| _t|j | _d S N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr   
hidden_actact_fnselfr"   	__class__ g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/llama4/modeling_llama4.pyr&   -   s   
 zLlama4TextExperts.__init__hidden_statesreturnc                 C   s\   | | jd| j}t|| j}|jddd\}}t|| | | j}| d| j}|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r$   dim)	viewr(   r*   r.   bmmr0   chunkr3   r1   )r5   r:   gate_upgateupnext_statesr8   r8   r9   forward7   s   zLlama4TextExperts.forward)	__name__
__module____qualname__r    r&   r.   TensorrF   __classcell__r8   r8   r6   r9   r!   ,   s    
r!   c                       s&   e Zd Zd fdd	Zdd Z  ZS )Llama4TextMLPNc                    sj   t    |d u r|j}|| _tj|j|dd| _tj|j|dd| _tj||jdd| _	t
|j | _d S NFbias)r%   r&   r)   r"   r,   Linearr*   	gate_projup_projr1   r   r2   activation_fn)r5   r"   r)   r6   r8   r9   r&   N   s   
zLlama4TextMLP.__init__c                 C   s$   |  | || | }| |S N)rS   rQ   rR   r1   )r5   xr1   r8   r8   r9   rF   Z   s   
zLlama4TextMLP.forwardrT   rG   rH   rI   r&   rF   rK   r8   r8   r6   r9   rL   M   s    rL   c                       s<   e Zd Zddef fddZdd Zdd Zd	d
 Z  ZS )Llama4TextL2Normư>epsc                    s   t    || _d S rT   )r%   r&   rY   )r5   rY   r6   r8   r9   r&   `   s   

zLlama4TextL2Norm.__init__c                 C   $   |t |djddd| j  S Nr$   r<   T)keepdimr.   rsqrtpowmeanrY   r5   rU   r8   r8   r9   _normd      $zLlama4TextL2Norm._normc                 C   s   |  | |S rT   )rb   floattype_asra   r8   r8   r9   rF   g   s   zLlama4TextL2Norm.forwardc                 C   s   d| j  S )Nzeps=rY   r5   r8   r8   r9   
extra_reprj      zLlama4TextL2Norm.extra_repr)rX   )	rG   rH   rI   rd   r&   rb   rF   rh   rK   r8   r8   r6   r9   rW   _   s
    rW   c                       s6   e Zd Zd
 fdd	Zdd Zdd Zdd	 Z  ZS )Llama4TextRMSNormh㈵>c                    s&   t    || _tt|| _dS )z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r%   r&   rY   r,   r-   r.   onesweight)r5   r*   rY   r6   r8   r9   r&   o   s   
zLlama4TextRMSNorm.__init__c                 C   rZ   r[   r]   ra   r8   r8   r9   rb   w   rc   zLlama4TextRMSNorm._normc                 C   s   |  | |}|| j S rT   )rb   rd   re   rm   )r5   rU   outputr8   r8   r9   rF   z   s   
zLlama4TextRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerm   shaperY   rg   r8   r8   r9   rh   ~   s   zLlama4TextRMSNorm.extra_repr)rk   )rG   rH   rI   r&   rb   rF   rh   rK   r8   r8   r6   r9   rj   n   s
    rj   Llama4TextMoec                       $   e Zd Z fddZdd Z  ZS )rq   c                    sP   t    |j| _|j| _|j| _t|| _	t
j|j|jdd| _t|| _d S rM   )r%   r&   num_experts_per_toktop_kr*   
hidden_dimr'   r(   r!   expertsr,   rP   routerrL   shared_expertr4   r6   r8   r9   r&      s   

zLlama4TextMoe.__init__c           	      C   s   | d| j}| |}tj|| jdd\}}t|tdd||	dd}t
| |j}|| jd}|| dd }| |}| |}|| | jd| jjdd ||fS )Nr<   r   r=   z-infr   )reshaperu   rw   r.   topkrt   	full_likerd   scatter_	transposesigmoidtodtyperepeatr(   rv   rx   add_sum)	r5   r:   router_logitsrouter_top_valuerouter_indicesrouter_scores	routed_in
routed_outoutr8   r8   r9   rF      s   
 

 zLlama4TextMoe.forwardrV   r8   r8   r6   r9   rq      s    	c                       s8   e Zd Zddef fddZe edd Z  Z	S )Llama4TextRotaryEmbeddingNr"   c                    sp   t    |jd urdnd| _|j| _|j| _|| _t| j | _	| 	| j|\}| _
| jd|dd | j| _d S )Nllama3defaultinv_freqF)
persistent)r%   r&   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr"   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r5   r"   devicer   r6   r8   r9   r&      s   
z"Llama4TextRotaryEmbedding.__init__c                 C   s   | j d d d d f  |jd dd}|d d d d d f  }t|jjtr2|jjdkr2|jjnd}tj	|dd# |
|j| dd}tt||}|| j }W d    |S 1 saw   Y  |S )	Nr   r<   r   mpscpuF)device_typeenabledr$   )r   rd   expandrp   
isinstancer   typestrr.   autocastr   r}   polar	ones_liker   )r5   rU   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_cisr8   r8   r9   rF      s   (&
z!Llama4TextRotaryEmbedding.forwardrT   )
rG   rH   rI   r    r&   r.   no_gradr   rF   rK   r8   r8   r6   r9   r      s
    r   xqxkr   r;   c              	   C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }t ||d d d d d d d f  d}t ||d d d d d d d f  d}|| ||fS )Nr<   r$   r   )r.   view_as_complexrd   ry   rp   view_as_realflattenre   )r   r   r   xq_xk_xq_outxk_outr8   r8   r9   apply_rotary_emb   s
   ,,,,r   r:   n_repc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rp   r   ry   )r:   r   batchnum_key_value_headsslenhead_dimr8   r8   r9   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d ur3|d d d d d d d |jd f }|
| }
tjj|
dd}
tjj	|
|| j
d}
t|
|	}|dd }||
fS )Nr$   r   r<   r=   ptrainingr   )r   num_key_value_groupsr.   matmulr}   rp   r,   
functionalsoftmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr8   r8   r9   eager_attention_forward   s   
&r   c                 K   s   t || j}t || j}	t||dd| jd  }
|d ur6|d d d d d d d |jd f }|
| }
tjj	|
dd}
tjj
|
|| jd}
t|
|	}|dd }||
fS )	Nr$   r         r   r<   r=   r   r   )r   r   r.   r   r}   r   rp   r,   r   r   r   r   r   r   r8   r8   r9   vision_eager_attention_forward   s   
&r   c                       s   e Zd ZdZdef fddZ		ddejdeejejf de	ej d	e	e
 d
e	ej dee deeje	ej e	eej  f fddZ  ZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr"   c                    s&  t    || _|| _t|d|j|j | _|j| _|j|j | _	|j| _| jd | _
|j| _|j| _|j| _|j| _d| _|j| | _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jjr| jrt|j| _d S d S d S )Nr   r   TrN   )r%   r&   r"   	layer_idxgetattrr*   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper,   rP   attention_biasq_projk_projv_projo_projuse_qk_normrW   rms_norm_epsqk_normr5   r"   r   r6   r8   r9   r&     s:   
zLlama4TextAttention.__init__Nr:   position_embeddingsr   past_key_valuecache_positionr   r;   c                 K   s  |j d d }g |d| jR }| ||}	| |jg |d| jR  }
| ||dd}| jrDt|	|
|	|	j
\}	}
t| drS| |	}	| |
}
| jr| jstt| d | j d | j d }|d|d ddfg |ddR }|	| 	|	j}	|	dd}	|
dd}
|d urd|i}||
|| j|\}
}t}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd	|\}}|jg |dR    }| !|}||fS )
Nr<   r   r$   r         ?r   eagerr   )r   r   )"rp   r   r   r?   r   r   r}   r   r   r   r   hasattrr   r   r.   logfloorrd   r   r   r   r   updater   r   r"   _attn_implementationr   r   r   r   ry   r   r   )r5   r:   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   r8   r8   r9   rF   0  sP   	 


**

zLlama4TextAttention.forwardNN)rG   rH   rI   __doc__r    r&   r.   rJ   ro   r   r	   
LongTensorr   r   rF   rK   r8   r8   r6   r9   r     s(    #r   c                       s   e Zd Z fddZ								ddejdeej deej deeej  d	ee	 d
ee	 dee	 deej deeejejf  de
e deejeeejejf  f fddZ  ZS )Llama4TextDecoderLayerc                    s   t    |j| _|| _|j| | _t||| _||jv | _	| j	r't
|| _nt||jd| _t|j|jd| _t|j|jd| _d S )N)r)   rf   )r%   r&   r*   r   layer_typesattention_typer   	self_attn
moe_layersis_moe_layerrq   feed_forwardrL   intermediate_size_mlprj   r   input_layernormpost_attention_layernormr   r6   r8   r9   r&   m  s   
zLlama4TextDecoderLayer.__init__NFr:   r   r   r   output_attentionsoutput_router_logits	use_cacher   r   r   r;   c
              
   K   s   |}|  |}| jd||	|||||d|
\}}|| }|}| |}| |}| jr1|\}}nd }|||j }|f}|rE||f7 }|rL||f7 }|S )N)r:   r   r   r   r  r	  r   r8   )r  r   r  r  r  r?   rp   )r5   r:   r   r   r   r  r  r	  r   r   r   residualattention_statesself_attn_weightsr   outputsr8   r8   r9   rF   |  s6   







zLlama4TextDecoderLayer.forward)NNNFFFNN)rG   rH   rI   r&   r.   rJ   r   r   ro   boolr   r   FloatTensorrF   rK   r8   r8   r6   r9   r   l  sB    	
r   c                   @   s>   e Zd ZeZdZdgZdZdZdZ	dZ
dZdZdZdd ZdS )Llama4PreTrainedModelTpast_key_valuesFc                 C   sJ  t | jdr
| jjn| jjj}t|tjr-|jjj	d|d |j
d ur+|j
j  d S d S t|tjrN|jjj	d|d |jd urL|jj|j   d S d S t|tjrc|jjd |j
j  d S t|trq|jjd d S t|tr|jjj	d|d |jjj	d|d d S t|tr|jjj	|jd |jjj	|jd d S d S )Ninitializer_ranger   )r`   stdr   )r  )r   r"   r  text_configr   r,   rP   rm   datanormal_rO   zero_	Embeddingpadding_idx	LayerNormfill_rj   r!   r0   r1   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r5   r   r  r8   r8   r9   _init_weights  s4   






z#Llama4PreTrainedModel._init_weightsN)rG   rH   rI   r   config_classsupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendr   r8   r8   r8   r9   r    s    r  c                       s   e Zd ZdgZdZeZdef fddZdd Zdd	 Z	e
e	
	
	
	
	
	
	
	
	
	
ddejdeej deej dee deej dee dee dee dee deej dee deeef fddZ  ZS )Llama4TextModelr   modelr"   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r8   )r   ).0r   r"   r8   r9   
<listcomp>  s    z,Llama4TextModel.__init__.<locals>.<listcomp>rf   r.  F)r%   r&   pad_token_idr  
vocab_sizer,   r  r*   embed_tokens
ModuleListrangenum_hidden_layerslayersrj   r   normr   
rotary_embgradient_checkpointing	post_initr4   r6   r.  r9   r&     s   zLlama4TextModel.__init__c                 C      | j S rT   r2  rg   r8   r8   r9   get_input_embeddings     z$Llama4TextModel.get_input_embeddingsc                 C   
   || _ d S rT   r<  r5   r   r8   r8   r9   set_input_embeddings     
z$Llama4TextModel.set_input_embeddingsN	input_idsr   r   r  inputs_embedsr	  r  output_hidden_statesreturn_dictr   flash_attn_kwargsr;   c                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|	d ur$|	n| j j}	|d u |d uA r4td| jrC| jrC|rCt	d d}|d u rR| 
|| j
jj}|r[|d u r[t }|
d u rw|d urg| nd}tj|||jd  |jd}
|d u r|
d}t| }ts| j |||
||d}td	i |td	i |d}|}| ||}|rd	nd }|rd	nd }| jd | j j D ])}|r||f7 }||f||j |||||
|d
|}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )r   )r"   input_embedsr   r   r  r   )full_attentionchunked_attentionr8   )r   r   r   r  r	  r   r   )last_hidden_stater  r:   
attentions)r"   r  rE  r	  use_return_dict
ValueErrorr9  r   loggerwarning_oncer2  r   rm   r   r
   get_seq_lengthr.   arangerp   	unsqueezer   dictr   r   r8  r6  r5  r   r7  r   )r5   rC  r   r   r  rD  r	  r  rE  rF  r   rG  past_seen_tokenscausal_mask_mappingmask_kwargsr:   freq_cisall_hidden_statesall_self_attnsdecoder_layerlayer_outputsr8   r8   r9   rF     s   


	


zLlama4TextModel.forward)
NNNNNNNNNN)rG   rH   rI   _no_split_modulesbase_model_prefixr    r!  r&   r=  rA  r   r   r.   r   r   rJ   r	   r  r  r   r   r   ro   r   rF   rK   r8   r8   r6   r9   r+    sX    	

r+  c                   @   s   e Zd ZdS )KwargsForCausalLMN)rG   rH   rI   r8   r8   r8   r9   r`  _  s    r`  c                !       s*  e Zd ZdgZdZdgZddiZeZdef fddZ	d	d
 Z
dd Zdd Zdd Zdd Zdd Zee												d'dejdeej deej deeeeej f  deej deej dee dee dee d ee d!eej d"eeejf d#ee d$eeef fd%d&Z   Z!S )(Llama4ForCausalLMr   language_modelzlm_head.weightlm_headcolwise_repr"   c                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rM   )
r%   r&   r+  r,  r1  r,   rP   r*   rc  r:  r4   r6   r8   r9   r&   i  s
   
zLlama4ForCausalLM.__init__c                 C   s   | j jS rT   r,  r2  rg   r8   r8   r9   r=  r  s   z&Llama4ForCausalLM.get_input_embeddingsc                 C   s   || j _d S rT   re  r@  r8   r8   r9   rA  u  ri   z&Llama4ForCausalLM.set_input_embeddingsc                 C   r;  rT   rc  rg   r8   r8   r9   get_output_embeddingsx  r>  z'Llama4ForCausalLM.get_output_embeddingsc                 C   r?  rT   rf  r5   new_embeddingsr8   r8   r9   set_output_embeddings{  rB  z'Llama4ForCausalLM.set_output_embeddingsc                 C   r?  rT   r,  r5   decoderr8   r8   r9   set_decoder~  rB  zLlama4ForCausalLM.set_decoderc                 C   r;  rT   rk  rg   r8   r8   r9   get_decoder  r>  zLlama4ForCausalLM.get_decoderNr   rC  r   r   r  rD  labelsr	  r  rE  rF  r   logits_to_keepr   r;   c                 K   s   |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
| jd||||||||	d|d
|}|d }t|tr@t| dn|}| |dd|ddf }d}|durd| j	d||| j j
d|}t|||j|j|jdS )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```NT)
rC  r   r   r  rD  r	  r  rE  rF  r   r   )logitsrp  r1  )lossrr  r  r:   rM  r8   )r"   r  rE  rN  r,  r   intslicerc  loss_functionr1  r   r  r:   rM  )r5   rC  r   r   r  rD  rp  r	  r  rE  rF  r   rq  r   r  r:   slice_indicesrr  rs  r8   r8   r9   rF     s>   (zLlama4ForCausalLM.forward)NNNNNNNNNNNr   )"rG   rH   rI   r^  r_  _tied_weights_keys_tp_planr    r!  r&   r=  rA  rg  rj  rn  ro  r   r   r.   r   r   rJ   r   r	   listr  r  rt  r   r`  ro   r   rF   rK   r8   r8   r6   r9   ra  b  sp    		

ra  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	ejed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	Llama4CausalLMOutputWithPastav  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nrs  rr  r  r:   rM  image_hidden_states)rG   rH   rI   r   rs  r   r.   r  __annotations__rr  r  rz  r:   ro   rM  r}  r8   r8   r8   r9   r|    s   
 r|  c                       rr   )Llama4VisionMLP2c                    s\   t    |j| _|j| _tj| j|jdd| _tj|j|jdd| _	t
 | _|j| _d S rM   )r%   r&   r*   r)   r,   rP   projector_input_dimfc1projector_output_dimfc2GELUrS   projector_dropoutr   r4   r6   r8   r9   r&     s   

zLlama4VisionMLP2.__init__c                 C   s8   |  |}| |}tj|| j| jd}| | |S )Nr   )r  rS   Fr   r   r  r5   r:   r8   r8   r9   rF     s   

zLlama4VisionMLP2.forwardrV   r8   r8   r6   r9   r    s    	r  c                       rr   )Llama4MultiModalProjectorc                    s(   t    tj|jj|jjdd| _d S rM   )	r%   r&   r,   rP   vision_configvision_output_dimr  r*   linear_1r4   r6   r8   r9   r&     s   
z"Llama4MultiModalProjector.__init__c                 C   s   |  |}|S rT   )r  )r5   image_featuresr:   r8   r8   r9   rF     s   
z!Llama4MultiModalProjector.forwardrV   r8   r8   r6   r9   r    s    r  c           
   	   C   s   | j \}}}tt|}| |||d} |  \}}}}| ||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }||d|j d }	|	S )Nr<   r   r$   r   r   )rp   rt  mathsqrtr?   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensorr8   r8   r9   pixel_shuffle  s    $r  c                       2   e Zd Z fddZdejdejfddZ  ZS )Llama4VisionPixelShuffleMLPc                    s>   t    |j| _t|j| jd  | _|j| _t|| _	d S r#   )
r%   r&   pixel_shuffle_ratiort  r  	inner_dimr  
output_dimr  mlpr4   r6   r8   r9   r&   %  s
   
z$Llama4VisionPixelShuffleMLP.__init__encoded_patchesr;   c                 C   s   t || j}| |S rT   )r  r  r  )r5   r  r8   r8   r9   rF   ,  s   
z#Llama4VisionPixelShuffleMLP.forwardrG   rH   rI   r&   r.   rJ   rF   rK   r8   r8   r6   r9   r  $      r  freqs_cic                    s(   |j   fddt|jD }| j| S )Nc                    s,   g | ]\}}|d ks| d  kr|nd qS )r   r8   )r-  idndimr8   r9   r/  4  s   , z)reshape_for_broadcast.<locals>.<listcomp>)r  	enumeraterp   r?   )r  r   rp   r8   r  r9   reshape_for_broadcast2  s   
r  c                 C   s   t |  jg | jd d ddR  }t | jg |jd d ddR  }t||d}||j}t || 	d}t || 	d}|
| |
|fS )Nr<   r$   )r  r   r   )r.   r   rd   ry   rp   r  r   r   r   r   re   )r   r   r  query_key_	query_outkey_outr8   r8   r9   vision_apply_rotary_emb8  s   ,,r  c                       sx   e Zd Zdef fddZ		ddejdejdeej dee d	e	e
 d
eejeej eeej  f fddZ  ZS )Llama4VisionAttentionr"   c                    s   t    || _|j| _|j| _|j|j | _d| _|j	| _	| jd | _
tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j| j dd| _tj| j| j | jdd| _d S )Nr   r   TrN   )r%   r&   r"   r*   	embed_dimr   	num_headsr   r   r   r   r,   rP   r   r   r   r   r4   r6   r8   r9   r&   G  s   
 zLlama4VisionAttention.__init__Nr:   r  r   r   r   r;   c                 K   s   |j d d }g |d| jR }| ||}| ||}	| ||}
t||	|d\}}	|dd}|	dd}	|
dd}
t}| j	j
dvrRt| j	j
 }|| ||	|
d f| js^dn| jd dd|\}}|jg |dR   }| |}||fS )	Nr<   )r  r   r$   )r   flex_attentionr   F)r   r   r   )rp   r   r   r?   r   r   r  r}   r   r"   r   r   r   r   ry   r   r   )r5   r:   r  r   r   r   r   r   r   r   r   r   r   r   r8   r8   r9   rF   V  s8   	

zLlama4VisionAttention.forwardr   )rG   rH   rI   r   r&   r.   rJ   r   r	   r   r   ro   rF   rK   r8   r8   r6   r9   r  F  s"    r  c                       r  )Llama4VisionMLPc                    sJ   t    || _t | _tj|j|jdd| _	tj|j|jdd| _
d S )NTrN   )r%   r&   r"   r,   r  rS   rP   r*   r)   r  r  r4   r6   r8   r9   r&     s
   

zLlama4VisionMLP.__init__r:   r;   c                 C   s"   |  |}| |}| |}|S rT   )r  rS   r  r  r8   r8   r9   rF     s   


zLlama4VisionMLP.forwardr  r8   r8   r6   r9   r    r  r  c                
       sP   e Zd Zdef fddZ		ddejdejdeej dee fd	d
Z	  Z
S )Llama4VisionEncoderLayerr"   c                    sF   t    |j| _t|| _t|| _t|j| _	t|j| _
d S rT   )r%   r&   r*   r  r   r  r  r,   r  r  r  r4   r6   r8   r9   r&     s   


z!Llama4VisionEncoderLayer.__init__Nhidden_stater  r   r  c                 C   sb   |}|  |}| j|||d\}}|| }|}| |}| |}|| }|f}|r/||f7 }|S )N)r  r   )r  r   r  r  )r5   r  r  r   r  r
  r   r  r8   r8   r9   rF     s    




z Llama4VisionEncoderLayer.forwardr   )rG   rH   rI   r   r&   r.   rJ   r   r  rF   rK   r8   r8   r6   r9   r    s    r  c                       st   e Zd ZdZdef fddZ				ddejdejdeej d	ee	 d
ee	 dee	 de
eef fddZ  ZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r"   c                    s@   t     | _t fddt jD | _d| _ | _d S )Nc                    s   g | ]}t  qS r8   )r  )r-  _r.  r8   r9   r/    s    z0Llama4VisionEncoder.__init__.<locals>.<listcomp>F)	r%   r&   r"   r,   r3  r4  r5  r6  r9  r4   r6   r.  r9   r&     s
   
 
zLlama4VisionEncoder.__init__Nr:   r  r   r  rE  rF  r;   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}| jD ]}	|r6||f }|	||||d}
|rG||
d f }|
d }q-|rS||f }|satdd |||fD S t|||dS )	ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr8   )r  r   r  r  r   r   c                 s       | ]	}|d ur|V  qd S rT   r8   r-  vr8   r8   r9   	<genexpr>      z.Llama4VisionEncoder.forward.<locals>.<genexpr>rL  r:   rM  )r"   r  rE  rN  r6  ro   r   )r5   r:   r  r   r  rE  rF  encoder_statesall_attentionsencoder_layerr]  r8   r8   r9   rF     s2   



zLlama4VisionEncoder.forwardNNNN)rG   rH   rI   r   r   r&   r.   rJ   r   r  r   ro   r   rF   rK   r8   r8   r6   r9   r    s,    
r  c                       r  )Llama4UnfoldConvolutionc                    s`   t    |j}t|tr||f}tjj||jd| _tj	|j
|d  |d  |jdd| _d S )N)kernel_sizestrider   r   FrN   )r%   r&   r  r   rt  r.   r,   UnfoldunfoldrP   num_channelsr*   linear)r5   r"   r  r6   r8   r9   r&     s   

z Llama4UnfoldConvolution.__init__r:   r;   c                 C   s&   |  |}|ddd}| |}|S )Nr   r$   r   )r  r  r  r  r8   r8   r9   rF     s   

zLlama4UnfoldConvolution.forwardr  r8   r8   r6   r9   r    s    r  c                       rr   )Llama4VisionRotaryEmbeddingc                    sd  t    |j|j }tj|d tjd|d d}tj||d d gdd}d|d< || }|| }|j	|j
 d }d|jtd|dd |d   |   }|d d	 |d d d d f  jdd
d}|d d	 |d d d d f  jdd
d}	tj||	gd
d  dd d df }
|
|d
dddk d}
ttjt|
t|
gd
d}|| _d S )Nr$   )r   r   r   r=   r   )r<   r<   r   ).Nr<   .)r%   r&   
image_sizer  r.   rS  int32ry   catr*   r   
rope_thetard   repeat_interleaver   masked_fillr   stackcossinr  )r5   r"   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   rY  r6   r8   r9   r&   #  s   
 ,((($
z$Llama4VisionRotaryEmbedding.__init__c                 C   s   | j |jS rT   )r  r   r   r  r8   r8   r9   rF   4  s   z#Llama4VisionRotaryEmbedding.forwardrV   r8   r8   r6   r9   r  "  s    r  c                       s   e Zd ZdZdgZeZdef fddZdd Z				dd	e	j
d
ee	j
 dee dee dee deeee	j
df f fddZ  ZS )r  vision_modelr  r"   c                    s   t  | |j| _|j| _|j| _|j| _| j| j d d | _|jd | _t|| _	t
| jt| j | _t
| jt| j| j | _t|| _t
| j| _t
| j| _t|| _t|| _|   d S )Nr$   r   r   )r%   r&   r  r  r*   r  r  r  r  patch_embeddingr,   r-   r.   randnr  r  r  rotary_embeddingr  layernorm_prelayernorm_postr  r,  r  vision_adapterr:  r4   r6   r8   r9   r&   =  s    



zLlama4VisionModel.__init__c                 C   r;  )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  rg   r8   r8   r9   r=  V  s   z&Llama4VisionModel.get_input_embeddingsNpixel_valuesr   r  rE  rF  r;   .c                 C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|j\}}}}	d}
d}| |}|j\}}}|||
 | ||}| j|jd d|jd }t	j
||gdd}|d7 }|||
 |||}| jj|j|jd}|| }| |}||d|}| |}| j|d|||d}|j}| |}|ddddddf }| |}|r|jnd}|r|d }nd}|std	d
 |||fD S t|||dS )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr   r   r<   r=   )r   r   )r   rE  r  r  r$   c                 s   r  rT   r8   r  r8   r8   r9   r    r  z,Llama4VisionModel.forward.<locals>.<genexpr>r  )r"   r  rE  rN  rp   r  ry   r  r   r.   r  r  r   r   r   r  r?   r  r,  rL  r  r  r:   ro   r   )r5   r  r   r  rE  rF  batch_size_times_num_tilesr  r  r  num_concurrent_media
num_chunksr  r  r  ru   r  positional_embeddingr  rn   r:   rM  r8   r8   r9   rF   \  sZ   





zLlama4VisionModel.forwardr  )rG   rH   rI   r_  r^  r   r!  r&   r=  r.   rJ   r   r  r   r   ro   rF   rK   r8   r8   r6   r9   r  8  s.    	r  c                (       s  e Zd ZddgZi ZdZeZdef fddZdd Z	d	d
 Z
dd Zdd Zdd Zdd Zdejdeeee f defddZe																d-dejdejdeej deej deeej  deej deeeee f  dee deej d ee d!ee d"ee d#ee d$eej d%eeejf d&ejd'ee d(eeef f$d)d*Z						d.d+d,Z   Z!S )/Llama4ForConditionalGenerationr   r   r"   c                    s^   t  | t|j| _t|| _t|j| _	|jj
| _
| jjd ur&| jjnd| _|   d S )Nr<   )r%   r&   r  r  r  r  multi_modal_projectorra  r  rb  r1  r"   r0  r:  r4   r6   r8   r9   r&     s   

z'Llama4ForConditionalGeneration.__init__c                 C   
   | j  S rT   )rb  r=  rg   r8   r8   r9   r=    rB  z3Llama4ForConditionalGeneration.get_input_embeddingsc                 C      | j | d S rT   )rb  rA  r@  r8   r8   r9   rA       z3Llama4ForConditionalGeneration.set_input_embeddingsc                 C   r  rT   )rb  rg  rg   r8   r8   r9   rg    rB  z4Llama4ForConditionalGeneration.get_output_embeddingsc                 C   r  rT   )rb  rj  rh  r8   r8   r9   rj    r  z4Llama4ForConditionalGeneration.set_output_embeddingsc                 C   r  rT   )rb  rn  rl  r8   r8   r9   rn    r  z*Llama4ForConditionalGeneration.set_decoderc                 C   r  rT   )rb  ro  rg   r8   r8   r9   ro    rB  z*Llama4ForConditionalGeneration.get_decoderr  vision_feature_layervision_feature_select_strategyc                 K   sJ   |dvrt d| j dd | D }| j|fddi|}|j}|S )a  
        Obtains image last hidden states from the vision tower and apply al projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r   fullz$Unexpected select feature strategy: c                 S   s   i | ]\}}|d ur||qS rT   r8   )r-  kr  r8   r8   r9   
<dictcomp>  s    zELlama4ForConditionalGeneration.get_image_features.<locals>.<dictcomp>rE  F)rO  r  itemsr  rL  )r5   r  r  r  r   image_outputsr  r8   r8   r9   get_image_features  s   z1Llama4ForConditionalGeneration.get_image_featuresNr   rC  r   r   r  rD  rp  r	  r  rE  rF  r   rq  image_sizesr   r;   c           #      K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j jj}|dur/|n| j jj}|du |duA r@td|durL|durLtd|du rV|  |}|dur| j	||||d}|j
}|d|d}| |}|| j jkd}||j}|d|d}|d d}| }||dkrtd| d	|d |dd|d}|||}||}| jd|||||
|||||d

|}|d }d}|	durO|dur!|dd|j
d d  df |j}|dddddf ||jdk  }|	dddf ||	jdk  } n|dddddf  }|	dddf  } t }!|!|d|d| d|j}|sg|f|dd  }"|dure|f|" S |"S t|||j|j|j|dury|dS ddS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```NrH  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)r  r  r  r  r<   ).r   r   zMismatch: final_mask wants z0 embeddings, but multi_modal_projector returned )
r   r   r  rD  r	  r  rE  rF  r   rq  r   .)rs  rr  r  r:   rM  r}  r8   )r"   r  rE  rN  r  r  r  rO  r=  r  rp   r?   r  r  image_token_idrT  r   r   ry   r   r   masked_scatterrb  r   r,   CrossEntropyLossr|  r  r:   rM  )#r5   rC  r  r   r   r  rD  r  r  rp  r	  r  rE  rF  r   rq  r  r   r  original_inputs_embeds_shapevision_flatprojected_vision_flatspecial_image_mask
final_maskfinal_mask_1dnum_tokens_to_fillexpanded_maskr  rr  rs  shift_attention_maskshift_logitsshift_labelsloss_fctrn   r8   r8   r9   rF     s   1



(*& z&Llama4ForConditionalGeneration.forwardc           
      K   s8   | j j|f|||||d|}	|d dkr||	d< |	S )N)r  rD  r   r   rq  r   r  )rb  prepare_inputs_for_generation)
r5   rC  r  rD  r  r   r   rq  r   model_inputsr8   r8   r9   r    s   
z<Llama4ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNNNNNr   N)NNNNNN)"rG   rH   rI   r^  ry  r_  r   r!  r&   r=  rA  rg  rj  rn  ro  r.   r  r   rt  rz  r   r  r   r   r   rJ   r  r   r`  ro   r|  rF   r  rK   r8   r8   r6   r9   r    s    
	

 r  )r  r+  r  ra  r  )r   )\r  dataclassesr   typingr   r   r   r.   torch.nnr,   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r   activationsr   cache_utilsr	   r
   
generationr   integrations.hub_kernelsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   configuration_llama4r   r    
get_loggerrG   rP  Moduler!   rL   rW   rj   rq   r   rJ   ro   r   rt  r   rd   r   r   r   r   r  r+  r`  ra  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  __all__r8   r8   r8   r9   <module>   s   
!

"
]C' p
;,R  y