o
    	۷i                     @   s   d dl mZ d dlmZmZ d dlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZ d	d
lmZ ddlmZ eeZeG dd de	ZeG dd deZdgZdS )    )	dataclass)OptionalUnionN   )Cache)$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging   )AutoModelForImageTextToText   )ShieldGemma2Configc                   @   s$   e Zd ZU dZdZeej ed< dS )0ShieldGemma2ImageClassifierOutputWithNoAttentionz^ShieldGemma2 classifies imags as violative or not relative to a specific policy
    Args:
    Nprobabilities)	__name__
__module____qualname____doc__r   r   torchTensor__annotations__ r   r   l/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/shieldgemma2/modeling_shieldgemma2.pyr   #   s   
 r   c                        s(  e Zd ZU eed< dddddZdef fddZd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Ze														d*deej deej deej deej dee deej deej d eej d!eej d"ee d#ee d$ee d%ee d&eeejf d'efd(d)Z  ZS )+"ShieldGemma2ForImageClassificationconfigzmodel.model.language_modelzmodel.model.vision_towerz!model.model.multi_modal_projectorzmodel.lm_head)zmodel.language_model.modelzmodel.vision_towerzmodel.multi_modal_projectorzmodel.language_model.lm_headc                    s<   t  j|d t|dd| _t|dd| _tj|d| _d S )N)r   yes_token_indexi *  no_token_indexi  )super__init__getattrr   r   r   from_configmodel)selfr   	__class__r   r   r   6   s   z+ShieldGemma2ForImageClassification.__init__c                 C      | j j S N)r"   language_modelget_input_embeddingsr#   r   r   r   r)   <      z7ShieldGemma2ForImageClassification.get_input_embeddingsc                 C      | j j| d S r'   )r"   r(   set_input_embeddings)r#   valuer   r   r   r-   ?      z7ShieldGemma2ForImageClassification.set_input_embeddingsc                 C   r&   r'   )r"   r(   get_output_embeddingsr*   r   r   r   r0   B   r+   z8ShieldGemma2ForImageClassification.get_output_embeddingsc                 C   r,   r'   )r"   r(   set_output_embeddings)r#   new_embeddingsr   r   r   r1   E   r/   z8ShieldGemma2ForImageClassification.set_output_embeddingsc                 C   r,   r'   )r"   r(   set_decoder)r#   decoderr   r   r   r3   H   r/   z.ShieldGemma2ForImageClassification.set_decoderc                 C   r&   r'   )r"   r(   get_decoderr*   r   r   r   r5   K   r+   z.ShieldGemma2ForImageClassification.get_decoderc                 C   r&   r'   )r"   r(   tie_weightsr*   r   r   r   r6   N   r+   z.ShieldGemma2ForImageClassification.tie_weightsNr   	input_idspixel_valuesattention_maskposition_idspast_key_valuestoken_type_idscache_positioninputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictlogits_to_keepreturnc                 K   sh   | j d|||||||||	|
||||d|}|j}|ddd| j| jgf }tj|dd}t||dS )aY  
        Returns:
            A `ShieldGemma2ImageClassifierOutputWithNoAttention` instance containing the logits and probabilities
            associated with the model predicting the `Yes` or `No` token as the response to that prompt, captured in the
            following properties.

                *   `logits` (`torch.Tensor` of shape `(batch_size, 2)`):
                    The first position along dim=1 is the logits for the `Yes` token and the second position along dim=1 is
                    the logits for the `No` token.
                *   `probabilities` (`torch.Tensor` of shape `(batch_size, 2)`):
                    The first position along dim=1 is the probability of predicting the `Yes` token and the second position
                    along dim=1 is the probability of predicting the `No` token.

            ShieldGemma prompts are constructed such that predicting the `Yes` token means the content *does violate* the
            policy as described. If you are only interested in the violative condition, use
            `violated = outputs.probabilities[:, 1]` to extract that slice from the output tensors.

            When used with the `ShieldGemma2Processor`, the `batch_size` will be equal to `len(images) * len(policies)`,
            and the order within the batch will be img1_policy1, ... img1_policyN, ... imgM_policyN.
        )r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   N)dim)logitsr   r   )r"   rH   r   r   r   softmaxr   )r#   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   	lm_kwargsoutputsrH   selected_logitsr   r   r   r   forwardQ   s2   'z*ShieldGemma2ForImageClassification.forward)NNNNNNNNNNNNNr   )r   r   r   r   r   _checkpoint_conversion_mappingr   r)   r-   r0   r1   r3   r5   r6   r	   r   r   
LongTensorFloatTensorr   r   boolr   intr   rM   __classcell__r   r   r$   r   r   ,   sz   
 	
r   )dataclassesr   typingr   r   r   cache_utilsr   modeling_outputsr   modeling_utilsr   utilsr	   r
   autor   configuration_shieldgemma2r   
get_loggerr   loggerr   r   __all__r   r   r   r   <module>   s    
f