o
    ei6"                     @   s  d dl ZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ G dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZg dZdS )    N   )PreTrainedConfig)VideoMetadata   )CONFIG_MAPPING
AutoConfig	AutoModel)Glm4vImageProcessor)Glm4vImageProcessorFast)Glm4vForConditionalGeneration
Glm4vModelGlm4vPreTrainedModel)Glm4vProcessor)Glm4vVideoProcessorc                       sH   e Zd ZdZdZeedZdgZ								
		d fdd	Z  Z	S )Glm46VConfiga  
    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
    GLM-4.6V model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Glm4vVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151343):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151344):
            The video token index to encode the image prompt.
        image_start_token_id (`int`, *optional*, defaults to 151339):
            The image start token index to encode the start of image.
        image_end_token_id (`int`, *optional*, defaults to 151340):
            The image end token index to encode the end of image.
        video_start_token_id (`int`, *optional*, defaults to 151361):
            The video start token index to encode the start of video.
        video_end_token_id (`int`, *optional*, defaults to 151362):
            The video end token index to encode the end of video.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import Glm46VForConditionalGeneration, Glm46VConfig

    >>> # Initializing a GLM-4.6V style configuration
    >>> configuration = Glm46VConfig()

    >>> # Initializing a model from the GLM-4.6V style configuration
    >>> model = Glm4vForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glm46v)text_configvision_configpast_key_valuesN/O 0O +O ,O AO BO Fc
                    s   t |tr|dd|d< t|d  di || _n
|d u r$td  | _t |tr>|dd|d< t|d  di || _n
|d u rHtd  | _|| _|| _|| _|| _	|| _
|| _|	| _t jdi |
 d S )N
model_typeglm4v_vision
glm4v_text )
isinstancedictgetr   r   r   image_token_idvideo_token_idvideo_start_token_idvideo_end_token_idimage_start_token_idimage_end_token_idtie_word_embeddingssuper__init__)selfr   r   r"   r#   r&   r'   r$   r%   r(   kwargs	__class__r   g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/glm46v/modular_glm46v.pyr*   K   s$   

zGlm46VConfig.__init__)	NNr   r   r   r   r   r   F)
__name__
__module____qualname____doc__r   r   sub_configskeys_to_ignore_at_inferencer*   __classcell__r   r   r-   r/   r      s    *
r   c                   @   s   e Zd ZdZdZdd ZdS )Glm46VPreTrainedModelNc                 C   s   t d)Nz
Not needed)AttributeError)r+   moduler   r   r/   _init_weightss   s   z#Glm46VPreTrainedModel._init_weights)r0   r1   r2   _can_record_outputs_no_split_modulesr:   r   r   r   r/   r7   o   s    r7   c                       s    e Zd ZdZ fddZ  ZS )Glm46VModelNc                    s,   t  | t|j| _t|j| _d S N)r)   r*   r   from_configr   visualr   language_model)r+   configr-   r   r/   r*   z   s   zGlm46VModel.__init__)r0   r1   r2   r<   r*   r6   r   r   r-   r/   r=   w   s    r=   c                   @      e Zd ZdS )Glm46VForConditionalGenerationNr0   r1   r2   r   r   r   r/   rD          rD   c                   @   s   e Zd Zdd ZdS )Glm46VProcessorc                 C   s   d| j  d|ddS )Nz<|begin_of_image|>z<|end_of_image|>z.1fz seconds)image_token)r+   timestamp_secr   r   r/   replace_frame_token_id   s   z&Glm46VProcessor.replace_frame_token_idN)r0   r1   r2   rJ   r   r   r   r/   rG      s    rG   c                   @   rC   )Glm46VImageProcessorNrE   r   r   r   r/   rK      rF   rK   c                   @   rC   )Glm46VImageProcessorFastNrE   r   r   r   r/   rL      rF   rL   c                   @   s*   e Zd Z	ddedeeB dB fddZdS )Glm46VVideoProcessorNmetadatafpsc                    s  |d u st |dd d u rtd|j}|d }|jp"t||j d }dddd}d}d}	t||	}
|
d	kr;|d	 }n|
d
krD|d
 }n|d }t|
| | j }t||}d|j   fddt	|D }t|}||k r|t
jd|d |td }n'g }d}d| j|  }t	|D ]}|| |kr||7 }|| ||kr nqt||k rt|dkrdt|d d}}n	|d |d }}t
j|||td }nt||krt
jd|d |td }t g }}|D ]}||vr|| || qt|d@ r	||d  t
|S )NrO   zAsked to sample frames per second but no video metadata was provided which is required when sampling in Glm46V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`   r   g      ?)   ,  `	  i  rS   rQ   rR   c                    s   g | ]}|  qS r   r   ).0iduration_per_framer   r/   
<listcomp>   s    z6Glm46VVideoProcessor.sample_frames.<locals>.<listcomp>r   )dtype)getattr
ValueErrortotal_num_framesdurationroundrO   mininttemporal_patch_sizerangenplinspacetolistappendlenmaxsetaddarray)r+   rN   rO   r,   total_framesmax_frame_idxr^   DYNAMIC_FPS_THRESMAX_FRAME_COUNT_DYNAMICMAX_DURATIONeffective_duration
target_fps	extract_t
timestamps
max_secondframe_indicescurrent_secondinv_fpsframe_indexstartendseenuniqidxr   rV   r/   sample_frames   sb   








z"Glm46VVideoProcessor.sample_framesr>   )r0   r1   r2   r   ra   floatr   r   r   r   r/   rM      s    
rM   )r   r=   r7   rD   rG   rK   rL   rM   )numpyrd   configuration_utilsr   video_utilsr   autor   r   r   glm4v.image_processing_glm4vr	   !glm4v.image_processing_glm4v_fastr
   glm4v.modeling_glm4vr   r   r   glm4v.processing_glm4vr   glm4v.video_processing_glm4vr   r   r7   r=   rD   rG   rK   rL   rM   __all__r   r   r   r/   <module>   s$   S	D