o
    ̳i                  	   @   sj   d dl mZ d dlmZ d dlmZ d dlmZ 		ddede	d	e
d
efddZd
efddZdd ZdS )    clip_text_encoder)CLIPTextEncoderCLIPTokenizerCLIPImageTransformM   Tpathmax_seq_lentruncatereturnc                 C   s   t | ||dS )a{  
    Builder for the CLIP text tokenizer.

    Args:
        path (str): Path to the CLIP merges file
        max_seq_len (bool): Context length. Default: 77
        truncate (bool): Truncate the token sequence if it exceeds max_seq_len (otherwise raises AssertionError)
            Default: True

    Returns:
        CLIPTokenizer: Instantiation of the CLIP text tokenizer
    )r   r   r   )r
   r   r    r   Y/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/clip/_model_builders.pyclip_tokenizer   s   r   c                   C   s   t dddddddS )a4  
    Builder for the CLIP text encoder for CLIP-ViT-L/14.

    CLIP is a model that encodes text and images into a shared vector space.
    Blog post: https://openai.com/index/clip/
    Paper: https://arxiv.org/abs/2103.00020

    Returns:
        CLIPTextEncoder: Instantiation of the CLIP text encoder
    i      i   r	   gh㈵>)	embed_dim	num_heads
num_layers
vocab_sizer   norm_epsr   r   r   r   r   clip_text_vit_large_patch14    s   r   c               	   C   s"   t g dg ddd dddd} | S )N)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?      bilinearT)
image_mean	image_std	tile_sizepossible_resolutionsmax_num_tilesresampleresize_to_max_canvasr   )image_transformr   r   r   clip_vit_224_transform5   s   
r#   N)r	   T))torchtune.models.clip._component_buildersr   #torchtune.models.clip._text_encoderr    torchtune.models.clip._tokenizerr    torchtune.models.clip._transformr   strintboolr   r   r#   r   r   r   r   <module>   s"   
