import torch
import torch.nn as nn
from einops import rearrange
from torch import Tensor

from flash_attn.utils.distributed import all_reduce, reduce_scatter


class GPT2Embeddings(nn.Module):
    def __init__(self, embed_dim, vocab_size, max_position_embeddings, padding_idx=None,
                 word_embed_proj_dim=None, device=None, dtype=None):
        """
        If max_position_embeddings <= 0, there's no position embeddings.
        If word_embed_proj_dim is not None (e.g., OPT-350m), we embed to that dimension,
            then project up to embed_dim.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        if word_embed_proj_dim is None:
            self.word_embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx,
                                                **factory_kwargs)
            self.project_in = None
        else:
            self.word_embeddings = nn.Embedding(vocab_size, word_embed_proj_dim,
                                                padding_idx=padding_idx, **factory_kwargs)
            self.project_in = nn.Linear(word_embed_proj_dim, embed_dim, bias=False,
                                        **factory_kwargs)
        self.max_position_embeddings = max_position_embeddings
        if self.max_position_embeddings > 0:
            self.position_embeddings = nn.Embedding(max_position_embeddings, embed_dim,
                                                    **factory_kwargs)

    def forward(self, input_ids, position_ids=None):
        """
        input_ids: (batch, seqlen)
        position_ids: (batch, seqlen)
        """
        batch_size, seqlen = input_ids.shape
        embeddings = self.word_embeddings(input_ids)
        if self.project_in is not None:
            embeddings = self.project_in(embeddings)
        if self.max_position_embeddings > 0:
            if position_ids is None:
                position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
            position_embeddings = self.position_embeddings(position_ids)
            embeddings = embeddings + position_embeddings
        return embeddings


class BertEmbeddings(nn.Module):
    def __init__(self, embed_dim, vocab_size, max_position_embeddings, type_vocab_size,
                 padding_idx=None, device=None, dtype=None):
        """
        If max_position_embeddings <= 0, there's no position embeddings.
        If type_vocab_size <= 0, there's no token type embeddings.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx,
                                            **factory_kwargs)
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        if self.max_position_embeddings > 0:
            self.position_embeddings = nn.Embedding(max_position_embeddings, embed_dim,
                                                    **factory_kwargs)
        if self.type_vocab_size > 0:
            self.token_type_embeddings = nn.Embedding(type_vocab_size, embed_dim,
                                                      **factory_kwargs)

    def forward(self, input_ids, position_ids=None, token_type_ids=None):
        """
        input_ids: (batch, seqlen)
        position_ids: (batch, seqlen)
        token_type_ids: (batch, seqlen)
        """
        batch_size, seqlen = input_ids.shape
        embeddings = self.word_embeddings(input_ids)
        if self.max_position_embeddings > 0:
            if position_ids is None:
                position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
            position_embeddings = self.position_embeddings(position_ids)
            embeddings = embeddings + position_embeddings
        if self.type_vocab_size > 0:
            if token_type_ids is None:
                token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
            token_type_embeddings = self.token_type_embeddings(token_type_ids)
            embeddings = embeddings + token_type_embeddings
        return embeddings


class VocabParallelEmbedding(nn.Embedding):
    def __init__(self, num_embeddings, *args, process_group=None, padding_idx=None, **kwargs):
        self.process_group = process_group
        if process_group is not None:
            world_size = torch.distributed.get_world_size(process_group)
            if num_embeddings % world_size != 0:
                raise ValueError(f"num_embeddings ({num_embeddings}) must be divisible by "
                                 f"world_size ({world_size})")
            if world_size > 1 and padding_idx is not None:
                raise RuntimeError("ParallelEmbedding does not support padding_idx")
        else:
            world_size = 1
        super().__init__(num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs)

    def forward(self, input: Tensor) -> Tensor:
        if self.process_group is None:
            return super().forward(input)
        else:
            rank = torch.distributed.get_rank(self.process_group)
            vocab_size = self.num_embeddings
            vocab_start_index, vocab_end_index = rank * vocab_size, (rank + 1) * vocab_size
            # Ids outside this rank's vocab shard are masked out and zeroed after lookup.
            input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
            input = input - vocab_start_index
            input[input_ids_mask] = 0
            embeddings = super().forward(input)
            embeddings[input_ids_mask] = 0.0
            return embeddings


class ColumnParallelEmbedding(nn.Embedding):
    def __init__(self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs):
        self.process_group = process_group
        if process_group is not None:
            world_size = torch.distributed.get_world_size(process_group)
            if embedding_dim % world_size != 0:
                raise ValueError(f"embedding_dim ({embedding_dim}) must be divisible by "
                                 f"world_size ({world_size})")
        else:
            world_size = 1
        super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs)


class ParallelGPT2Embeddings(nn.Module):
    def __init__(self, embed_dim, vocab_size, max_position_embeddings, process_group,
                 padding_idx=None, sequence_parallel=True, device=None, dtype=None):
        """
        If max_position_embeddings <= 0, there's no position embeddings.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel
        self.word_embeddings = VocabParallelEmbedding(
            vocab_size, embed_dim, padding_idx=padding_idx, process_group=process_group,
            **factory_kwargs,
        )
        self.max_position_embeddings = max_position_embeddings
        if self.max_position_embeddings > 0:
            self.position_embeddings = ColumnParallelEmbedding(
                max_position_embeddings, embed_dim, process_group=process_group, **factory_kwargs
            )

    def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False):
        """
        input_ids: (batch, seqlen)
        position_ids: (batch, seqlen)
        """
        batch_size, seqlen = input_ids.shape
        world_size = torch.distributed.get_world_size(self.process_group)
        embeddings = self.word_embeddings(input_ids)
        if self.max_position_embeddings > 0:
            if position_ids is None:
                position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device)
            position_embeddings = self.position_embeddings(position_ids)
            if world_size <= 1:
                embeddings = embeddings + position_embeddings
            else:
                # Each rank holds a slice of the embedding dim, so only add the position
                # embeddings to this rank's partition of the (still sharded) word embeddings.
                partition_dim = self.position_embeddings.embedding_dim
                rank = torch.distributed.get_rank(self.process_group)
                embeddings[
                    ..., rank * partition_dim : (rank + 1) * partition_dim
                ] += position_embeddings
        if combine_batch_seqlen_dim:
            embeddings = rearrange(embeddings, "b s d -> (b s) d")
        reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
        return embeddings if world_size <= 1 else reduce_fn(embeddings, self.process_group)