o
    NiH                     @  sR  d dl mZ d dlmZmZ d dlmZmZ d dlZd dlm	Z	m
Z
mZ d dlmZ d dlm	  mZ d dlmZ d dlmZmZmZmZ d	d
 Zdd Zdd Zd<ddZd=ddZdd Zd>ddZdd Zd?ddZd d! Z d@dAd&d'Z!dBd(d)Z"dCd+d,Z#dDd.d/Z$dEd2d3Z%dFd4d5Z&e'e#e$e%e&d6Z(	-	7dGd8d9Z)G d:d; d;eZ*dS )H    )annotations)ceillog)TupleCallableN)nntensorTensor)Module)pad_sequence)	rearrangerepeatpackunpackc                 C  s   | d uS N )valr   r   Y/home/ubuntu/.local/lib/python3.10/site-packages/x_transformers/autoregressive_wrapper.pyexists   s   r   c                 C  s   t | r| S |S r   )r   )r   dr   r   r   default      r   c                 O  s   | S r   r   )targskwargsr   r   r   identity   s   r   , c                 C  s
   | | S r   )join)arr	delimiterr   r   r   r      s   
r      c                 C  s   t | tr| S | f| S r   )
isinstancetuple)r   lengthr   r   r   
cast_tuple   s   r$   c                   s    fdd}|S )Nc                   s2   | j }|    | g|R i |}| | |S r   )trainingevaltrain)selfr   r   was_trainingoutfnr   r   inner   s
   
zeval_decorator.<locals>.innerr   )r,   r-   r   r+   r   eval_decorator   s   r.   #B;c                 C  s   | j |d S )N)min)clampr   )r   epsr   r   r   r   (   r   r   c                 C  s   t t t|   S r   )r   torch	rand_liker   r   r   r   gumbel_noise+   s   r6         ?ư>c                 C  s"   t | }| t|| | jddS )Ndim)r6   maxargmax)logitstemperaturer2   noiser   r   r   gumbel_sample.   s   rA   c                   s0   | j D ]}|jdkr fdd|jD |_qd S )Nac                   s   g | ]} |qS r   r   .0r   r+   r   r   
<listcomp>7   s    z$modify_cached_kv.<locals>.<listcomp>)attn_intermediates
layer_type	cached_kv)cacher,   interr   r+   r   modify_cached_kv4   s
   

rK   r9           padtuple[int, int]c                 C  sN   |dkr| S |dk r| d n| j | d }d| }tj| g ||R |dS )N)r   r   r   r    value)ndimFrM   )r   rM   r;   rP   dims_from_rightzerosr   r   r   
pad_at_dim;   s
    rU   c                 C  s   g | j d d | j| jR \}}}}|jdkr |j d |ks"J | |ks*J || }| }tj||tjdd }	tj||tjd}
t| |df|dd} || }| |	|
|d  df }|S )N   r    r   )devicedtype).N)rP   r;   .)	shaperW   rX   rQ   amaxr3   arangelongrU   )r   lenspad_idbatchseq_lenrW   rX   pad_lensmax_pad_lenbatch_arangeprompt_len_arangeoffsetalignedr   r   r   align_rightC   s   (rg   ?c                 C  s\   t j| dd\}}t jtj|dddd}||k}tj|ddd}td||< |d	||S )
NT)
descendingr9   r:   r    r9   FrO   -infr    )r3   sortcumsumrR   softmaxrM   floatscatter)r>   thressorted_logitssorted_indices	cum_probssorted_indices_to_remover   r   r   top_pW   s   rv   皙?c                 C  sX   | j d }t|t|| }t||}t| |\}}t| td}|d|| |S )Nr9   rk   r    )	rY   r   r   r0   r3   topk	full_likero   scatter_)r>   frac_num_tokensk
num_tokensr   indprobsr   r   r   top_kc   s   

r          @{Gz?c                 C  s@   | j dd}|jddd}t||| }t||k td| S Nr9   r:   Tr;   keepdimrk   )rn   rZ   r3   powwherero   )r>   	min_p_powmin_p_ratior   	max_probslimitr   r   r   top_ap   s   r   c                 C  s8   | j dd}|jddd}|| }t||k td| S r   )rn   rZ   r3   r   ro   )r>   min_pr   r   r   r   r   r   r   y   s   r   )rv   r   r   r         ?c                 C  sJ   t || jddd }d| |  ||  }|| |k t| jj }|S )zE
    Appendix A Algorithm 2
    https://arxiv.org/abs/2309.09117
    r9   Tr   r    )r   rZ   masked_fillr3   finforX   r<   )expert_logitsamateur_logitsalphabetacutoffdiffscontrastive_decode_logitsr   r   r   contrastive_decode_fn   s   r   c                      s   e Zd Z					d  fdd	Ze eddd	d
dd	ede df
d!ddZ	e ed	d
d	e
dd	e eddddf	d"ddZ			d#ddZ  ZS )$AutoregressiveWrapperr   rL   Frw   c                   sN   t    || _|| _|| _|j| _|dk sJ || _|| _|j| _|| _	d S )Nr7   )
super__init__	pad_valueignore_indexnetmax_seq_len	mask_probadd_attn_z_lossadd_continuous_pred_headnext_embed_loss_weight)r(   r   r   r   r   r   r   	__class__r   r   r      s   
	
zAutoregressiveWrapper.__init__   Nr7   Tprompt_lensTensor | Nonefilter_logits_fnstr | Callablefilter_kwargsdictc           )        sv  t |rJ d| j|dk|j}}t|gd\}}|j\}}t|	tr:|	tv s6J dtt	  dt|	 }	d }t |rLt
||| jd}|| }|}d }|oU| jj}tj|f|d}tj||d}t|D ]}|dk}|
r|jd	 k}|r|r| jjsJ d
|d d  d f }t |rt|fdd | j|fd||d|\}}|r|}|d d d	f }|jd	d}|r|s|	|fi |}|| t| }|j d	dj} |d	| }!t|d d}||! }t|d d}t| d} |r|rt| fdd tj|| fd	d}t|d|d}|jd	 }"|" krR|jd	dd\}}#|d d d  f }|#d d d  f }$|"|d d d f  |$ }$t|$d}%||% }t|d}t |s]qi||k}&|&jd	d rl nqit |rt |&d}'|'! j"d	ddk}(|#|(| j}t|d|d}|d|d f }t$||d\}|s|ddd d f S t|d|d}t|d}||fS ) Nzeos token not supported yetrL   * nonly  are availabler^   rW   r   r9   the network cannot use cached key values when decoding outside the max sequence length. most likely because you are using absolute positional embedding. you can switch to rotary embeddings to resolve this issuec                   s   | d d  d d d f S )N.r    r   r5   r   r   r   <lambda>   s    z3AutoregressiveWrapper.beam_search.<locals>.<lambda>Treturn_intermediatesrI   seq_start_posr:   zb -> b beamsbeamsb ... -> (b beams) ...zb beams -> (b beams) 1c                   s   t | d dS )Nr   r   )r   r5   r   r   r   r   '  s    z6(b prev_beams) next_beams -> b (prev_beams next_beams))b)r;   ri   zb beams -> (b beams)rj   r    z(b beams) seq -> b beams seq.z	* beams nz(b beams) -> beams bzb beams n -> beams b n)%r   r   rW   r   rY   r!   strFILTER_LOGITS_FNr   keysrg   r   r   can_cache_kvr3   rT   r[   range can_cache_kv_outside_max_seq_lenrK   log_softmaxr6   rx   indicesgatherr   r   catrl   anyallrR   rM   ro   rm   r   r   ))r(   promptsr`   r   return_beams_and_scores	eos_tokenr?   
stochasticr   r   restrict_to_max_seq_lenr   cache_kvr   greedyrW   packed_shaper_   orig_seq_lenr   r*   rI   should_cachescoresrc   iis_firstmax_len_exceededxr>   	new_cache	log_probssamplesnext_scorescurr_num_beamssort_indicestop_beams_indicesflattened_beam_indicesis_eos_tokensshifted_is_eos_tokensmaskr   )r   r   r   beam_search   s   

 








z!AutoregressiveWrapper.beam_searchr   )r   r   r   list[Tensor] | Tensoramateur_modelModule | Tuple[Module] | Nonecontrastive_decode_kwargsdict | Tuple[dict]c           )        s  | j |dk }t|tr3t|dksJ dt|rJ dtdd |D |d jd}t|dd	}t|gd
\}}g |j	|jR \}}}t|t
r`|tv s\J dtt  dt| }d }t|rrt||| jd}|| }|}d }t|rt|}t|
}
t|t|
ksJ d gt| }t}t|D ]\}}t|tr|j||< |  qt|D ]}|r|j	d  k}|r|r| jjsJ d|d d   d f }t|r|jD ]}|jdkr fdd|jD |_q| j|fd||d|\}}|r| jjr|}|d d df }t|rXtt|||
D ]=\}\}}} ||fd||d|\}!}"|!d d df }!|!j	|j	ksBJ dt||!fi | }|rV|jrV|"||< q|rc|jddd}#n||fi |	}$tj |$| dd}%t!"|%d}#t!j#||#fdd}t|sq||k}&|&j$dd% r nqt|rt&|&d}'|'' j(dddk}(|)|(| j}|d d |d f }t*||d
\}|S )NrL   r   zprompts cannot be empty listzM`prompt_len` will be auto derived if prompts are passed in as list of Tensorsc                 S  s   g | ]}|j d  qS r   )rY   rC   r   r   r   rE   {  s    z2AutoregressiveWrapper.generate.<locals>.<listcomp>r   T)batch_firstr   r   r   r   r9   r   rB   c                   s(   g | ]}|d  d  dddf qS ).r    Nr   rC   r   r   r   rE     s   ( r   zBlogits dimension are not the same between amateur and expert modelr   r:   r    rj   )+r   r!   listlenr   r   rW   r   r   rY   r   r   r   r   rg   r   r$   r   	enumerater   r   r&   r   r   rF   rG   rH   r   zipr   r=   rR   rn   r3   multinomialr   r   r   rM   ro   rm   r   r   ))r(   r   r`   r   r?   r   r   r   r   r   r   r   r   r   psr   r   rW   r   r*   rI   amateur_cachesr   module_r   r   rJ   r>   r   amateuramateur_cache!amateur_contrastive_decode_kwargsr   next_amateur_cachesamplefiltered_logitsr   r   r   r   r   r   r   generate_  s   

 









zAutoregressiveWrapper.generatec                 K  s  |j d | j| j| jf\}}}}||d d dd f }	}
t|	|k| j|	}	| jdkrjtj|	j |j	d}t
|jj |d d df< tt|| j |d }|j|ddj}t|	d|d  }|j|d | j|	fd	|||d
|\}}|r|\}\}}n|}t|r|j d }|d d |d f }|d d d df }| jjstjntj}|t|d|
|d}|r||j }|r|
|k}|d d d df }|d d dd f  }tj||dd}||  }||| j   }|s|S |||ffS )Nr    rL   r   r   r9   r:   r7   )self_attn_kv_maskT)r   return_attn_z_lossreturn_next_embed_predprepend_embedszb n c -> b c n)r   none)	reduction)!rY   r   r   r   r3   r   r   r   randnrW   r   rX   r<   r0   intrx   r   
zeros_likerp   boolupdater   r   output_is_log_probrR   cross_entropynll_lossr   attn_z_lossdetachl1_lossmeanr   )r(   r   return_outputsr   r   seqr   r   add_next_embed_lossinptargetrandnum_maskr   r   r*   rI   r>   next_embed_predinit_embedsprepend_lenloss_fnloss
embed_predcont_targets	cont_lossr   r   r   forward  sZ    



zAutoregressiveWrapper.forward)r   r   rL   Frw   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   )FN)__name__
__module____qualname__r   r3   no_gradr.   r   r   r   r   r   r  __classcell__r   r   r   r   r      sP     & "r   )r   )r    )r/   )r7   r8   )r9   rL   )rM   rN   r   )rh   )rw   N)r   r   )rw   )rw   r   )+
__future__r   mathr   r   typingr   r   r3   r   r   r	   torch.nnr
   torch.nn.functional
functionalrR   torch.nn.utils.rnnr   einopsr   r   r   r   r   r   r   r   r$   r.   r6   rA   rK   rU   rg   rv   r   r   r   r   r   r   r   r   r   r   r   <module>   sF    








	
