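"""Scaled dot product attention (SDPA) for jagged-layout NestedTensors.

This module validates jagged ``NestedTensor`` query/key/value inputs, picks a
viable kernel (flash attention, memory-efficient attention, or the math
fallback), and converts between the jagged layout and the packed "varlen"
buffers that the fused kernels expect.
"""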
import logging
from typing import Optional, Tuple

import torch
import torch.nn
import torch.nn.functional as F
from torch.backends.cuda import (
    can_use_efficient_attention,
    can_use_flash_attention,
    flash_sdp_enabled,
    math_sdp_enabled,
    mem_efficient_sdp_enabled,
    SDPAParams,
)
from torch.nn.attention import SDPBackend

from .nested_tensor import buffer_from_jagged, NestedTensor, ViewNestedFromBuffer

log = logging.getLogger(__name__)


def _validate_sdpa_input(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Optional[torch.Tensor] = None,
    dropout_p=0.0,
    is_causal=False,
    scale=None,
):
    if (
        not isinstance(query, NestedTensor)
        or not isinstance(key, NestedTensor)
        or not isinstance(value, NestedTensor)
    ):
        raise ValueError(
            f"Expected query, key, and value to be nested tensors, "
            f"but got query.is_nested: {query.is_nested}, key.is_nested: {key.is_nested}, "
            f"and value.is_nested: {value.is_nested} instead."
        )
    if query.dtype != key.dtype or query.dtype != value.dtype:
        raise ValueError(
            f"Expected query, key, and value to have the same dtype, "
            f"but got query.dtype: {query.dtype}, key.dtype: {key.dtype}, "
            f"and value.dtype: {value.dtype} instead."
        )
    if query.device != key.device or query.device != value.device:
        raise ValueError(
            f"Expected query, key, and value to have the same device type, "
            f"but got query.device: {query.device}, key.device: {key.device}, "
            f"and value.device: {value.device} instead."
        )
    if query.dim() < 2 or key.dim() < 2 or value.dim() < 2:
        raise ValueError(
            f"Expected query, key, and value to all be at least 2 dimensional, "
            f"but got query.dim: {query.dim()}, key.dim: {key.dim()} "
            f"and value.dim: {value.dim()} instead."
        )
    if query._ragged_idx != key._ragged_idx or query._ragged_idx != value._ragged_idx:
        raise ValueError(
            f"Expected query, key, and value to all be ragged on the same dimension, "
            f"but got ragged dims {query._ragged_idx}, {key._ragged_idx}, "
            f"and {value._ragged_idx}, respectively."
        )
    if attn_mask is not None:
        raise ValueError("Masks are not yet supported!")

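# Illustrative note (not from the original source): the checks above accept
# jagged NestedTensors of logical shape (batch, num_heads, ragged seq_len,
# head_dim) that share dtype, device, and ragged dimension; mixing dtypes,
# devices, or ragged dims raises, as does passing any attn_mask, e.g.:
#
#   _validate_sdpa_input(q, k, v)                  # ok for matching jagged NTs
#   _validate_sdpa_input(q, k, v, attn_mask=mask)  # always raises ValueError
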
def _check_batch_size_nested(params: SDPAParams, debug=False) -> bool:
    # Shapes have already been checked, so the size() calls below are safe.
    q_batch_size = params.query.size(0)
    k_batch_size = params.key.size(0)
    v_batch_size = params.value.size(0)
    # num_heads is validated separately in _check_for_seq_len_0_nested, which
    # also guards against a ragged num_heads dimension.
    return q_batch_size == k_batch_size and q_batch_size == v_batch_size


def _check_head_dim_size_flash_nested(params: SDPAParams, debug=False) -> bool:
    max_size = 256
    query_size_last = params.query.size(-1)
    key_size_last = params.key.size(-1)
    value_size_last = params.value.size(-1)
    same_head_dim_size = (
        query_size_last == key_size_last and query_size_last == value_size_last
    )
    if not (
        same_head_dim_size
        and (query_size_last % 8 == 0)
        and (query_size_last <= max_size)
    ):
        if debug:
            log.warning(
                "For NestedTensor inputs, Flash attention requires q,k,v to have the same "
                "last dimension and to be a multiple of 8 and less than or equal to 256. "
                "Got Query.size(-1): %d, Key.size(-1): %d, Value.size(-1): %d instead.",
                query_size_last,
                key_size_last,
                value_size_last,
            )
        return False
    return True


def _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
    param: torch.Tensor, param_name: str, debug=False
) -> bool:
    assert isinstance(param, NestedTensor), "param should be a jagged NT"

    if param._ragged_idx == 2:
        # num_heads is the ragged dimension.
        if debug:
            log.warning(
                "Fused kernels do not support ragged num_head_dims, %s has a ragged num_heads.",
                param_name,
            )
        return False

    if param._min_seqlen == 0:
        if debug:
            log.warning(
                "Fused kernels do not support seq_len == 0, %s has a seq len of 0.",
                param_name,
            )
        return False

    return True


def _try_broadcast_param_size(q_size, k_size, v_size, param_name, debug=False) -> bool:
    max_size = max(q_size, k_size, v_size)
    if (
        (q_size != max_size and q_size != 1)
        or (k_size != max_size and k_size != 1)
        or (v_size != max_size and v_size != 1)
    ):
        if debug:
            log.warning(
                "Both fused kernels require query, key and value to have broadcastable %s, "
                "got Query %s %d, Key %s %d, Value %s %d instead.",
                param_name,
                param_name,
                q_size,
                param_name,
                k_size,
                param_name,
                v_size,
            )
        return False
    return True


def _check_for_seq_len_0_nested(params: SDPAParams, debug=False) -> bool:
    # When this is called, the inputs are guaranteed to be 4-dimensional.
    q_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.query, "query", debug
        )
        if params.query.is_nested
        else True
    )
    # Short-circuit as soon as any input is unsafe.
    if not q_is_safe:
        return False

    k_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.key, "key", debug
        )
        if params.key.is_nested
        else True
    )
    if not k_is_safe:
        return False

    v_is_safe = (
        _check_for_seq_len_0_and_consistent_head_dim_nested_helper(
            params.value, "value", debug
        )
        if params.value.is_nested
        else True
    )
    if not v_is_safe:
        return False

    # None of the inputs have a ragged num_heads, so size(1) is well defined.
    q_num_heads = params.query.size(1)
    k_num_heads = params.key.size(1)
    v_num_heads = params.value.size(1)
    same_num_heads = q_num_heads == k_num_heads and q_num_heads == v_num_heads

    if not same_num_heads:
        if (
            params.query.requires_grad
            or params.key.requires_grad
            or params.value.requires_grad
        ):
            if debug:
                log.warning(
                    "Both fused kernels do not support training with broadcasted NT inputs."
                )
            return False
        return _try_broadcast_param_size(
            q_num_heads, k_num_heads, v_num_heads, "num heads", debug
        )
    return True

rH   c                 C   s(   t ttf}|D ]
}|| |s dS qdS NFT)r+   r6   rH   r$   r'   constraints
constraintr!   r!   r"   _can_use_flash_sdpa_jagged   s   
rM   c                 C   s&   t tf}|D ]
}|| |s dS qdS rI   )r+   rH   rJ   r!   r!   r"   _can_use_efficient_sdpa_jagged   s   
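# Both checkers above share one shape: a tuple of predicates over SDPAParams
# evaluated until the first failure. A hypothetical additional backend check
# would follow the same pattern:
#
#   def _can_use_new_backend_jagged(params: SDPAParams, debug=False) -> bool:
#       for constraint in (_check_batch_size_nested, _check_for_seq_len_0_nested):
#           if not constraint(params, debug):
#               return False
#       return True
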
rN   c                 C   sd   | j dd r| jdd r| jdd s$|r"td dS | jr0|r.td dS dS )Nr   r   zGIf inputs are nested tensors they must be contiguous after transposing.FzENested tensors for query / key are not supported when is_causal=True.T)r   	transposeis_contiguousr   r   r/   r0   r   )r$   r'   r!   r!   r"   _can_use_math_sdpa_jagged   s$   rQ   c           	      C   s  t  st st stjS tjtjtjf}t| |||||}|D ]7}|tjkr3t	|r3t
|r3tj  S |tjkrEt|rEt|rEtj  S |tjkrVt rVt|rVtj  S qtd t|dd t|dd td t	|dd t
|dd td t|dd tjS )Nz)Memory efficient kernel not used because:T)r'   z(Flash attention kernel not used because:z'Math attention kernel not used because:)r   r   r   r
   ERRORFLASH_ATTENTIONEFFICIENT_ATTENTIONMATHr	   r   rM   r   rN   rQ   r/   r0   )	r   r   r   r   dropoutr   orderingr$   backendr!   r!   r"   _select_sdp_backend  sD   









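# Note on the selection above: backends are tried in priority order (flash,
# then memory-efficient, then math), and the debug=True re-run only happens
# once every backend has been rejected, so warnings are emitted only on the
# failure path rather than on every call.
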
def _cumulative_and_max_seq_len_nnz(qkv: torch.Tensor) -> Tuple[torch.Tensor, int, int]:
    # Computes the metadata needed by the flash-attention and
    # efficient-attention kernels: the cumulative sequence lengths over the
    # batch, the maximum sequence length, and the total number of elements
    # (the last entry of the cumulative sequence lengths).
    if not isinstance(qkv, NestedTensor):
        raise ValueError("QKV must be nested for flash cumulative_seq_len calculation.")

    if qkv.lengths() is None:
        cumulative_seqlen = qkv.offsets().to(dtype=torch.int32, device=qkv.device)
        max_seqlen = qkv._max_seqlen
        n_elem = qkv.values().shape[0]
    else:
        cumulative_seqlen = (
            qkv.lengths().cumsum(0).to(dtype=torch.int32, device=qkv.device)
        )
        batch_size = qkv.size(0)
        max_seqlen = qkv._max_seqlen
        n_elem = int(cumulative_seqlen[-1].item())
    return cumulative_seqlen, max_seqlen, n_elem


def _is_safe_to_get_storage_as_tensor(tensor: torch.Tensor):
    # Checks whether a jagged NestedTensor's buffer can be used directly by
    # the fused kernels without first calling contiguous(). After the
    # transpose to [bsz, {seq_len}, num_heads, dim], the per-tensor strides
    # must be strictly decreasing for the flat buffer view to be valid.
    assert isinstance(tensor, NestedTensor)
    offsets = tensor.offsets()
    strides = tensor._strides

    n_tensors = offsets.size(0) - 1
    if n_tensors <= 1:
        return True

    prev_stride = strides[1]
    for stride in strides[2:]:
        if prev_stride <= stride:
            return False
        prev_stride = stride
    return True


def _view_as_dense(
    tensor: torch.Tensor, Nnz: int, num_heads: int, head_dim: int
) -> torch.Tensor:
    if tensor.is_nested:
        return buffer_from_jagged(tensor)
    return tensor.view(Nnz, num_heads, head_dim)


def _sdpa_nested_preprocessing(query, key, value):
    # Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
    # Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
    # Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
    q_batch_size = query.size(0)
    k_batch_size = key.size(0)
    v_batch_size = value.size(0)

    q_num_heads = query.size(1)
    k_num_heads = key.size(1)
    v_num_heads = value.size(1)

    if not (q_batch_size == k_batch_size and q_batch_size == v_batch_size) or not (
        q_num_heads == k_num_heads and q_num_heads == v_num_heads
    ):
        raise RuntimeError(
            "This path is currently not implemented for jagged layout NT."
        )

    num_heads = query.size(1)
    head_dim_qk = query.size(-1)
    head_dim_v = value.size(-1)
    q_t = query.transpose(1, 2)
    k_t = key.transpose(1, 2)
    v_t = value.transpose(1, 2)

    (
        cumulative_sequence_length_q,
        max_seqlen_batch_q,
        Nnz_q,
    ) = _cumulative_and_max_seq_len_nnz(q_t)
    (
        cumulative_sequence_length_kv,
        max_seqlen_batch_kv,
        Nnz_kv,
    ) = _cumulative_and_max_seq_len_nnz(k_t)

    # If the physical memory layout is not contiguous (and not safely viewable
    # as a flat buffer), fall back to an explicit contiguous() call.
    if not q_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(q_t):
        q_t = q_t.contiguous()
    if not k_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(k_t):
        k_t = k_t.contiguous()
    if not v_t.is_contiguous() and not _is_safe_to_get_storage_as_tensor(v_t):
        v_t = v_t.contiguous()

    query_buffer_reshaped = _view_as_dense(q_t, Nnz_q, num_heads, head_dim_qk)
    key_buffer_reshaped = _view_as_dense(k_t, Nnz_kv, num_heads, head_dim_qk)
    value_buffer_reshaped = _view_as_dense(v_t, Nnz_kv, num_heads, head_dim_v)

    output_nt_info = {
        "offsets": q_t.offsets(),
        "_max_seqlen": q_t._max_seqlen,
        "_min_seqlen": q_t._min_seqlen,
    }

    return (
        query_buffer_reshaped,
        key_buffer_reshaped,
        value_buffer_reshaped,
        cumulative_sequence_length_q,
        cumulative_sequence_length_kv,
        max_seqlen_batch_q,
        max_seqlen_batch_kv,
        output_nt_info,
    )


def _pad_last_dim(tensor: torch.Tensor, alignment_size: int, slice: bool) -> torch.Tensor:
    # FlashAttention v2 requires the head dimension to be a multiple of 8, so
    # pad it out here rather than inside the kernel (which could otherwise
    # alias query, key, and value).
    last_dim_size = tensor.size(-1)
    if last_dim_size % alignment_size == 0:
        return tensor
    pad_count = alignment_size - (last_dim_size % alignment_size)
    tensor = torch.nn.functional.pad(tensor, [0, pad_count])
    if slice:
        return tensor[..., 0:last_dim_size]
    return tensor


def _calculate_scale(query, scale):
    softmax_scale = scale if scale is not None else torch.sym_sqrt(1.0 / query.size(-1))
    return softmax_scale


def _post_process_flash_output(out: torch.Tensor, og_size):
    if not out.is_nested and out.size(-1) != og_size:
        out = out[..., 0:og_size]
    return out


def jagged_scaled_dot_product_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Optional[torch.Tensor] = None,
    dropout_p=0.0,
    is_causal=False,
    scale=None,
):
    _validate_sdpa_input(query, key, value, attn_mask, dropout_p, is_causal, scale)
    assert (
        isinstance(query, NestedTensor)
        and isinstance(key, NestedTensor)
        and isinstance(value, NestedTensor)
    )

    # Special path for non-ragged sequence length (e.g. a ragged batch dim
    # instead): the dense buffers can go through vanilla SDPA directly.
    if query.dim() > 3 and key.dim() > 3 and value.dim() > 3 and query._ragged_idx == 1:
        from torch.nested._internal.ops import extract_kwargs

        output = F.scaled_dot_product_attention(
            query._values,
            key._values,
            value._values,
            attn_mask=(
                attn_mask._values if isinstance(attn_mask, NestedTensor) else attn_mask
            ),
            dropout_p=dropout_p,
            is_causal=is_causal,
            scale=scale,
        )
        return NestedTensor(output, **extract_kwargs(query))

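    # From here on the ragged dimension is handled explicitly: the fused
    # kernels consume packed (Nnz, num_heads, head_dim) buffers plus
    # cumulative sequence lengths from _sdpa_nested_preprocessing, while the
    # math backend round-trips through a strided-layout NestedTensor.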
pa|j
pa|j
}	t| |||||}
|
tjkr| d}t| dd}t|dd}t|dd}t| |}t|||\}}}}}}}}tjjj|||||||||d|d	\}}}}}t||d
 dd}t||S |
tjkr	t| ||\}}}}}}}}tjjj|d|d|dd |||||t||	|d	\}} }!}"}#}t|d|d
 ddS |
tjkrZ|   }$| j!d }%|j!d }&dd }'|'| } |'|}|'|}tj"| ||||||d	d }(|(dd# $ }(|(%d|%|&}(t|(|$}(|(dd}(|(S t&d)Nrv   r   r   )extract_kwargs)r   r   r   r    r-   r.   F)r    r\   r   c                 S   sd   | j dd  | j d d  }t| dd}t|jt|dd}tjt|}|dd }|S )Nr   r-   r   r   )r   )	_offsetsr   rO   r   splitlistnestedas_nested_tensorrx   )jagged_layout_ntr[   rO   tensor_list
strided_ntr!   r!   r"    get_strided_layout_nested_tensor  s   zMjagged_scaled_dot_product_attention.<locals>.get_strided_layout_nested_tensorz=No viable backend for scaled_dot_product_attention was found.)'r#   r   r   r   r   torch.nested._internal.opsr   Fscaled_dot_product_attention_valuesr@   rY   r
   rS   r&   r   r   r   r   opsaten_flash_attention_forwardr   applyrO   r   rT   _efficient_attention_forward	unsqueezerc   squeezerU   r\   _size"_scaled_dot_product_attention_mathrx   r`   rt   rw   ))r   r   r   r   r   r   r    r   outputcompute_logsumexpbackend_choicer   query_padded
key_paddedvalue_paddedog_scaler   r   r   r~   r   r   r   r   	attention	logsumexpphilox_seedphilox_offsetdebug_attn_maskquery_reshapedkey_reshapedvalue_reshaped
log_sumexpseedoffsetmax_seqlen_qr\   d1d2r   attn_outr!   r!   r"   #jagged_scaled_dot_product_attentioni  s   	.	




	


	

r   )Nr   FN)F)0loggingtypingr   r   r   torch.nntorch.nn.functionalr   r   r   torch.backends.cudar   r   r   r   r   r	   torch.nn.attentionr
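
# Illustrative usage sketch (not part of the original module; sizes are made
# up). This function is an internal entry point, normally reached by calling
# the public F.scaled_dot_product_attention on jagged NestedTensor inputs:
#
#   import torch
#   import torch.nn.functional as F
#
#   qs = [torch.randn(s, 2, 8) for s in (3, 5, 4)]   # (ragged seq, heads, dim)
#   q = torch.nested.nested_tensor(qs, layout=torch.jagged).transpose(1, 2)
#   out = F.scaled_dot_product_attention(q, q, q)    # routes to the jagged path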