o
    i#                     @   s  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
mZmZ ddlmZmZ ee
e ef ZedZeee geee  f Zddded	ed
ede	eegef  def
ddZdddeded
ede	eegef  def
ddZ	ddede	eegef  defddZddefdee ded	ed
ededeee  fddZddefdee dedeee  fddZefde
e dedeee  fddZdS )     N)partial)	AnyCallableIterableIteratorListOptionalSequenceTypeVarUnion   )	minibatchregistryItemT
get_lengthsizebufferdiscard_oversizer   returnc                 C   ,   |durd|ini }t tf| ||d|S )a  Create a batcher that uses the `batch_by_padded_size` strategy.

    The padded size is defined as the maximum length of sequences within the
    batch multiplied by the number of sequences in the batch.

    size (int or Sequence[int]): The largest padded size to batch sequences into.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    buffer (int): The number of sequences to accumulate before sorting by length.
        A larger buffer will result in more even sizing, but if the buffer is
        very large, the iteration order will be less random, which can result
        in suboptimal training.
    discard_oversize (bool): Whether to discard sequences that are by themselves
        longer than the largest padded batch size.
    get_length (Callable or None): Function to get the length of a sequence item.
        The `len` function is used by default.
    Nr   )r   r   r   )r   minibatch_by_padded_size)r   r   r   r   	optionals r   K/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/batchers.py"configure_minibatch_by_padded_size   s   r   	tolerancec                 C   r   )a  Create a batcher that uses the "minibatch by words" strategy.

    size (int or Sequence[int]): The target number of words per batch.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    tolerance (float): What percentage of the size to allow batches to exceed.
    discard_oversize (bool): Whether to discard sequences that by themselves
        exceed the tolerated size.
    get_length (Callable or None): Function to get the length of a sequence
        item. The `len` function is used by default.
    Nr   )r   r   r   )r   minibatch_by_words)r   r   r   r   r   r   r   r   configure_minibatch_by_words8   s   r   c                 C   s(   |durd|ini }t tfd| i|S )zCreate a batcher that creates batches of the specified size.

    size (int or Sequence[int]): The target number of items per batch.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    Nr   r   )r   r   )r   r   r   r   r   r   configure_minibatchS   s   r      Fseqsc           
      #   s    t |trt|}nt|}t| |dD ]3 t  t|}t ||D ]"} fdd|D }t	dd |D t
| }	|rE|	|krEq&|V  q&qdS )aq  Minibatch a sequence by the size of padded batches that would result,
    with sequences binned by length within a window.

    The padded size is defined as the maximum length of sequences within the
    batch multiplied by the number of sequences in the batch.

    size (int or Sequence[int]): The largest padded size to batch sequences into.
    buffer (int): The number of sequences to accumulate before sorting by length.
        A larger buffer will result in more even sizing, but if the buffer is
        very large, the iteration order will be less random, which can result
        in suboptimal training.
    discard_oversize (bool): Whether to discard sequences that are by themselves
        longer than the largest padded batch size.
    get_length (Callable or None): Function to get the length of a sequence item.
        The `len` function is used by default.
    )r   c                    s   g | ]} | qS r   r   ).0iouter_batchr   r   
<listcomp>~   s    z,minibatch_by_padded_size.<locals>.<listcomp>c                 s       | ]}t |V  qd S Nlen)r"   seqr   r   r   	<genexpr>       z+minibatch_by_padded_size.<locals>.<genexpr>N)
isinstanceint	itertoolsrepeatiterr   listnext_batch_by_lengthmaxr*   )
r!   r   r   r   r   size_target_sizeindicessubbatchpadded_sizer   r$   r   r   _   s   
r   g?c                 c   sj   t |trt|}nt|}t|}|| }g }g }	d}
d}| D ]}||}||| kr5|s4|gV  q"|dkrI|
| |krI|| |
|7 }
q"|
| | || kr]|	| ||7 }q"|rb|V  t|}|| }|	}|}
g }	d}|
| |kr|| |
|7 }
q"|
| || kr|	| ||7 }q"|r|V  t|}|| }|g}|}
q"||	 |r|V  dS dS )a  Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by
    themselves, or be discarded if discard_oversize=True.

    seqs (Iterable[Sequence]): The sequences to minibatch.
    size (int or Sequence[int]): The target number of words per batch.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    tolerance (float): What percentage of the size to allow batches to exceed.
    discard_oversize (bool): Whether to discard sequences that by themselves
        exceed the tolerated size.
    get_length (Callable or None): Function to get the length of a sequence
        item. The `len` function is used by default.
    r   N)r.   r/   r0   r1   r2   r4   appendextend)r!   r   r   r   r   r7   r8   tol_sizebatchoverflow
batch_sizeoverflow_sizer+   n_wordsr   r   r   r      s\   










r   	max_wordsc                    s    fddt | D }|  g }g }|D ]$\}}|s!|| q|t|d  |kr1|| q|| |g}q|rA|| tdd |D t| ksPJ dd |D }|  |S )zGiven a list of sequences, return a batched list of indices into the
    list, where the batches are grouped by length, in descending order.

    Batches may be at most max_words in size, defined as max sequence length * size.
    c                    s   g | ]
\}} ||fqS r   r   )r"   r#   r+   r   r   r   r&      s    z$_batch_by_length.<locals>.<listcomp>   c                 s   r'   r(   r)   )r"   br   r   r   r,      r-   z#_batch_by_length.<locals>.<genexpr>c                 S   s   g | ]}t t|qS r   )r3   sorted)r"   r?   r   r   r   r&      s    )	enumeratesortr<   r*   sumreverse)r!   rD   r   lengths_indicesbatchesr?   lengthr#   r   r   r   r5      s"   	

r5   r(   )r0   	functoolsr   typingr   r   r   r   r   r   r	   r
   r   utilr   r   r/   Sizingr   BatcherTboolr   floatr   r   r*   r   r   r5   r   r   r   r   <module>   s    ,
'



*

M
