o
    i                     @   sp  d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ ddl m!Z! ddl"m#Z# ej$dddddedddddeddddddedddddfded ed!ee d"ee% fd#d$Z&i dd%d ed&ee%ef d"ee% fd'd(Z'd)e#d*ee fd+d,Z(dS )-    )Path)AnyDictListOptionalcastN)util)	debug_cliimport_codeparse_config_overridesshow_validation_error)ConfigSchemaTraining)Doc)registryresolve_dot_names)Argument)Context)Option)Printermsg   )CuratedTransformer)Tok2PiecesModelTpiecesT)allow_extra_argsignore_unknown_options)context_settings.zPath to config file)helpexists
allow_dashz--code-pathz--codez-czNPath to Python file with additional code (registered functions) to be imported)r   z--namez-nz^Name of the transformer pipe to gather piece statistics for (default: first transformer pipe).ctxconfig_path	code_pathtransformer_namec                 C   s$   t | j}t| t|||d dS )zd
    Analyze word- or sentencepiece statistics.
    DOCS: https://spacy.io/api/cli#debug-pieces
    config_overridesr#   N)r   argsr
   debug_pieces)r    r!   r"   r#   	overrides r)   _/home/ubuntu/.local/lib/python3.10/site-packages/spacy_curated_transformers/cli/debug_pieces.pydebug_pieces_cli   s   

r+   r$   r%   c                   s  t  }t| $ tj| |d}tj|dd  j }tj|d t	d}W d    n1 s.w   Y  |d |d g}t
||\}  fdd	 |d u rfd
d  jD }	|	sa|jddd |	d }
n3z |}W n ty   d }|jd| ddd Y nw t|ts|jd| ddd tt|}
|
jd}|d|j  dd  D }dd | D }|d t|| |d t|| d S )N)r(   T)	auto_filltraining)schematrain_corpus
dev_corpusc                      s    S )Nr)   r)   nlpr/   r)   r*   <lambda>B   s    zdebug_pieces.<locals>.<lambda>c                 S   s   g | ]\}}t |tr|qS r)   )
isinstancer   ).0_piper)   r)   r*   
<listcomp>E   s
    z debug_pieces.<locals>.<listcomp>z%Pipeline does not contain transformer   )exitsr   z(Pipeline does not contain a pipe named ''zPipe named 'z' is not a transformerpiece_encoderzFound piece encoder: c                 S      g | ]}|j qS r)   	predictedr5   egr)   r)   r*   r8   ]       c                 S   r=   r)   r>   r@   r)   r)   r*   r8   ^   rB   zTraining corpus statisticszDevelopment corpus statistics)r   r   r   load_configload_model_from_configconfiginterpolater   resolver   r   
initializepipelinefailget_pipeKeyErrorr4   r   r   modelget_refinfonamedividerprint_piece_stats)r!   r%   r#   r   cfgrE   T	dot_namesr0   transformerstransformer_pipetransformer_pipe_callabler<   
train_docsdev_docsr)   r1   r*   r'   1   sH   








r'   r<   docsc                 C   s   |  |}g }|D ]}|j}||  q	t|}tdt|  tdt	|d td|
  d|  d d S )NzMedian token length: zMean token length: z.2fzToken length range: [z, ])predictlengthsextendtolistnumpyarrayr   textmedianmeanminmax)r<   r[   docs_pieceslens
doc_piecesdoc_piece_lenslens_xpr)   r)   r*   rR   f   s   

$rR   ))pathlibr   typingr   r   r   r   r   ra   spacyr   spacy.cli._utilr	   r
   r   r   spacy.schemasr   spacy.tokensr   
spacy.utilr   r   typerr   Argr   r   Optwasabir   r   pipeline.transformerr   tokenization.typesr   commandstrr+   r'   rR   r)   r)   r)   r*   <module>   sP    

5