o
    ॵiE&                     @   s   d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ e ZdgZejejejdG dd deZ dS )    N)AnyDictListUnion)Dataset)	Pipelines)Model)
OutputKeys)PipelineTensor)	PIPELINES),DocumentSegmentationTransformersPreprocessor)Tasks)
get_loggerDocumentSegmentationPipeline)module_namec                	       s   e Zd Z				ddeeef dededef fdd	Zd
eeee  ee ef de	ee
f fddZd
eeee  ee ef de	ee
f fddZde	eef de	eef fddZdeeee  ee ef fddZdd Z  ZS )r   NgpuTmodelpreprocessorconfig_filedevicec                    sv   t  jd|||||d| |dd |dd | jj| _| jj| _|du r9t| j| jjjfi || _	dS dS )a8  The document segmentation pipeline.

        Args:
            model (str or Model): Supply either a local model dir or a model id from the model hub
            preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
            the model if supplied.
        )r   r   r   r   auto_collatecompileNcompile_options )
super__init__popr   	model_dir	model_cfgr   configmax_position_embeddingsr   )selfr   r   r   r   r   kwargs	__class__r   k/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/pipelines/nlp/document_segmentation_pipeline.pyr      s(   

z%DocumentSegmentationPipeline.__init__	documentsreturnc                 C   s   |  |}| |}|S )N)predictpostprocess)r"   r'   outputr   r   r&   __call__>   s   

z%DocumentSegmentationPipeline.__call__c                    s    |} jd dkr|d}t|} | j}t| jj }t| jj } jd dkr9|d |d}|d}	| jj}
 j	sU j
r\ jd	 r\ js\   t   fd
d| D } j	jdi |j }W d    n1 sw   Y  tj|dd}t|	t|ksJ d|t|	t| fddt||D } fddt||D }g }t|D ]} jd dkr|g g g || d q|g g g d qt||	||
D ]f\}}}} jd dkr3t|t|k r	|d |d t|t|ksJ dt|t|t|t|ks3J dt|t||| d | || d | || d | q jd dkrt|D ]*}t|| d d t|| d ksrJ || d d || d d q[|S )Nleveltopic
paragraphstypebertsegment_idslabels	sentencesr   c                    s$   i | ]\}}|t | jqS r   )torchtensortor   ).0keyvalr"   r   r&   
<dictcomp>c   s    z8DocumentSegmentationPipeline.predict.<locals>.<dictcomp>   )axisz(sample {}  infer_sample {} prediction {}c                    (   g | ]\}} fd dt ||D qS )c                    s$   g | ]\}}|d kr j j| qS ir   
label_listr8   plr;   r   r&   
<listcomp>p       
CDocumentSegmentationPipeline.predict.<locals>.<listcomp>.<listcomp>zipr8   
predictionlabelr;   r   r&   rF   o       
z8DocumentSegmentationPipeline.predict.<locals>.<listcomp>c                    r?   )c                    s$   g | ]\}}|d kr j j| qS r@   rA   rC   r;   r   r&   rF   w   rG   rH   rI   rK   r;   r   r&   rF   v   rN   )r4   r3   predictionsr/   )r4   r3   rO   docB-EOPz{} {}rO      r   )cut_documentsr   r   r   	from_dictr   lencontext_column_nameexample_id_column_namer   has_multiple_modelsmodels_model_prepareprepare_modelr5   no_graditemsforwardlogitscpunpargmaxformatrJ   rangeappendextend)r"   r'   pred_samplesr/   predict_examplespredict_datasetnum_examplesnum_samplesr3   r4   example_idsinputrO   true_predictionstrue_labelsoutirL   sentence_listrM   
example_idr   r;   r&   r)   E   s   















z$DocumentSegmentationPipeline.predictinputsc                 C   sn  g }g }t |}| jd dkrlt|D ]W}g }g }t|| d || d || d D ]+\}}	}
| }|	dkrEd|dg}|d	 nd|d
g}|d || q+|| dd|  }|| qn;t|D ]6}g }t|| d || d D ]\}}	| }|	dkrd|d
g}|| qdd| }|| qp|d	krtj|d iS tj|iS )zprocess the prediction results

        Args:
            inputs (Dict[str, Any]): _description_

        Returns:
            Dict[str, str]: the prediction results
        r-   r.   r/   rO   r3   rQ    z

	rR   z
	r   	r4   )	rU   r   rd   rJ   stripjoinre   r	   TEXT)r"   rt   result	res_preds
list_countnumrespredsrD   rE   documentr   r   r&   r*      sH   	





z(DocumentSegmentationPipeline.postprocessparac                 C   sH  |}g }g }g }g }d}| j d dkrmt|tr|gg}n
t|d tr'|g}|D ]<}g }	g }
|D ]}| |}|	| |
dgt|d  dg  q1|| ||	 ||
 || |d7 }q)||||dS t|tru|g}|D ]&}| |}	dgt|	d  dg }
||	 ||
 || |d7 }qw|||d	S )
Nr   r-   r.   z-100rR   rQ   )rs   r4   r/   r3   O)rs   r4   r3   )r   
isinstancestrcut_sentencerf   rU   re   )r"   r   document_listr/   r4   r3   rs   idr   sentencerM   itemsentence_of_current_paragraphr   r   r&   rS      sZ   















z*DocumentSegmentationPipeline.cut_documentsc                 C   sT   t dd|}t dd|}t dd|}t dd|}| }dd |dD S )	Nu   ([。！.!？\?])([^”’])z\1\n\2u   (\.{6})([^”’])u   (\…{2})([^”’])u*   ([。！？\?][”’])([^，。！？\?])c                 S   s   g | ]}|r|qS r   r   )r8   _r   r   r&   rF     s    z=DocumentSegmentationPipeline.cut_sentence.<locals>.<listcomp>
)resubrstripsplit)r"   r   r   r   r&   r     s   z)DocumentSegmentationPipeline.cut_sentence)NNr   T)__name__
__module____qualname__r   r   r   r   r   r   r   r   r,   r)   r   r*   rS   r   __classcell__r   r   r$   r&   r      s>    
!



"]$15)!r   typingr   r   r   r   numpyra   r5   datasetsr   modelscope.metainfor   modelscope.modelsr   modelscope.outputsr	   modelscope.pipelines.baser
   r   modelscope.pipelines.builderr   modelscope.preprocessorsr   modelscope.utils.constantr   modelscope.utils.loggerr   logger__all__register_moduledocument_segmentationr   r   r   r   r&   <module>   s&   