o
    Mi6                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZddlm	Z	 ddlm
Z
 zd dlZW n ey?   d dlZY nw ejd  dkZeeZG dd deZdS )    N   )get_version)utils   c                   @   s   e Zd ZdZ			d6ddZd7d
dZdd Zdd Zdd Zdd Z	dd Z
		d8ddZdd Zdd Zedd Zd d! Zed"d# Zd$d% Zd&d' Zd(d) Zd9d*d+Zd,d- Zd.d/ Zd0d1 Zd:d2d3Zed4d5 ZdS );MorfessorIOzDefinition for all input and output files. Also handles all
    encoding issues.

    The only state this class has is the separators used in the data.
    Therefore, the same class instance can be used for initializing multiple
    files.

    N + #\s+Fc                 C   sR   || _ || _|| _t|tj| _|| _|d ur t|tj| _|| _	t
 | _d S N)encodingconstruction_separatorcomment_startrecompileUNICODEcompound_sep_reatom_separator_atom_sep_re	lowercaser   _version)selfr   r   r   compound_separatorr   r    r   @/home/ubuntu/.local/lib/python3.10/site-packages/morfessor/io.py__init__"   s   zMorfessorIO.__init__Tc           	      +   s    t d|  |D ]=}|r|dd\}}nd|}}t fdd| jD } jdu r8d|}n	tdd |D }t|||fV  qt d	 dS )
zRead segmentation file.

        File format:
        <count> <construction1><sep><construction2><sep>...<constructionN>

        z"Reading segmentations from '%s'... r   c                 3   s    | ]}  |V  qd S r
   )_split_atoms.0constrr   r   r   	<genexpr><   s
    
z5MorfessorIO.read_segmentation_file.<locals>.<genexpr>N c                 s   s    | ]
}|D ]}|V  qqd S r
   r   )r   r   atomr   r   r   r!   B   s    Done.)	_loggerinfo_read_text_filesplittupler   r   joinint)	r   	file_name
has_countskwargslinecountcompound_strconstructionscompoundr   r    r   read_segmentation_file/   s   


z"MorfessorIO.read_segmentation_filec           
         s   t d|  |E}tj jdd}|d j|f  |D ]'\}}} jdu r2 j	
|}	n j	
 fdd|D }	|d||	f  q!W d   n1 sSw   Y  t d	 dS )
zWrite segmentation file.

        File format:
        <count> <construction1><sep><construction2><sep>...<constructionN>

        zSaving segmentations to '%s'...r   microsecondz(# Output from Morfessor Baseline %s, %s
Nc                 3   s    | ]	} j |V  qd S r
   )r   r*   r   r    r   r   r!   X   s    z6MorfessorIO.write_segmentation_file.<locals>.<genexpr>%d %s
r$   )r%   r&   _open_text_file_writedatetimenowreplacewriter   r   r   r*   )
r   r,   segmentationsr.   file_objdr0   _segmentationsr   r    r   write_segmentation_fileG   s$   

z#MorfessorIO.write_segmentation_filec                 c   &    |D ]}|  |D ]}|V  q
qdS )zdRead one or more corpus files.

        Yield for each compound found (1, compound_atoms).

        N)read_corpus_filer   
file_namesr,   itemr   r   r   read_corpus_files]      zMorfessorIO.read_corpus_filesc                 c   rD   )zmRead one or more corpus list files.

        Yield for each compound found (count, compound_atoms).

        N)read_corpus_list_filerF   r   r   r   read_corpus_list_filesg   rJ   z"MorfessorIO.read_corpus_list_filesc                 c   sd    t d| | j|ddD ]}| j|D ]}t|dkr&d| |fV  qdV  qt d dS )	z~Read one corpus file.

        For each compound, yield (1, compound_atoms).
        After each line, yield (0, ()).

        zReading corpus from '%s'...T)rawr   r   )r   r   r$   N)r%   r&   r'   r   r(   lenr   )r   r,   r/   r3   r   r   r   rE   q   s   zMorfessorIO.read_corpus_filec              	   c   sv    t d| | |D ]'}z|dd\}}t|| |fV  W q ty3   d| |fV  Y qw t d dS )zRead a corpus list file.

        Each line has the format:
        <count> <compound>

        Yield tuples (count, compound_atoms) for each compound.

        z Reading corpus from list '%s'...Nr   r$   )r%   r&   r'   r(   r+   r   
ValueError)r   r,   r/   r0   r3   r   r   r   rK      s   	z!MorfessorIO.read_corpus_list_filer   ,c           	      C   s   i }t d| | |D ]9}|dd\}}||vrg ||< |dur<||D ]}| }|| | | q(q|| || qt d |S )zRead a annotations file.

        Each line has the format:
        <compound> <constr1> <constr2>... <constrN>, <constr1>...<constrN>, ...

        Yield tuples (compound, list(analyses)).

        z Reading annotations from '%s'...Nr   r$   )r%   r&   r'   r(   stripappend)	r   r,   r   analysis_sepannotationsr/   r3   analyses_lineanalysisr   r   r   read_annotations_file   s$   

z!MorfessorIO.read_annotations_filec                 C   sd   t d| | |}|D ]\}}|d||f  qW d   n1 s&w   Y  t d dS )z;Write to a Lexicon file all constructions and their counts.zSaving model lexicon to '%s'...r7   Nr$   )r%   r&   r8   r<   )r   r,   lexiconr>   constructionr0   r   r   r   write_lexicon_file   s   zMorfessorIO.write_lexicon_filec                 C   s$   t d| | |}t d |S )zRead a pickled model from file.zLoading model from '%s'...r$   )r%   r&   read_binary_filer   r,   modelr   r   r   read_binary_model_file   s   

z"MorfessorIO.read_binary_model_filec                 C   s:   t | d}t|}W d   |S 1 sw   Y  |S )z"Read a pickled object from a file.rbN)openpickleload)r,   fobjobjr   r   r   r[      s   
zMorfessorIO.read_binary_filec                 C   s&   t d| | || t d dS )zPickle a model to a file.zSaving model to '%s'...r$   N)r%   r&   write_binary_filer\   r   r   r   write_binary_model_file   s   z#MorfessorIO.write_binary_model_filec                 C   s@   t | d}t||tj W d   dS 1 sw   Y  dS )zPickle an object into a file.wbN)r`   ra   dumpHIGHEST_PROTOCOL)r,   rd   rc   r   r   r   re      s   "zMorfessorIO.write_binary_filec                 C   sz   |  |.}tj jdd}|d| j| | D ]\}}|d|| qW d   dS 1 s6w   Y  dS )z/Write learned or estimated parameters to a filer   r5   z"# Parameters for Morfessor {}, {}
z{}:	{}
N)r8   r9   r:   r;   r<   formatr   items)r   r,   paramsr>   r?   keyvalr   r   r   write_parameter_file   s   "z MorfessorIO.write_parameter_filec              	   C   sp   i }t d}| |D ])}|| }|r5|d}|d}zt|}W n	 ty0   Y nw |||< q|S )z0Read learned or estimated parameters from a filez^(.*)\s*:\s*(.*)$r      )r   r   r'   matchrstripgroupfloatrO   )r   r,   rl   line_rer/   mrm   rn   r   r   r   read_parameter_file   s   


zMorfessorIO.read_parameter_filec                 C   sb   z|  |}td| |W S  ty   Y nw ddlm} | }|| | td| |S )zRead a file that is either a binary model or a Morfessor 1.0 style
        model segmentation. This method can not be used on standard input as
        data might need to be read multiple timesz%s was read as a binary modelr   )BaselineModelz%s was read as a segmentation)r^   r%   r&   BaseException	morfessorrx   load_segmentationsr4   )r   r,   r]   rx   r   r   r   read_any_model   s   
zMorfessorIO.read_any_modelc                    sL   |du r| j } du r| j t|d r||S |t fdd|S )z6Return a formatted string for a list of constructions.Nr   c                    s
     | S r
   )r*   )xatom_sepr   r   <lambda>  s   
 z2MorfessorIO.format_constructions.<locals>.<lambda>)r   r   r   
_is_stringr*   map)r   r2   csepr   r   r~   r   format_constructions   s   
z MorfessorIO.format_constructionsc                 C   s   | j du r|S t| j|S )z Split construction to its atoms.N)r   r)   r   r(   )r   rY   r   r   r   r   
  s   
zMorfessorIO._split_atomsc                 C   sv   |dkrt j}tr|S n|drt|d}n|dr$t|d}nt|d}| jdu r3t	
 | _t| j|S )zAOpen a file for writing with the appropriate compression/encoding-.gzrg   .bz2N)sysstdoutPY3endswithgzipr`   bz2BZ2Filer   localegetpreferredencodingcodecs	getwriter)r   r,   r>   r   r   r   r8     s   




z!MorfessorIO._open_text_file_writec                 C   s   |dkrt rtj}|S G dd d}|| j}|S |dr%t|d}n|dr1t|d}nt|d}| jdu rA| 	|| _t
| j|}|S )zAOpen a file for reading with the appropriate compression/encodingr   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )z<MorfessorIO._open_text_file_read.<locals>.StdinUnicodeReaderc                 S   s"   || _ | j d u rt | _ d S d S r
   )r   r   r   )r   r   r   r   r   r   )  s   
zEMorfessorIO._open_text_file_read.<locals>.StdinUnicodeReader.__init__c                 S   s   | S r
   r   r    r   r   r   __iter__.  s   zEMorfessorIO._open_text_file_read.<locals>.StdinUnicodeReader.__iter__c                 S   s    t j }|s
t || jS r
   )r   stdinreadlineStopIterationdecoder   )r   lr   r   r   next1  s   
zAMorfessorIO._open_text_file_read.<locals>.StdinUnicodeReader.nextN)__name__
__module____qualname__r   r   r   r   r   r   r   StdinUnicodeReader(  s    r   r   r_   r   N)r   r   r   r   r   r   r`   r   r   _find_encodingr   	getreader)r   r,   inpr   r>   r   r   r   _open_text_file_read"  s    




z MorfessorIO._open_text_file_readc                 c   s    |  |}z'|D ]!}| }|st|dks|| jrq	| jr'| V  q	|V  q	W dS  tyA   |dkr@t	d Y dS  w )zRead a text file with the appropriate compression and encoding.

        Comments and empty lines are skipped unless raw is True.

        r   r   zFinished reading from stdinN)
r   rr   rN   
startswithr   r   lowerKeyboardInterruptr%   r&   )r   r,   rM   r   r/   r   r   r   r'   E  s$   
	
zMorfessorIO._read_text_filec               
   G   s   dt  g}|D ]O}d}| D ]<}|dkrqz)|dr"t|d}n|dr.t|d}nt|d}t||D ]}q:W q t	yJ   d}Y  nw |rWt
d| |  S qtd	)
zTest default encodings on reading files.

        If no encoding is given, this method can be used to test which
        of the default encodings would work.

        zutf-8Tr   r   r_   r   FzDetected %s encodingz)Can not determine encoding of input files)r   r   r   r   r`   r   r   r   r   UnicodeDecodeErrorr%   r&   UnicodeError)filestest_encodingsr   okfr>   r@   r   r   r   r   ]  s0   


zMorfessorIO._find_encoding)Nr   r   r	   NF)T)r   rP   )NN)F)r   r   r   __doc__r   r4   rC   rI   rL   rE   rK   rW   rZ   r^   staticmethodr[   rf   re   ro   rw   r|   r   r   r8   r   r'   r   r   r   r   r   r      s@    	









#r   )r   r   r9   r   r   loggingr   r   r"   r   r   cPicklera   ImportErrorversion_infor   	getLoggerr   r%   objectr   r   r   r   r   <module>   s$    
