o
    NiG                     @   s(  d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZddl	m
Z
 ddl	mZ ddlZddlm  mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dZdZ ej!Z"dddZ#G dd dej$ej%Z&dd Z'dd Z(dS )z@Base DatasetBuilderTestCase to test a DatasetBuilder base class.    )absolute_import)division)print_functionN)absltest)parameterized)dataset_builder)dataset_info)dataset_utils)download)
registered)utils)	checksums)tf_utils)
test_utils)chmodchownlinklistdirlstatmakedirsmkdirmknodopenpathconfreadlinkremove
removedirsrenamerenamesrmdirstatstatvfssymlinkunlinkwalk)existsisdirisfileFc                 K   s2   t | ds	td|rtdt| ||fi |S )NreadzEYou MUST pass a `tf.io.gfile.GFile` or file-like object to `np.load`.z2Unpicling files is forbidden for security reasons.)hasattrAssertionError_ORGINAL_NP_LOAD)file_	mmap_modeallow_picklekwargs r0   g/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/testing/dataset_builder_testing.py_np_loadN   s   
r2   c                       s   e Zd ZdZdZdZdZdZdZdZ	g Z
dZdZe fddZ fddZ fd	d
Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd'ddZe dd Zdd  Zd!d" Zd#d$ Zd%d& Z   Z!S )(DatasetBuilderTestCasea	  Inherit this class to test your DatasetBuilder class.

  You must set the following class attributes:

    * DATASET_CLASS: class object of DatasetBuilder you want to test.

  You may set the following class attributes:

    * VERSION: `str`. The version used to run the test. eg: '1.2.*'.
      Defaults to None (canonical version).
    * BUILDER_CONFIG_NAMES_TO_TEST: `list[str]`, the list of builder configs
      that should be tested. If None, all the BUILDER_CONFIGS from the class
      will be tested.
    * DL_EXTRACT_RESULT: `dict[str]`, the returned result of mocked
      `download_and_extract` method. The values should be the path of files
      present in the `fake_examples` directory, relative to that directory.
      If not specified, path to `fake_examples` will always be returned.
    * DL_DOWNLOAD_RESULT: `dict[str]`, the returned result of mocked
      `download_and_extract` method. The values should be the path of files
      present in the `fake_examples` directory, relative to that directory.
      If not specified: will use DL_EXTRACT_RESULT (this is due to backwards
      compatibility and will be removed in the future).
    * EXAMPLE_DIR: `str`, the base directory in in which fake examples are
      contained. Optional; defaults to
      tensorflow_datasets/testing/test_data/fake_examples/<dataset name>.
    * OVERLAPPING_SPLITS: `list[str]`, splits containing examples from other
      splits (e.g. a "example" split containing pictures from other splits).
    * MOCK_OUT_FORBIDDEN_OS_FUNCTIONS: `bool`, defaults to True. Set to False to
      disable checks preventing usage of `os` or builtin functions instead of
      recommended `tf.io.gfile` API.
    * SKIP_CHECKSUMS: Checks that the urls called by `dl_manager.download`
      are registered.

  This test case will check for the following:

   - the dataset builder is correctly registered, i.e. `tfds.load(name)` works;
   - the dataset builder can read the fake examples stored in
       testing/test_data/fake_examples/{dataset_name};
   - the dataset builder can produce serialized data;
   - the dataset builder produces a valid Dataset object from serialized data
     - in eager mode;
     - in graph mode.
   - the produced Dataset examples have the expected dimensions and types;
   - the produced Dataset has and the expected number of examples;
   - a example is not part of two splits, or one of these splits is whitelisted
       in OVERLAPPING_SPLITS.
  NTFc                    s@   t   tt|   | j}| jd u st| jstd| d S )Nz5Assign your DatasetBuilder class to %s.DATASET_CLASS.)	tfenable_v2_behaviorsuperr3   
setUpClass__name__DATASET_CLASScallabler*   )clsname	__class__r0   r1   r7      s   z!DatasetBuilderTestCase.setUpClassc                    s   t t|   g | _|  | _tjt	
 | jj| _| jd ur$| j| _tjj| js5d| j }t|| jr<|   t | _d| _d S )Nzfake_examples dir %s not found.F)r6   r3   setUppatchers_make_builderbuilderospathjoinr   fake_examples_dirr<   example_dirEXAMPLE_DIRr4   iogfiler%   
ValueErrorMOCK_OUT_FORBIDDEN_OS_FUNCTIONS _mock_out_forbidden_os_functionsset_download_urls_stop_record_download)selferr_msgr=   r0   r1   r?      s   



zDatasetBuilderTestCase.setUpc                    s&   t t|   | jD ]}|  q
d S N)r6   r3   tearDownr@   stop)rQ   patcherr=   r0   r1   rT      s   

zDatasetBuilderTestCase.tearDownc           
      C   s  t d}tjj}tjjtjtjd}||_tD ]}|t||_	qtjjt|d}t
D ]}tjdkr7tt|s7q*|t||_	q*tjj| jjd |dd}|  | j| t }tjj|d|d	< tj| jjd
 |}|  | j| tjdt}	|	  | j|	 dS )zCRaises error if forbidden os functions are called instead of gfile.zdDo not use `os`, but `tf.io.gfile` module instead. This makes code compatible with more filesystems.)wraps)rD   ntz.osT)createside_effectr   z.__builtins__z
numpy.loadN)r*   rC   rD   sepr   mockMockFORBIDDEN_OS_PATH_FUNCTIONSgetattrr[   FORBIDDEN_OS_FUNCTIONSr<   r)   patchr9   
__module__startr@   append__builtins__copyr2   )
rQ   errr\   mock_os_pathfopmock_os
os_patchermock_builtinsopen_patchernp_load_patcherr0   r0   r1   rM      s4   z7DatasetBuilderTestCase._mock_out_forbidden_os_functionsc                 C   s   |  | jtjd d S )NzADataset class must inherit from `dataset_builder.DatasetBuilder`.)assertIsInstancerB   r   DatasetBuilderrQ   r0   r0   r1   test_baseclass   s   z%DatasetBuilderTestCase.test_baseclassc                 C   s6   | j jt v }| j j}| |p|d| j j d S )Nz:Dataset {} was not registered and is not `IN_DEVELOPMENT`.)rB   r<   r   list_buildersIN_DEVELOPMENT
assertTrueformat)rQ   is_registered
exceptionsr0   r0   r1   test_registered   s   

z&DatasetBuilderTestCase.test_registeredc                 C   s,   | j j}| |tj | | j j|j d S rS   )rB   inforp   r   DatasetInfoassertEqualr<   )rQ   r{   r0   r0   r1   	test_info   s   z DatasetBuilderTestCase.test_infoc                 C   s:   | j rd S t|tjjr| j|j d S | j| d S rS   )rP   
isinstancer
   resourceResourcerO   addurl)rQ   url_or_urlsr0   r0   r1   _add_url   s
   zDatasetBuilderTestCase._add_urlc                    s:   t j j| ~ jd u r jS t j fdd jS )Nc                       t j j| S rS   rC   rD   rE   rG   fnamerr   r0   r1   <lambda>       z?DatasetBuilderTestCase._get_dl_extract_result.<locals>.<lambda>)r4   nestmap_structurer   DL_EXTRACT_RESULTrG   rQ   r   r0   rr   r1   _get_dl_extract_result   s   

z-DatasetBuilderTestCase._get_dl_extract_resultc                    s<   t j j|  jd u r |S t j fdd jS )Nc                    r   rS   r   r   rr   r0   r1   r     r   z@DatasetBuilderTestCase._get_dl_download_result.<locals>.<lambda>)r4   r   r   r   DL_DOWNLOAD_RESULTr   r   r0   rr   r1   _get_dl_download_result  s   


z.DatasetBuilderTestCase._get_dl_download_resultc                 C   s
   d| _ d S )NT)rP   r   r0   r0   r1   _download_checksums  s   
z*DatasetBuilderTestCase._download_checksumsc                 C   s   | j | j|| jdS )N)data_dirconfigversion)r9   tmp_dirVERSION)rQ   r   r0   r0   r1   rA     s
   z$DatasetBuilderTestCase._make_builderc              	   C   s"  | j r| j D ]}|| jjv sJ d|t| jjf q| jj}tdt|  |rj|D ]=}| j d ur@|j| j vr@td|j  q+| |j td|j  | j	|d}| 
| W d    n1 scw   Y  q+n| 
| j | js| d |   W d    d S 1 sw   Y  d S d S )Nz9Config %s specified in test does not exist. Available:
%szTotal configs: %dzSkipping config %szTesting config %sr   url_checksums)BUILDER_CONFIG_NAMES_TO_TESTrB   builder_configslistBUILDER_CONFIGSprintlenr<   _subTestrA    _download_and_prepare_as_datasetSKIP_CHECKSUMS_test_checksums)rQ   r   configsrB   r0   r0   r1   $test_download_and_prepare_as_dataset  s8   


"z;DatasetBuilderTestCase.test_download_and_prepare_as_datasetc                 C   s   | j sd S d}tj|d tjt| jj	}t
|}W d    n1 s(w   Y  | j t|  }| |d|| d S )Nz{If you are developping outside TFDS and want to opt-out, please add `SKIP_CHECKSUMS = True` to the `DatasetBuilderTestCase`)suffixzSome urls checksums are missing at: {} Did you forget to record checksums with `--register_checksums` ? See instructions at: https://www.tensorflow.org/datasets/add_dataset#2_run_download_and_prepare_locally
{})rO   r   try_reraiserC   rD   rE   r   	_get_pathrB   r<   _get_url_infosrN   keysassertEmptyrw   )rQ   rR   filepath	url_infosmissing_urlsr0   r0   r1   r   2  s   z&DatasetBuilderTestCase._test_checksumsc           	   	   C   s  t jjtdd}|jr| jn|}t jjjd| j| j	| j
|d/ t|tjr4dd l}d }|jj }nd }d }tjtjj||d}|j|d W d    n1 sRw   Y  | d | | W d    n1 slw   Y  | d	 | | W d    n1 sw   Y  | d
7 | j|jd}| | | d | | W d    n1 sw   Y  W d    d S W d    d S 1 sw   Y  d S )Nz$Missing MANUAL_DOWNLOAD_INSTRUCTIONSrZ   z1tensorflow_datasets.core.download.DownloadManager)download_and_extractr
   download_checksums
manual_dirr   )compute_statsbeam_runnerbeam_options)download_config
as_datasetnum_examplesreloadr   )r   r]   PropertyMock	ExceptionMANUAL_DOWNLOAD_INSTRUCTIONSrG   rb   multipler   r   r   r   r   BeamBasedBuilderapache_beamoptionspipeline_optionsPipelineOptionsr
   DownloadConfigComputeStatsModeSKIPdownload_and_preparer   _assertAsDataset_assertNumSamplesrA   builder_config)	rQ   rB   missing_dir_mockr   beamr   r   r   builder_reloadedr0   r0   r1   r   I  sT   
"z7DatasetBuilderTestCase._download_and_prepare_as_datasetc                 C   s   i }| j  D ]<\}}|j|d}t|jj tjj	j
|tjj	j
| tt|j|d}tdd |D ||< | || qt| dD ] \\}}\}	}
|| jv s^|	| jv r_qL| ||
d||	f  qLd S )N)splitc                 s   s    | ]}t |V  qd S rS   )checksum).0recr0   r0   r1   	<genexpr>  s    z:DatasetBuilderTestCase._assertAsDataset.<locals>.<genexpr>   zSplits '%s' and '%s' are overlapping. Are you sure you want to have the same objects in those splits? If yes, add one one of them to OVERLAPPING_SPLITS class attribute.)SPLITSitemsr   compare_shapes_and_typesr{   featuresget_tensor_infor4   compatv1dataget_output_typesget_output_shapesr   r	   as_numpyrN   	assertLen	itertoolscombinationsOVERLAPPING_SPLITSassertFalseintersection)rQ   rB   split_to_checksums
split_nameexpected_examples_numberdsexamplessplit1hashes1split2hashes2r0   r0   r1   r   z  s4   



z'DatasetBuilderTestCase._assertAsDatasetc                 C   sJ   | j  D ]\}}| |jj| j| q| |jjjt| j   d S rS   )	r   r   r}   r{   splitsr   total_num_examplessumvalues)rQ   rB   r   expected_num_examplesr0   r0   r1   r     s   z(DatasetBuilderTestCase._assertNumSamplesrS   )"r8   rc   __qualname____doc__r9   r   r   r   r   rH   r   rL   r   classmethodr7   r?   rT   rM   rs   rz   r~   r   r   r   r   rA   r   run_in_graph_and_eager_modesr   r   r   r   r   __classcell__r0   r0   r=   r1   r3   W   s<    0	



1r3   c                    sH    fdd  g | }dd |D }d |}t }|| | S )z%Computes the md5 for a given example.c                    s  t |tjr
t|}t |tr&t| D ]\}}| |  | | q| S t |tr<t|dr5|	d}| | | S t |t
jt
jjjjfrS| t|  | S t |tjr|jjtju rw| tt|j | tt|  | S | |  | S | t| | S )z:Recursively flatten an element to its byte representation.decodezlatin-1)r   numbersNumberstrdictsortedr   re   r)   r   r4   RaggedTensorr   r   raggedRaggedTensorValueto_listnpndarraydtypetypeobject_tupleshaper   raveltobytesbytes)flat_strelementkv_bytes_flattenr0   r1   r    s4   





z checksum.<locals>._bytes_flattenc                 S   s$   g | ]}t |ts|d n|qS )zutf-8)r   r
  encode)r   sr0   r0   r1   
<listcomp>  s    zchecksum.<locals>.<listcomp>    )rE   hashlibmd5update	hexdigest)exampler  
flat_byteshash_r0   r  r1   r     s   


r   c           	      C   sx   |   D ]5\}}t|trt||| ||  q|j}|| }||kr,td|||f |j}|| }t|| qdS )zECompare shapes and types between TensorInfo and Dataset types/shapes.z&Feature %s has type %s but expected %sN)	r   r   r   r   r  	TypeErrorr  r   assert_shape_match)	tensor_infooutput_typesoutput_shapesfeature_namefeature_infoexpected_typeoutput_typeexpected_shapeoutput_shaper0   r0   r1   r     s   

r   )NF))r   
__future__r   r   r   r  r   r   rC   absl.testingr   r   numpyr  tensorflow.compat.v2r   v2r4   tensorflow_datasets.corer   r   r	   r
   r   r   !tensorflow_datasets.core.downloadr   tensorflow_datasets.core.utilsr   tensorflow_datasets.testingr   ra   r_   loadr+   r2   TestCaseSubTestCaser3   r   r   r0   r0   r0   r1   <module>   s<   
	  I.