Error occurring at the "Generating train split" stage when downloading the entire dataset
23 Apr 2023, 17:10

Hello guys,

I tried to download the entire dataset locally with:

from datasets import load_dataset

dataset = load_dataset("tobiolatunji/afrispeech-200", "all", streaming=False)

The download and extraction phases succeed, but an error occurs at the "Generating train split" step, always at the 35,999th sample. The full traceback is:

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/builder.py:1628, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1627 example = self.info.features.encode_example(record) if self.info.features is not None else record
-> 1628 writer.write(example, key)
   1629 num_examples_progress_update += 1

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:488, in ArrowWriter.write(self, example, key, writer_batch_size)
    486     self.hkey_record = []
--> 488 self.write_examples_on_file()

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:446, in ArrowWriter.write_examples_on_file(self)
    442         batch_examples[col] = [
    443             row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
    444             for row in self.current_examples
    445         ]
--> 446 self.write_batch(batch_examples=batch_examples)
    447 self.current_examples = []

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:555, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
    554 pa_table = pa.Table.from_arrays(arrays, schema=schema)
--> 555 self.write_table(pa_table, writer_batch_size)

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:567, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
    566     self._build_writer(inferred_schema=pa_table.schema)
--> 567 pa_table = pa_table.combine_chunks()
    568 pa_table = table_cast(pa_table, self._schema)

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/pyarrow/table.pxi:3315, in pyarrow.lib.Table.combine_chunks()

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: offset overflow while concatenating arrays

During handling of the above exception, another exception occurred:

ArrowInvalid                              Traceback (most recent call last)
File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/builder.py:1637, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1636 num_shards = shard_id + 1
-> 1637 num_examples, num_bytes = writer.finalize()
   1638 writer.close()

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:582, in ArrowWriter.finalize(self, close_stream)
    581     self.hkey_record = []
--> 582 self.write_examples_on_file()
    583 # If schema is known, infer features even if no examples were written

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:446, in ArrowWriter.write_examples_on_file(self)
    442         batch_examples[col] = [
    443             row[0][col].to_pylist()[0] if isinstance(row[0][col], (pa.Array, pa.ChunkedArray)) else row[0][col]
    444             for row in self.current_examples
    445         ]
--> 446 self.write_batch(batch_examples=batch_examples)
    447 self.current_examples = []

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:555, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
    554 pa_table = pa.Table.from_arrays(arrays, schema=schema)
--> 555 self.write_table(pa_table, writer_batch_size)

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/arrow_writer.py:567, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
    566     self._build_writer(inferred_schema=pa_table.schema)
--> 567 pa_table = pa_table.combine_chunks()
    568 pa_table = table_cast(pa_table, self._schema)

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/pyarrow/table.pxi:3315, in pyarrow.lib.Table.combine_chunks()

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: offset overflow while concatenating arrays

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[5], line 1
----> 1 dataset = datasets.load_dataset(
      2         "tobiolatunji/afrispeech-200", "all", streaming=False,
      3 )

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/load.py:1791, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1788 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1790 # Download and prepare data
-> 1791 builder_instance.download_and_prepare(
   1792     download_config=download_config,
   1793     download_mode=download_mode,
   1794     verification_mode=verification_mode,
   1795     try_from_hf_gcs=try_from_hf_gcs,
   1796     num_proc=num_proc,
   1797     storage_options=storage_options,
   1798 )
   1800 # Build dataset for splits
   1801 keep_in_memory = (
   1802     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   1803 )

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/builder.py:891, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    889     if num_proc is not None:
    890         prepare_split_kwargs["num_proc"] = num_proc
--> 891     self._download_and_prepare(
    892         dl_manager=dl_manager,
    893         verification_mode=verification_mode,
    894         **prepare_split_kwargs,
    895         **download_and_prepare_kwargs,
    896     )
    897 # Sync info
    898 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/builder.py:1651, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs)
   1650 def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
-> 1651     super()._download_and_prepare(
   1652         dl_manager,
   1653         verification_mode,
   1654         check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
   1655         or verification_mode == VerificationMode.ALL_CHECKS,
   1656         **prepare_splits_kwargs,
   1657     )

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/builder.py:986, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    982 split_dict.add(split_generator.split_info)
    984 try:
    985     # Prepare split will record examples associated to the split
--> 986     self._prepare_split(split_generator, **prepare_split_kwargs)
    987 except OSError as e:
    988     raise OSError(
    989         "Cannot find data file. "
    990         + (self.manual_download_instructions or "")
    991         + "\nOriginal error:\n"
    992         + str(e)
    993     ) from None

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/builder.py:1490, in GeneratorBasedBuilder._prepare_split(self, split_generator, check_duplicate_keys, file_format, num_proc, max_shard_size)
   1488 gen_kwargs = split_generator.gen_kwargs
   1489 job_id = 0
-> 1490 for job_id, done, content in self._prepare_split_single(
   1491     gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1492 ):
   1493     if done:
   1494         result = content

File ~/anaconda3/envs/torch/lib/python3.9/site-packages/datasets/builder.py:1646, in GeneratorBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, split_info, check_duplicate_keys, job_id)
   1644     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1645         e = e.__context__
-> 1646     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1648 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

NB:

- I have more than enough free space on my hard drive.

- I don't believe it is a RAM issue: memory usage never came close to full while the code was running.

- I decided against streaming mode because, when I use it, it takes ~1 h just to fetch/read the data before the training loop starts, which makes training slow.
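
For what it's worth, the underlying ArrowInvalid ("offset overflow while concatenating arrays") is a known PyArrow limitation: string/binary columns use 32-bit offsets, so combining chunks whose total payload exceeds ~2 GiB overflows. A workaround often suggested for this error is lowering the Arrow writer batch size so each batch stays well under that limit. A minimal sketch, assuming writer_batch_size is forwarded from load_dataset to the GeneratorBasedBuilder (I have not verified this on this particular dataset):

from datasets import load_dataset

# Assumption: writer_batch_size reaches the builder; the default is
# typically 1000 examples per batch, which can be too large for long
# audio samples whose raw bytes add up past the 32-bit offset limit.
dataset = load_dataset(
    "tobiolatunji/afrispeech-200",
    "all",
    streaming=False,
    writer_batch_size=100,  # tune downward if the overflow persists
)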

Any help would be highly appreciated.

Thanks.

Discussion (2 answers)

If hard drive space is not a limitation, can you use the Google Drive data download method while we resolve this issue with the Hugging Face team?
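
If it helps, a minimal sketch of that route using gdown (untested; the file ID and archive name below are placeholders, use the actual link from the dataset card):

import tarfile

import gdown

# Placeholder ID: replace with the Google Drive ID from the dataset card.
gdown.download(id="GOOGLE_DRIVE_FILE_ID", output="afrispeech-200.tar.gz", quiet=False)

# Assumption: the archive is a tarball; adjust if the card ships another format.
with tarfile.open("afrispeech-200.tar.gz") as tar:
    tar.extractall("afrispeech-200")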

23 Apr 2023, 17:34

Sure, I hadn't thought of that.

Thanks.