forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add script csv datasets (huggingface#25)
* First commit to try to create agnostic local CSV datasets * Initial commit to allow generic local CSV datasets * Add dummy data * Ignore index value in _generate_examples * [WIP] - refactoring * fixing CSV * small fix to convert * let's not handle ClassLabel for ArrowBuilder for now + style/quality Co-authored-by: Thomas Wolf <[email protected]>
- Loading branch information
Showing
16 changed files
with
641 additions
and
1,004 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# coding=utf-8 | ||
|
||
from dataclasses import dataclass | ||
|
||
import pyarrow as pa | ||
import pyarrow.csv as pac | ||
|
||
import nlp | ||
|
||
|
||
@dataclass
class CsvConfig(nlp.BuilderConfig):
    """BuilderConfig for CSV.

    Plain fields hold user-facing settings; the ``pa_*`` properties translate
    them into the ``pyarrow.csv`` option objects consumed by ``read_csv``.
    """
    skip_rows: int = 0
    header_as_column_names: bool = True
    delimiter: str = ","
    quote_char: str = "\""
    # Optional pre-built pyarrow option objects; when left as None a fresh
    # default instance is created on each property access.
    read_options: pac.ReadOptions = None
    parse_options: pac.ParseOptions = None
    convert_options: pac.ConvertOptions = None

    @property
    def pa_read_options(self):
        """ReadOptions carrying ``skip_rows`` and header handling."""
        opts = self.read_options if self.read_options else pac.ReadOptions()
        opts.skip_rows = self.skip_rows
        # pyarrow autogenerates names only when the header is NOT used.
        opts.autogenerate_column_names = not self.header_as_column_names
        return opts

    @property
    def pa_parse_options(self):
        """ParseOptions carrying the delimiter and quote character."""
        opts = self.parse_options if self.parse_options else pac.ParseOptions()
        opts.delimiter = self.delimiter
        opts.quote_char = self.quote_char
        return opts

    @property
    def pa_convert_options(self):
        """ConvertOptions: the user-supplied object, or pyarrow defaults."""
        return self.convert_options if self.convert_options else pac.ConvertOptions()
|
||
|
||
class Csv(nlp.ArrowBasedBuilder):
    """Arrow-based builder for generic local CSV files, read via ``pyarrow.csv``."""

    BUILDER_CONFIGS = [
        CsvConfig(
            name="CSV",
            version=nlp.Version("1.0.0"),
            description="Csv dataloader",
        ),
    ]

    def _info(self):
        # No fixed schema: column names/types are inferred from the CSV itself.
        return nlp.DatasetInfo()

    def _split_generators(self, dl_manager):
        """We handle string, list and dicts in datafiles.

        A bare string or list/tuple of paths maps everything to the TRAIN
        split; a dict may provide per-split file lists keyed by split name.
        """
        if isinstance(self.config.data_files, (str, list, tuple)):
            files = self.config.data_files
            if isinstance(files, str):
                files = [files]
            return [nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={"files": files})]
        splits = []
        for split_name in [nlp.Split.TRAIN, nlp.Split.VALIDATION, nlp.Split.TEST]:
            if split_name in self.config.data_files:
                files = self.config.data_files[split_name]
                if isinstance(files, str):
                    files = [files]
                splits.append(
                    nlp.SplitGenerator(
                        name=split_name,
                        gen_kwargs={"files": files}))
        return splits

    def _generate_tables(self, files):
        """Yield one (index, pyarrow.Table) pair per input CSV file."""
        for i, file in enumerate(files):
            pa_table = pac.read_csv(
                file,
                read_options=self.config.pa_read_options,
                parse_options=self.config.pa_parse_options,
                # Fix: use the pa_convert_options property (which falls back to
                # a default ConvertOptions) instead of the raw, possibly-None
                # convert_options attribute — consistent with the read/parse
                # options on the lines above.
                convert_options=self.config.pa_convert_options,
            )
            yield i, pa_table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.