Skip to content

Commit 345fb07

Browse files
Merge pull request #2917 from AI-Hypercomputer:dataset_update
PiperOrigin-RevId: 853445233
2 parents ec26bfd + 344c58c commit 345fb07

3 files changed

Lines changed: 6 additions & 1 deletion

File tree

src/MaxText/configs/base.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,9 @@ eval_dataset_name: 'c4/en:3.0.1'
587587
train_split: 'train'
588588
eval_split: 'validation'
589589
# for HuggingFace input pipeline (dataset_type=hf)
590+
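# The hf_* settings below are passed to datasets.load_dataset (as path, name, data_dir, data_files); see the definition at https://github.com/huggingface/datasets/blob/0feb65dd8733191dd2d1e74215b422fc5939a56a/src/datasets/load.py#L1338-L1408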
590591
hf_path: ''
592+
hf_name: ''
591593
hf_data_dir: ''
592594
hf_train_files: ''
593595
hf_eval_split: ''

src/MaxText/configs/types.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -919,7 +919,8 @@ class TfdsDataset(BaseModel):
919919
class HfDataset(BaseModel):
920920
"""Configuration specific to HuggingFace datasets."""
921921

922-
hf_path: str = Field("", description="Path or name of the Hugging Face dataset.")
922+
hf_path: str = Field("", description="Path of the Hugging Face dataset.")
923+
hf_name: str = Field("", description="Name of the Hugging Face dataset.")
923924
hf_data_dir: PathStr = Field("", description="Data directory for the HF dataset.")
924925
hf_train_files: Optional[str] = Field(None, description="Files for the HF training split.")
925926
hf_eval_split: str = Field("", description="Name of the HF evaluation split.")

src/MaxText/input_pipeline/_hf_data_processing.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ def make_hf_train_iterator(
352352
"""Load, preprocess dataset and return iterators"""
353353
train_ds = datasets.load_dataset(
354354
config.hf_path,
355+
name=config.hf_name,
355356
data_dir=config.hf_data_dir,
356357
data_files=config.hf_train_files,
357358
split=config.train_split,
@@ -404,6 +405,7 @@ def make_hf_eval_iterator(
404405
"""Make Hugging Face evaluation iterator. Load and preprocess the eval dataset and return an iterator."""
405406
eval_ds = datasets.load_dataset(
406407
config.hf_path,
408+
name=config.hf_name,
407409
data_dir=config.hf_data_dir,
408410
data_files=config.hf_eval_files,
409411
split=config.hf_eval_split,

0 commit comments

Comments
 (0)