Commit c1fc2f5

fix hf unit test (#151)
1 parent ec4166e commit c1fc2f5

4 files changed: 36 additions & 7 deletions

File tree

requirements.txt
src/maxdiffusion/input_pipeline/_hf_data_processing.py
src/maxdiffusion/maxdiffusion_utils.py
src/maxdiffusion/pyconfig.py

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 jax>=0.4.30
 jaxlib>=0.4.30
-grain-nightly
+grain-nightly==0.0.10
 google-cloud-storage==2.17.0
 absl-py
 datasets
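
The change above pins grain-nightly to an exact release instead of floating with each nightly build. A quick sanity check of the resolved version in a fresh environment (standard library only; the distribution name matches the requirements entry):

# Confirm the environment resolved the exact pinned version of grain-nightly.
from importlib.metadata import version

assert version("grain-nightly") == "0.0.10"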

src/maxdiffusion/input_pipeline/_hf_data_processing.py

Lines changed: 12 additions & 5 deletions

@@ -108,6 +108,7 @@ def __init__(
     self.current_shard = dataloading_host_index
     self.dataset_shard = split_dataset_by_node(dataset, world_size=self.n_shards, rank=self.current_shard)
     self.data_iter = None
+    self.out_of_data = False
 
   def _check_shard_count(self):
     if self.n_shards < self.dataloading_host_count:
@@ -119,11 +120,15 @@ def _check_shard_count(self):
       self.n_shards = self.dataloading_host_count
 
   def _update_shard(self):
-    new_shard = (self.current_shard + self.dataloading_host_count) % self.n_shards
-    max_logging.log(f"Updating host {self.dataloading_host_index} dataset from shard {self.current_shard} to {new_shard}")
-    self.current_shard = new_shard
-    self.dataset_shard = split_dataset_by_node(self.dataset, world_size=self.n_shards, rank=self.current_shard)
-    self.data_iter = iter(self.dataset_shard)
+    new_shard = self.current_shard + self.dataloading_host_count
+    if new_shard < self.n_shards:
+      max_logging.log(f"Updating host {self.dataloading_host_index} dataset from shard {self.current_shard} to {new_shard}")
+      self.current_shard = new_shard
+      self.dataset_shard = split_dataset_by_node(self.dataset, world_size=self.n_shards, rank=self.current_shard)
+      self.data_iter = iter(self.dataset_shard)
+    else:
+      max_logging.log(f"Run out of shards on host {self.dataloading_host_index}, shard {new_shard} is not available")
+      self.out_of_data = True
 
   def __len__(self):
     """Return length of the HF dataset. Since HuggingFace IterableDataset does not have length,
@@ -138,6 +143,8 @@ def __getitem__(self, index):
 
     while True:
       try:
+        if self.out_of_data:
+          return None
         data = next(self.data_iter)
         return data
       except StopIteration:
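
The old `_update_shard` wrapped the shard index with `% self.n_shards`, so a host that exhausted its assigned shards would cycle back onto shards already being read. The new version only advances while an unseen shard exists, and otherwise raises the `out_of_data` flag, which `__getitem__` now checks so the loader returns `None` instead of re-reading data. A stand-alone sketch of the new advancement arithmetic, with hypothetical host and shard counts and no HuggingFace dependency:

# Simulate the new shard assignment: each host starts at its own index and
# jumps ahead by the host count, stopping once no unseen shard remains.
def shards_for_host(host_index: int, host_count: int, n_shards: int) -> list[int]:
  visited = []
  shard = host_index           # mirrors self.current_shard = dataloading_host_index
  while shard < n_shards:      # mirrors the new `if new_shard < self.n_shards`
    visited.append(shard)
    shard += host_count        # mirrors new_shard = current_shard + host_count
  return visited               # past this point the host sets out_of_data = True

# 10 shards across 4 hosts: each host reads only its own shards, then stops,
# instead of wrapping (the old modulo) onto shards another host already owns.
assert shards_for_host(0, 4, 10) == [0, 4, 8]
assert shards_for_host(2, 4, 10) == [2, 6]
assert shards_for_host(3, 4, 10) == [3, 7]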

src/maxdiffusion/maxdiffusion_utils.py

Lines changed: 21 additions & 1 deletion

@@ -14,6 +14,8 @@
 limitations under the License.
 """
 
+import io
+from PIL import Image
 import importlib
 import numpy as np
 import tensorflow as tf
@@ -72,6 +74,24 @@ def vae_apply(images, sample_rng, vae, vae_params):
   return latents
 
 
+def convert_dict_to_pil(image):
+  """
+  Converts a dictionary containing image bytes to a PIL Image object.
+
+  Args:
+    image_dict: A dictionary with keys 'bytes' (image data) and 'path' (optional).
+
+  Returns:
+    A PIL Image object.
+  """
+  if isinstance(image, dict):
+    image_bytes = image["bytes"]
+    image_stream = io.BytesIO(image_bytes)  # Create a BytesIO object
+    pil_image = Image.open(image_stream)  # Open the image from the stream
+    return pil_image
+  return image
+
+
 def transform_images(
     examples,
     image_column,
@@ -83,7 +103,7 @@ def transform_images(
 ):
   """Preprocess images to latents."""
   images = list(examples[image_column])
-  images = [np.asarray(image) for image in images]
+  images = [convert_dict_to_pil(image) for image in images]
   tensor_list = []
   for image in images:
     image = tf.image.resize(image, [image_resolution, image_resolution], method="bilinear", antialias=True)
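
HuggingFace image columns can arrive undecoded as `{'bytes': ..., 'path': ...}` dicts, which `np.asarray` would turn into a 0-d object array rather than pixel data; the new `convert_dict_to_pil` helper decodes such records first and passes already-decoded images through untouched. A round-trip check of the helper (the import path follows the file location above; the synthetic PNG record is an assumption standing in for an undecoded dataset row):

import io

import numpy as np
from PIL import Image

from maxdiffusion.maxdiffusion_utils import convert_dict_to_pil

# Fake an undecoded HF image feature: raw PNG bytes plus an optional path.
buf = io.BytesIO()
Image.fromarray(np.zeros((8, 8, 3), dtype=np.uint8)).save(buf, format="PNG")
record = {"bytes": buf.getvalue(), "path": None}

pil = convert_dict_to_pil(record)        # dict in, PIL image out
assert pil.size == (8, 8)
assert convert_dict_to_pil(pil) is pil   # decoded images pass through unchanged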

src/maxdiffusion/pyconfig.py

Lines changed: 2 additions & 0 deletions

@@ -148,6 +148,8 @@ def user_init(raw_keys):
 
   if "hf_train_files" in raw_keys and not raw_keys["hf_train_files"]:
     raw_keys["hf_train_files"] = None
+  if "hf_access_token" in raw_keys and not raw_keys["hf_access_token"]:
+    raw_keys["hf_access_token"] = None
 
   raw_keys["total_train_batch_size"] = max_utils.get_global_batch_size(raw_keys["per_device_batch_size"])
 
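The pyconfig change mirrors the existing `hf_train_files` handling one line above it: an empty string in the config is normalized to `None`, so downstream code can test whether a token was provided with a simple truthiness check. A minimal sketch of that normalization on a hypothetical `raw_keys` dict:

raw_keys = {"hf_train_files": "", "hf_access_token": "", "per_device_batch_size": 1}

# Empty-string HF settings are treated as "not provided".
for key in ("hf_train_files", "hf_access_token"):
  if key in raw_keys and not raw_keys[key]:
    raw_keys[key] = None

assert raw_keys["hf_train_files"] is None
assert raw_keys["hf_access_token"] is None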