AI-Hypercomputer · coolkp · Sep 22, 2025 · Sep 22, 2025
@@ -52,9 +52,13 @@ jobs:
     - name: Analysing the code with ruff
       run: |
         ruff check .
+    - name: version check
+      run: |
+        python --version
+        pip show jax jaxlib flax transformers datasets tensorflow tensorflow_datasets
     - name: PyTest
-      run: | 
-        HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ python3 -m pytest --deselect=src/maxdiffusion/tests/ltx_transformer_step_test.py --deselect=src/maxdiffusion/tests/input_pipeline_interface_test.py -x
+      run: | # input_pipeline_interface_test.py is no longer deselected; tokenizer parallelism is disabled via TOKENIZERS_PARALLELISM=false
+        HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ TOKENIZERS_PARALLELISM=false python3 -m pytest --deselect=src/maxdiffusion/tests/ltx_transformer_step_test.py  -x
 #  add_pull_ready:
 #    if: github.ref != 'refs/heads/main'
 #    permissions:

@@ -22,7 +22,7 @@
 from maxdiffusion import multihost_dataloading, max_logging
 
 AUTOTUNE = tf.data.AUTOTUNE
-
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 def load_as_tf_dataset(dataset, global_batch_size, shuffle, dataloading_host_count):
   dataset = dataset.with_format("tensorflow")[:]
@@ -50,7 +50,7 @@ def make_tf_iterator(
         function=tokenize_fn,
         batched=True,
         remove_columns=[config.caption_column],
-        num_proc=1 if config.cache_latents_text_encoder_outputs else config.tokenize_captions_num_proc,
+        num_proc=None,
         desc="Running tokenizer on train dataset",
     )
     # need to do it before load_as_tf_dataset
@@ -60,7 +60,7 @@ def make_tf_iterator(
         function=image_transforms_fn,
         batched=True,
         remove_columns=[config.image_column],
-        num_proc=1 if config.cache_latents_text_encoder_outputs else config.transform_images_num_proc,
+        num_proc=None,
         desc="Transforming images",
     )
     if config.cache_latents_text_encoder_outputs:

@@ -40,7 +40,7 @@
 from PIL import Image
 
 AUTOTUNE = tf.data.experimental.AUTOTUNE
-
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 def make_data_iterator(
     config,
@@ -159,7 +159,7 @@ def make_dreambooth_train_iterator(config, mesh, global_batch_size, tokenizer, v
         function=tokenize_fn,
         batched=True,
         remove_columns=[INSTANCE_PROMPT_IDS],
-        num_proc=1,
+        num_proc=None,
         desc="Running tokenizer on instance dataset",
     )
     rng = jax.random.key(config.seed)
@@ -177,7 +177,7 @@ def make_dreambooth_train_iterator(config, mesh, global_batch_size, tokenizer, v
         function=transform_images_fn,
         batched=True,
         remove_columns=[INSTANCE_IMAGES],
-        num_proc=1,
+        num_proc=None,
         desc="Running vae on instance dataset",
     )
 
@@ -188,7 +188,7 @@ def make_dreambooth_train_iterator(config, mesh, global_batch_size, tokenizer, v
         function=tokenize_fn,
         batched=True,
         remove_columns=[CLASS_PROMPT_IDS],
-        num_proc=1,
+        num_proc=None,
         desc="Running tokenizer on class dataset",
     )
     transform_images_fn = partial(
@@ -204,7 +204,7 @@ def make_dreambooth_train_iterator(config, mesh, global_batch_size, tokenizer, v
         function=transform_images_fn,
         batched=True,
         remove_columns=[CLASS_IMAGES],
-        num_proc=1,
+        num_proc=None,
         desc="Running vae on instance dataset",
     )
 

@@ -21,9 +21,7 @@
 import subprocess
 import unittest
 from absl.testing import absltest
-
 import numpy as np
-import pytest
 import tensorflow as tf
 import tensorflow.experimental.numpy as tnp
 import jax
@@ -70,7 +68,6 @@ class InputPipelineInterface(unittest.TestCase):
   def setUp(self):
     InputPipelineInterface.dummy_data = {}
 
-  @pytest.mark.skip(reason="Debug segfault")
   def test_make_dreambooth_train_iterator(self):
 
     instance_class_gcs_dir = "gs://maxdiffusion-github-runner-test-assets/datasets/dreambooth/instance_class"
@@ -85,6 +82,7 @@ def test_make_dreambooth_train_iterator(self):
             os.path.join(THIS_DIR, "..", "configs", "base14.yml"),
             "cache_latents_text_encoder_outputs=True",
             "dataset_name=my_dreambooth_dataset",
+            "transform_images_num_proc=1",
             f"instance_data_dir={instance_class_local_dir}",
             f"class_data_dir={class_class_local_dir}",
             "instance_prompt=photo of ohwx dog",