interactive plotting (handle dataset sizes that are not a multiple of the batch size)

mluerig · mluerig · commit f0713e8627ac · 2025-03-17T18:26:46.000-04:00
diff --git a/bioencoder/core/utils.py b/bioencoder/core/utils.py
@@ -227,13 +227,13 @@ def build_loaders(data_dir, transforms, batch_sizes, num_workers, second_stage=F
         second_stage=True
     )
 
-    train_features_loader = torch.utils.data.DataLoader(
+    train_loader = torch.utils.data.DataLoader(
         train_features_dataset, 
         batch_size=batch_sizes['train_batch_size'], 
         shuffle=True,
         num_workers=num_workers, 
         pin_memory=True, 
-        drop_last=True
+        drop_last=(batch_sizes['train_batch_size'] is not None)
     )
         
     valid_loader = torch.utils.data.DataLoader(
@@ -242,11 +242,11 @@ def build_loaders(data_dir, transforms, batch_sizes, num_workers, second_stage=F
         shuffle=False,
         num_workers=num_workers, 
         pin_memory=True, 
-        drop_last=True
+        drop_last=(batch_sizes['valid_batch_size'] is not None)
     )
     
     loaders = {
-        'train_features_loader': train_features_loader, 
+        'train_loader': train_loader, 
         'valid_loader': valid_loader
     }
 
diff --git a/bioencoder/scripts/interactive_plots.py b/bioencoder/scripts/interactive_plots.py
@@ -46,94 +46,90 @@ def interactive_plots(
 
     """
         
-    ## load bioencoer config
-    root_dir = config.root_dir
-    run_name = config.run_name
-    
-    ## load config
+    ## Load Bioencoder config
+    root_dir, run_name = config.root_dir, config.run_name
     hyperparams = utils.load_yaml(config_path)
     
-    ## parse config
+    ## Parse config
     backbone = hyperparams["model"]["backbone"]
     num_classes = hyperparams["model"].get("num_classes", None)
     checkpoint = hyperparams["model"].get("checkpoint", "swa")
-    stage = hyperparams["model"].get("stage", "first")
+    stage = hyperparams.get("model", {}).get("stage", "first")
+    
     batch_sizes = {
-        "train_batch_size": hyperparams["dataloaders"]["train_batch_size"],
-        "valid_batch_size": hyperparams["dataloaders"]["valid_batch_size"],
+        "train_batch_size": hyperparams.get("dataloaders", {}).get("train_batch_size"),
+        "valid_batch_size": hyperparams.get("dataloaders", {}).get("valid_batch_size",1),
     }
-    num_workers = hyperparams["dataloaders"]["num_workers"]
-    color_classes = hyperparams.get("color_classes", None)
-    color_map = hyperparams.get("color_map", "jet")
-    plot_style = hyperparams.get("plot_style", 1)
-    point_size = hyperparams.get("point_size", 10)
-    perplexity = hyperparams.get("perplexity", None)
+    num_workers = hyperparams.get("dataloaders", {}).get("num_workers", 4)
+    perplexity = hyperparams.get("perplexity", 30)
 
-    ## set up dirs
-    data_dir = os.path.join(root_dir,"data",  run_name)
-    plot_dir = os.path.join(root_dir, "plots", run_name)
-    os.makedirs(plot_dir, exist_ok=True)
+    plot_config = {
+        "color_classes": hyperparams.get("color_classes", None),
+        "color_map": hyperparams.get("color_map", "jet"),
+        "plot_style": hyperparams.get("plot_style", 1),
+        "point_size": hyperparams.get("point_size", 10),
+    }
+    
     
-    ## plot path
-    plot_path = os.path.join(plot_dir, f"embeddings_{run_name}.html")
+    ## Set up directories
+    data_dir = os.path.join(root_dir, "data", run_name)
+    plot_path = os.path.join(root_dir, "plots", run_name, f"embeddings_{run_name}.html")
     if not overwrite and not kwargs.get("ret_embeddings"):
         assert not os.path.isfile(plot_path), f"File exists: {plot_path}"
     
-    ## load weights
+    ## Load model and set up
     print(f"Checkpoint: using {checkpoint} of {stage} stage")
-    ckpt_pretrained = os.path.join(config.root_dir, "weights", run_name, stage, checkpoint)
-
-    ## set random seed
+    ckpt_pretrained = os.path.join(root_dir, "weights", run_name, stage, checkpoint)
     utils.set_seed()
-
-    ## extract embeddings
     transforms = utils.build_transforms(hyperparams)
-    loaders = utils.build_loaders(
-        data_dir, transforms, batch_sizes, num_workers, second_stage=(stage == "second")
-    )
-    model = utils.build_model(
-        backbone,
-        second_stage=(stage == "second"),
-        num_classes=num_classes,
-        ckpt_pretrained=ckpt_pretrained,
-    ).cuda()
+    loaders = utils.build_loaders(data_dir, transforms, batch_sizes, num_workers, second_stage=(stage == "second"))
+    model = utils.build_model(backbone, second_stage=(stage == "second"), num_classes=num_classes, ckpt_pretrained=ckpt_pretrained).cuda()
     model.use_projection_head(False)
     model.eval()
-    embeddings_train, labels_train = utils.compute_embeddings(
-        loaders["valid_loader"], model
-    )
-    
-    ## load dataset
-    rel_paths_train = [item[0][len(root_dir) + 1:] for item in loaders["valid_loader"].dataset.imgs]
-       
-    ## return embeddings without plotting
+    
+    ## Determine which embeddings to compute
+    embeddings, labels, rel_paths = [], [], []
+    
+    ## validation set embeddings (the validation batch size can't be zero; it defaults to 1)
+    embeddings_val, labels_val = utils.compute_embeddings(loaders["valid_loader"], model)
+    if len(embeddings_val) < len(loaders["valid_loader"].dataset.imgs):
+        missed_imgs = len(loaders["valid_loader"].dataset.imgs) - len(embeddings_val)
+        print(f"Warning: missed {missed_imgs} images because batch size was not a multiple of validation dataset size.")
+    rel_paths_val = [item[0][len(root_dir) + 1:] for item in loaders["valid_loader"].dataset.imgs[:len(embeddings_val)]]
+    embeddings.extend(embeddings_val)
+    labels.extend(labels_val)
+    rel_paths.extend(rel_paths_val)
+    
+    ## train set embeddings
+    if batch_sizes["train_batch_size"] is not None:
+        embeddings_train, labels_train = utils.compute_embeddings(loaders["train_loader"], model)
+        if len(embeddings_train) < len(loaders["train_loader"].dataset.imgs):
+            missed_imgs = len(loaders["train_loader"].dataset.imgs) - len(embeddings_train)
+            print(f"Warning: missed {missed_imgs} images because batch size was not a multiple of training dataset size.")
+        rel_paths_train = [item[0][len(root_dir) + 1:] for item in loaders["train_loader"].dataset.imgs[:len(embeddings_train)]]
+        embeddings.extend(embeddings_train)
+        labels.extend(labels_train)
+        rel_paths.extend(rel_paths_train)
+    
+    ## Return embeddings without plotting
     if kwargs.get("ret_embeddings"):
+        df = pd.DataFrame({"image_name": [os.path.basename(p) for p in rel_paths], "class": [os.path.basename(os.path.dirname(p)) for p in rel_paths]})
+        return pd.concat([df, pd.DataFrame(embeddings)], axis=1)
         
-        df = pd.DataFrame([os.path.basename(item) for item in rel_paths_train], columns=["image_name"])
-        df["class"] = [
-            os.path.basename(os.path.dirname(item[0])) for item in loaders["valid_loader"].dataset.imgs
-        ]
-        return pd.concat([df, pd.DataFrame(embeddings_train)], axis=1)
-    
-    ## reduce dimensionality
-    perplexity = perplexity if perplexity else min(100, len(embeddings_train) // 2)
-    reduced_data, colnames, _ = helpers.embbedings_dimension_reductions(
-        embeddings_train, perplexity
-    )       
-    df = pd.DataFrame(reduced_data, columns=colnames)
-    df["paths"] = [ os.path.join("..", "..", item) for item in rel_paths_train]
-    df["class"] = labels_train
-    df["class_str"] = [
-        os.path.basename(os.path.dirname(item[0])) for item in loaders["valid_loader"].dataset.imgs
-    ]
-    
-    ## check if color matches n classes
-    if color_classes:
-        assert len(np.unique(labels_train)) == len(color_classes), f"Number of classes is {len(np.unique(labels_train))}, but you only provided {len(color_classes)} colors"
-    
-    helpers.bokeh_plot(df, out_path=plot_path, color_map=color_map, color_classes=color_classes, 
-                       plot_style=plot_style, point_size=point_size)
+    ## Reduce dimensionality
+    if not perplexity:
+        perplexity = min(100, len(embeddings) // 2)
+        print(f"tSNE: using a perplexity value of {perplexity}")
+    reduced_data, colnames, _ = helpers.embbedings_dimension_reductions(embeddings, perplexity)
     
+    ## make plot
+    df = pd.DataFrame(reduced_data, columns=colnames)
+    df["paths"] = [os.path.join("..", "..", p) for p in rel_paths]
+    df["class"], df["class_str"] = labels, [os.path.basename(os.path.dirname(p)) for p in rel_paths]
+    df["dataset"] = df["paths"].apply(lambda x: "validation" if "/val/" in x else "train")
+        
+    helpers.bokeh_plot(df, out_path=plot_path, **plot_config)
+
     
 def cli():
         
diff --git a/bioencoder/scripts/lr_finder.py b/bioencoder/scripts/lr_finder.py
@@ -118,7 +118,7 @@ def lr_finder(
         optim["scheduler"],
     )
     lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
-    lr_finder.range_test(loaders["train_features_loader"], end_lr=1, num_iter=num_iter)
+    lr_finder.range_test(loaders["train_loader"], end_lr=1, num_iter=num_iter)
 
     fig, ax = plt.subplots()
     ax, lr = lr_finder.plot(ax=ax, skip_start=skip_start, skip_end=skip_end)
diff --git a/bioencoder/scripts/split_dataset.py b/bioencoder/scripts/split_dataset.py
@@ -16,7 +16,7 @@ def split_dataset(
         image_dir, 
         mode="flat",
         val_percent=0.1, 
-        max_ratio=7,
+        max_ratio=10,
         min_per_class=20,
         random_seed=42,
         dry_run=False,
@@ -205,11 +205,11 @@ def cli():
     parser = argparse.ArgumentParser()
     parser.add_argument("--image-dir", type=str, help="Path to the images directory sorted into class-specific subfolders.")
     parser.add_argument("--mode", type=str, choices=['flat', 'random', 'fixed'], default='flat', help="Type of dataset split to perform.")
-    parser.add_argument("--val_percent", type=float, default=0.1, help="Percentage of data to use as validation set.")
-    parser.add_argument("--max_ratio", type=int, default=7, help="Maximum ratio between the most and least abundant classes.")
-    parser.add_argument("--min_per_class", type=int, default=20, help="Minimum number of images per class.")
-    parser.add_argument("--random_seed", type=int, default=42, help="Seed for random number generator.")
-    parser.add_argument("--dry_run", action='store_true', help="Run without making any changes.")
+    parser.add_argument("--val-percent", type=float, default=0.1, help="Percentage of data to use as validation set.")
+    parser.add_argument("--max-ratio", type=int, default=7, help="Maximum ratio between the most and least abundant classes.")
+    parser.add_argument("--min-per-class", type=int, default=20, help="Minimum number of images per class.")
+    parser.add_argument("--random-seed", type=int, default=42, help="Seed for random number generator.")
+    parser.add_argument("--dry-run", action='store_true', help="Run without making any changes.")
     parser.add_argument("--overwrite", action='store_true', help="Overwrite existing files without asking.")
     args = parser.parse_args()
     
diff --git a/bioencoder/scripts/swa.py b/bioencoder/scripts/swa.py
@@ -112,7 +112,7 @@ def swa(
 
     if stage == "first":
         valid_metrics = utils.validation_constructive(
-            loaders["valid_loader"], loaders["train_features_loader"], model, scaler
+            loaders["valid_loader"], loaders["train_loader"], model, scaler
         )
     else:
         valid_metrics = utils.validation_ce(
diff --git a/bioencoder/scripts/train.py b/bioencoder/scripts/train.py
@@ -219,7 +219,7 @@ def train(
         optim["loss_optimizer"],
     )
     if ema:
-        iters = len(loaders["train_features_loader"])
+        iters = len(loaders["train_loader"])
         ema_decay = ema_decay_per_epoch ** (1 / iters)
         ema = ExponentialMovingAverage(model.parameters(), decay=ema_decay)
 
@@ -244,7 +244,7 @@ def train(
                 )
             else:
                 train_metrics = utils.train_epoch_ce(
-                    loaders["train_features_loader"],
+                    loaders["train_loader"],
                     model,
                     criterion,
                     optimizer,
@@ -261,7 +261,7 @@ def train(
     
             if stage == "first":
                 valid_metrics_projection_head = utils.validation_constructive(
-                    loaders["valid_loader"], loaders["train_features_loader"], model, scaler
+                    loaders["valid_loader"], loaders["train_loader"], model, scaler
                 )
                 
                 ## check for GPU parallelization
@@ -270,7 +270,7 @@ def train(
                 #model_copy.use_projection_head(False)
                 model.use_projection_head(False)
                 valid_metrics_encoder = utils.validation_constructive(
-                    loaders["valid_loader"], loaders["train_features_loader"], model, scaler
+                    loaders["valid_loader"], loaders["train_loader"], model, scaler
                 )
                 model.use_projection_head(True)
                 #model_copy.use_projection_head(True)
diff --git a/bioencoder/vis/helpers.py b/bioencoder/vis/helpers.py
@@ -1,19 +1,25 @@
+
+#%% imports
+
 import numpy as np
 import pandas as pd
 import cv2
 import torch
 from torchvision import transforms
 from sklearn import decomposition, manifold
 import matplotlib.pyplot as plt
-from bokeh.models import (LassoSelectTool, PanTool,
-                          ResetTool, Div, CustomJS, 
-                          HoverTool, WheelZoomTool)
-TOOLS = [LassoSelectTool, PanTool, WheelZoomTool, ResetTool]
-from bokeh.models import ColumnDataSource
+from bokeh.models import (LassoSelectTool, PanTool,ResetTool, Div, CustomJS, HoverTool, WheelZoomTool,
+                          ColumnDataSource, Legend, LegendItem)
 from bokeh import plotting as bplot
+from bokeh.transform import factor_mark
 from bokeh.io import show
 from bokeh import layouts
 
+TOOLS = [LassoSelectTool, PanTool, WheelZoomTool, ResetTool]
+
+
+#%% functions
+
 
 def preprocess_image(img):
     """
@@ -236,7 +242,7 @@ def embbedings_dimension_reductions(data_table, perplexity):
     return np.hstack((pca, tsne)), names, pca_obj
 
 
-def bokeh_plot(df, out_path='plot.html', color_map="jet", color_classes=None, plot_style=1, 
+def bokeh_plot(df, out_path='plot.html', color_map="viridis", color_classes=None, plot_style=1, 
                point_size=10, **kwargs):
     """
     Plot a scatter plot of the PCA and t-SNE dimensions of the data using bokeh.
@@ -256,18 +262,25 @@ class labels of the images).
     
     if not all(col in df.columns for col in ['paths', 'class']):
         raise ValueError("The dataframe must have columns 'paths' and 'class'")      
-        
-    df['image_files'] =  df['paths']
-    
-    ## color management
+   
+    unique_classes = df['class'].unique()
+ 
+   
+    ## Color management
     if color_classes:
+        assert len(unique_classes) == len(color_classes), (
+            f"Number of classes is {len(unique_classes)}, but only {len(color_classes)} colors provided."
+        )
+
+        # Convert dict to DataFrame and merge colors
         df_col = pd.DataFrame.from_dict(color_classes.items())
-        df_col.columns = ["class_str","color"]
-        df = df.merge(df_col)
+        df_col.columns = ["class_str", "color"]
+        df = df.merge(df_col, how="left", left_on="class", right_on="class_str").drop(columns=["class_str"])
+
     else:
-        num_classes = len(df['class'].unique())
-        cmap=plt.cm.get_cmap(color_map, num_classes)
-        colors_raw = cmap((df['class']), bytes=True)
+        num_classes = len(unique_classes)
+        cmap = plt.cm.get_cmap(color_map, num_classes)
+        colors_raw = cmap(df['class'], bytes=True)
         colors_str = ['#%02x%02x%02x' % tuple(c[:3]) for c in colors_raw]
         df['color'] = colors_str
         
@@ -279,7 +292,7 @@ class labels of the images).
         <div>
             <div>
                 <img
-                src="@image_files" height="192" alt="image"
+                src="@paths" height="192" alt="image"
                 style="float: left; margin: 0px 15px 15px 0px; image-rendering: pixelated;"
                 border="2"
                 ></img>
@@ -312,8 +325,8 @@ class labels of the images).
             const indices = hit_test_result.indices;
             if (indices.length > 0) {
                 div.text = `<img 
-                src="${ds.data['image_files'][indices[0]]}"
-                style="float: left; margin: 0px 15px 15px 0px; max-width: 650px; max-height: 650px; width: auto; height: auto;"
+                src="${ds.data['paths'][indices[0]]}"
+                style="float: left; margin: 0px 15px 15px 0px; max-width: 650px; max-height: 500px; width: auto; height: auto;"
                 border="2"
                 />`;
             }

Original file line number	Diff line number	Diff line change
`@@ -118,7 +118,7 @@ def lr_finder(`
`118`	`118`	`optim["scheduler"],`
`119`	`119`	`)`
`120`	`120`	`lr_finder = LRFinder(model, optimizer, criterion, device="cuda")`
`121`		`- lr_finder.range_test(loaders["train_features_loader"], end_lr=1, num_iter=num_iter)`
	`121`	`+ lr_finder.range_test(loaders["train_loader"], end_lr=1, num_iter=num_iter)`
`122`	`122`
`123`	`123`	`fig, ax = plt.subplots()`
`124`	`124`	`ax, lr = lr_finder.plot(ax=ax, skip_start=skip_start, skip_end=skip_end)`
Original file line number	Diff line number	Diff line change
`@@ -112,7 +112,7 @@ def swa(`
`112`	`112`
`113`	`113`	`if stage == "first":`
`114`	`114`	`valid_metrics = utils.validation_constructive(`
`115`		`- loaders["valid_loader"], loaders["train_features_loader"], model, scaler`
	`115`	`+ loaders["valid_loader"], loaders["train_loader"], model, scaler`
`116`	`116`	`)`
`117`	`117`	`else:`
`118`	`118`	`valid_metrics = utils.validation_ce(`