Skip to content

Commit 0d1e0ce

Browse files
committed
fixed interactive plots (img size), drop last = F
1 parent 62ee214 commit 0d1e0ce

5 files changed

Lines changed: 58 additions & 41 deletions

File tree

bioencoder/core/augmentations.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@ def get_transforms(config, valid=False):
1313
Returns:
1414
albumentations.core.composition.Compose: The image transformation pipeline.
1515
"""
16-
default_size = 224
17-
img_size = config.get('img_size', default_size)
16+
17+
img_size = config.get('img_size')
18+
if img_size is None:
19+
raise ValueError("config must include 'img_size'")
1820
config_aug = config.get('augmentations', {})
1921
aug = get_aug_from_config(config_aug.get('transforms', []))
2022

bioencoder/core/utils.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,9 @@ def build_transforms(config):
196196
}
197197

198198

199-
def build_loaders(data_dir, transforms, batch_sizes, num_workers, second_stage=False, is_supcon=False):
199+
def build_loaders(data_dir, transforms, batch_sizes, num_workers,
200+
second_stage=False, is_supcon=False,
201+
shuffle_train=True, drop_last=True):
200202
"""
201203
Build data loaders for training and validation.
202204
@@ -230,19 +232,19 @@ def build_loaders(data_dir, transforms, batch_sizes, num_workers, second_stage=F
230232
train_loader = torch.utils.data.DataLoader(
231233
train_features_dataset,
232234
batch_size=batch_sizes['train_batch_size'],
233-
shuffle=True,
235+
shuffle=shuffle_train,
234236
num_workers=num_workers,
235237
pin_memory=True,
236-
drop_last=(batch_sizes['train_batch_size'] is not None)
238+
drop_last=drop_last and batch_sizes['train_batch_size'] is not None
237239
)
238-
240+
239241
valid_loader = torch.utils.data.DataLoader(
240242
valid_dataset,
241243
batch_size=batch_sizes['valid_batch_size'],
242244
shuffle=False,
243245
num_workers=num_workers,
244246
pin_memory=True,
245-
drop_last=(batch_sizes['valid_batch_size'] is not None)
247+
drop_last=drop_last
246248
)
247249

248250
loaders = {

bioencoder/scripts/interactive_plots.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ def interactive_plots(
7070
"point_size": hyperparams.get("point_size", 10),
7171
}
7272

73-
7473
## directories and file management
7574
data_dir = os.path.join(root_dir, "data", run_name)
7675
plot_dir = os.path.join(root_dir, "plots", run_name)
@@ -83,32 +82,28 @@ def interactive_plots(
8382
print(f"Checkpoint: using {checkpoint} of {stage} stage")
8483
ckpt_pretrained = os.path.join(root_dir, "weights", run_name, stage, checkpoint)
8584
utils.set_seed()
86-
transforms = utils.build_transforms(hyperparams)
87-
loaders = utils.build_loaders(data_dir, transforms, batch_sizes, num_workers, second_stage=(stage == "second"))
8885
model = utils.build_model(backbone, second_stage=(stage == "second"), num_classes=num_classes, ckpt_pretrained=ckpt_pretrained).cuda()
8986
model.use_projection_head(False)
9087
model.eval()
9188

92-
## Determine which embeddings to compute
89+
## prep computation
90+
transforms = utils.build_transforms(hyperparams)
91+
loaders = utils.build_loaders(
92+
data_dir, transforms, batch_sizes, num_workers,
93+
second_stage=(stage == "second"), drop_last=False, shuffle_train=False)
9394
embeddings, labels, rel_paths = [], [], []
9495

95-
## val batch size cant be zero
96+
## val set - batch size cant be zero
9697
embeddings_val, labels_val = utils.compute_embeddings(loaders["valid_loader"], model)
97-
if len(embeddings_val) < len(loaders["valid_loader"].dataset.imgs):
98-
missed_imgs = len(loaders["valid_loader"].dataset.imgs) - len(embeddings_val)
99-
print(f"Warning: missed {missed_imgs} images because batch size was not a multiple of validation dataset size.")
100-
rel_paths_val = [item[0][len(root_dir) + 1:] for item in loaders["valid_loader"].dataset.imgs[:len(embeddings_val)]]
98+
rel_paths_val = [item[0][len(root_dir) + 1:] for item in loaders["valid_loader"].dataset.imgs]
10199
embeddings.extend(embeddings_val)
102100
labels.extend(labels_val)
103101
rel_paths.extend(rel_paths_val)
104102

105-
## train set embeddings
103+
## train set - skipped if zero batch size
106104
if batch_sizes["train_batch_size"] is not None:
107105
embeddings_train, labels_train = utils.compute_embeddings(loaders["train_loader"], model)
108-
if len(embeddings_train) < len(loaders["train_loader"].dataset.imgs):
109-
missed_imgs = len(loaders["train_loader"].dataset.imgs) - len(embeddings_train)
110-
print(f"Warning: missed {missed_imgs} images because batch size was not a multiple of training dataset size.")
111-
rel_paths_train = [item[0][len(root_dir) + 1:] for item in loaders["train_loader"].dataset.imgs[:len(embeddings_train)]]
106+
rel_paths_train = [item[0][len(root_dir) + 1:] for item in loaders["train_loader"].dataset.imgs]
112107
embeddings.extend(embeddings_train)
113108
labels.extend(labels_train)
114109
rel_paths.extend(rel_paths_train)
@@ -120,7 +115,7 @@ def interactive_plots(
120115

121116
## Reduce dimensionality
122117
if not perplexity:
123-
perplexity = min(100, len(embeddings) // 2)
118+
perplexity = min(30, max(5, (len(embeddings) - 1) / 3))
124119
print(f"tSNE: using a perplexity value of {perplexity}")
125120
reduced_data, colnames, _ = helpers.embbedings_dimension_reductions(embeddings, perplexity)
126121

@@ -129,7 +124,6 @@ def interactive_plots(
129124
df["paths"] = [os.path.join("..", "..", p) for p in rel_paths]
130125
df["class"], df["class_str"] = labels, [os.path.basename(os.path.dirname(p)) for p in rel_paths]
131126
df["dataset"] = df["paths"].apply(lambda x: "validation" if "/val/" in x else "train")
132-
133127
helpers.bokeh_plot(df, out_path=plot_path, **plot_config)
134128

135129

bioencoder/vis/helpers.py

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,9 @@ class labels of the images).
264264
raise ValueError("The dataframe must have columns 'paths' and 'class'")
265265

266266
unique_classes = df['class'].unique()
267-
267+
unique_datasets = df['dataset'].unique()
268+
markers = ['circle', 'square'] # Define markers for each group
269+
268270
## Color management
269271
if color_classes:
270272
assert len(unique_classes) == len(color_classes), (
@@ -283,10 +285,12 @@ class labels of the images).
283285
colors_str = ['#%02x%02x%02x' % tuple(c[:3]) for c in colors_raw]
284286
df['color'] = colors_str
285287

288+
286289
source = ColumnDataSource(df)
287290
bplot.output_file(out_path)
288291

289292
if plot_style == 1:
293+
div = Div(text="")
290294
tooltip = """
291295
<div>
292296
<div>
@@ -305,18 +309,12 @@ class labels of the images).
305309
hover1 = HoverTool(tooltips=tooltip)
306310
tools0 = [t() for t in TOOLS] + [hover0]
307311
tools1 = [t() for t in TOOLS] + [hover1]
308-
pca = bplot.figure(tools=tools0)
309-
pca.scatter('PC1', 'PC2', color='color', source=source, size=point_size)
310-
tsne = bplot.figure(tools=tools1)
311-
tsne.scatter('tSNE-0', 'tSNE-1', color='color', source=source, size=point_size)
312-
p = bplot.gridplot([[pca, tsne]])
313-
bplot.show(p)
314-
312+
315313
elif plot_style == 2:
316314
div = Div(text="")
317315
hover=HoverTool(
318316
tooltips = [
319-
("class_str", "@class_str"),
317+
("Class", "@class_str"),
320318
]
321319
)
322320
hover.callback = CustomJS(args=dict(div=div, ds=source), code="""
@@ -332,11 +330,28 @@ class labels of the images).
332330
""")
333331
tools0 = [t() for t in TOOLS] + [hover]
334332
tools1 = [t() for t in TOOLS] + [hover]
335-
pca = bplot.figure(tools=tools0)
336-
pca.scatter('PC1', 'PC2', color='color', source=source, size=point_size)
337-
tsne = bplot.figure(tools=tools1)
338-
tsne.scatter('tSNE-0', 'tSNE-1', color='color', source=source, size=point_size)
339-
p = bplot.gridplot([[pca, tsne]])
340-
show(layouts.row(p, div))
333+
334+
# Create figures
335+
pca = bplot.figure(tools=tools0, title="PCA", match_aspect=True)
336+
tsne = bplot.figure(tools=tools1, title="t-SNE", match_aspect=True)
337+
338+
# Store renderers for dataset legend
339+
legend_items_dataset = []
340+
341+
# Scatter plots with different markers for datasets
342+
for dataset, marker in zip(unique_datasets, markers):
343+
dataset_source = ColumnDataSource(df[df['dataset'].astype(str) == dataset]) # Filter dataset-specific data
344+
r = pca.scatter('PC1', 'PC2', source=dataset_source, color='color', size=point_size, marker=marker)
345+
tsne.scatter('tSNE-0', 'tSNE-1', source=dataset_source, color='color', size=point_size, marker=marker)
346+
legend_items_dataset.append(LegendItem(label=str(dataset), renderers=[r]))
347+
348+
# Create and add horizontal legend for dataset markers
349+
legend_dataset = Legend(items=legend_items_dataset, orientation="horizontal")
350+
pca.add_layout(legend_dataset, 'below')
351+
352+
# Display plots
353+
p = bplot.gridplot([[pca, tsne]])
354+
show(layouts.row(p, div))
355+
341356

342357
return p

bioencoder_configs/plot_stage1.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,15 @@ model:
44
stage: first # Training stage: 'first' for initial training, 'second' for fine-tuning
55

66
dataloaders:
7-
train_batch_size: # Larger is faster but may drop leftover data points - no value or removing this line will not include training data
8-
valid_batch_size: 10 # Larger is faster but may drop leftover data points - ideally use a multiple of val set size
7+
train_batch_size: 20 # Larger is faster; no value or removing this line will not include training data
8+
valid_batch_size: 20 # Larger is faster; val data is always plotted
99
num_workers: 32 # Should not exceed available CPU cores
1010

11-
plot_style: 1 # (1: pictogram above point, 2: pictogram next to plot panel)
11+
img_size: 384 # image size used for training
12+
13+
perplexity: 30 # for tSNE; cannot be larger than the dataset size
14+
15+
plot_style: 2 # (1: pictogram above point, 2: pictogram next to plot panel)
1216
point_size: 10 ## size of points in scatter plot
1317
color_map: 'Set1' # Default color map; see https://matplotlib.org/stable/users/explain/colors/colormaps.html for options
1418
#color_classes: # overrides color_map

0 commit comments

Comments
 (0)