fix interactive plotting: stable row order, seeded t-SNE, single-source scatter

mluerig · mluerig · commit 632879b6c067 · 2025-10-28T20:12:25.000-04:00
diff --git a/bioencoder/scripts/interactive_plots.py b/bioencoder/scripts/interactive_plots.py
@@ -69,19 +69,21 @@ def interactive_plots(
         "plot_style": hyperparams.get("plot_style", 1),
         "point_size": hyperparams.get("point_size", 10),
     }
-    
+
+    return_results = hyperparams.get("return_results", False)
+
     ## directories and file management
     data_dir = os.path.join(root_dir, "data", run_name)
     plot_dir = os.path.join(root_dir, "plots", run_name)
     os.makedirs(plot_dir, exist_ok=True)
     plot_path = os.path.join(plot_dir, "embeddings_interactive_plot.html")
-    if not overwrite and not kwargs.get("ret_embeddings"):
+    if not overwrite and not (return_embeddings or return_coords):
         assert not os.path.isfile(plot_path), f"File already exists: {plot_path}"
     
     ## Load model and set up
     print(f"Checkpoint: using {checkpoint} of {stage} stage")
     ckpt_pretrained = os.path.join(root_dir, "weights", run_name, stage, checkpoint)
-    utils.set_seed()
+    seed = utils.set_seed()
     model = utils.build_model(backbone, second_stage=(stage == "second"), num_classes=num_classes, ckpt_pretrained=ckpt_pretrained).cuda()
     model.use_projection_head(False)
     model.eval()
@@ -91,40 +93,57 @@ def interactive_plots(
     loaders = utils.build_loaders(
         data_dir, transforms, batch_sizes, num_workers, 
         second_stage=(stage == "second"), drop_last=False, shuffle_train=False)
-    embeddings, labels, rel_paths = [], [], []
     
-    ## val set - batch size cant be zero
+    ## val set (always computed)
     embeddings_val, labels_val = utils.compute_embeddings(loaders["valid_loader"], model)
     rel_paths_val = [item[0][len(root_dir) + 1:] for item in loaders["valid_loader"].dataset.imgs]
-    embeddings.extend(embeddings_val)
-    labels.extend(labels_val)
-    rel_paths.extend(rel_paths_val)
+    # Build validation DataFrame (meta + embeddings)
+    df_val_meta = pd.DataFrame({
+        "image_name": [os.path.basename(p) for p in rel_paths_val],
+        "class_str": [os.path.basename(os.path.dirname(p)) for p in rel_paths_val],
+        "dataset": "val",
+    })
+    df_embeddings = pd.concat([df_val_meta, pd.DataFrame(embeddings_val)], axis=1)
     
     ## train set - skipped if zero batch size
     if batch_sizes["train_batch_size"] is not None:
         embeddings_train, labels_train = utils.compute_embeddings(loaders["train_loader"], model)
         rel_paths_train = [item[0][len(root_dir) + 1:] for item in loaders["train_loader"].dataset.imgs]
-        embeddings.extend(embeddings_train)
-        labels.extend(labels_train)
-        rel_paths.extend(rel_paths_train)
-    
-    ## Return embeddings without plotting
-    if kwargs.get("ret_embeddings"):
-        df = pd.DataFrame({"image_name": [os.path.basename(p) for p in rel_paths], "class": [os.path.basename(os.path.dirname(p)) for p in rel_paths]})
-        return pd.concat([df, pd.DataFrame(embeddings)], axis=1)
         
+        # Build training DataFrame (meta + embeddings)
+        df_train_meta = pd.DataFrame({
+            "image_name": [os.path.basename(p) for p in rel_paths_train],
+            "class_str": [os.path.basename(os.path.dirname(p)) for p in rel_paths_train],
+            "dataset": "train",
+        })
+        df_train = pd.concat([df_train_meta, pd.DataFrame(embeddings_train)], axis=1)
+        df_embeddings = pd.concat([df_embeddings, df_train], ignore_index=True)
+
+    ## Stable order before reduction
+    df_embeddings = df_embeddings.sort_values(by=["class_str", "dataset","image_name"]).reset_index(drop=True)
+
     ## Reduce dimensionality
     if not perplexity:
-        perplexity = min(30, max(5, (len(embeddings) - 1) / 3))
-        print(f"tSNE: using a perplexity value of {perplexity}")
-    reduced_data, colnames, _ = helpers.embbedings_dimension_reductions(embeddings, perplexity)
+        perplexity = min(30, max(5, (len(df_embeddings) - 1) / 3))
+    print(f"tSNE: using perplexity {perplexity}")
+    # Reduce on numeric embedding columns only
+    embedding_matrix = df_embeddings.select_dtypes(include=[np.number])
+    reduced_data, colnames, _ = helpers.embbedings_dimension_reductions(embedding_matrix, perplexity, seed)
     
     ## make plot
-    df = pd.DataFrame(reduced_data, columns=colnames)
-    df["paths"] = [os.path.join("..", "..", p) for p in rel_paths]
-    df["class"], df["class_str"] = labels, [os.path.basename(os.path.dirname(p)) for p in rel_paths]
-    df["dataset"] = df["paths"].apply(lambda x: "validation" if "/val/" in x else "train")
-    helpers.bokeh_plot(df, out_path=plot_path, **plot_config)
+    df_plot = df_embeddings.select_dtypes(exclude=[np.number])
+    df_plot['paths'] = df_plot.apply(lambda row: os.path.join(
+        "..", "..", "data", run_name, row['dataset'], row['class_str'], row['image_name']), axis=1)
+    df_plot["class"] = pd.Categorical(df_plot["class_str"]).codes
+    df_plot = pd.concat([df_plot, pd.DataFrame(reduced_data, columns=colnames)], axis=1)
+
+    helpers.bokeh_plot(df_plot, out_path=plot_path, **plot_config)
+
+    # Return both tables (embeddings, plot data) only when return_results is set
+    if return_results:
+        return df_embeddings, df_plot
+
+
 
     
 def cli():
diff --git a/bioencoder/vis/helpers.py b/bioencoder/vis/helpers.py
@@ -220,7 +220,7 @@ def feature_map_normalization(f):
     act_map /= act_map.max()
     return act_map
 
-def embbedings_dimension_reductions(data_table, perplexity):
+def embbedings_dimension_reductions(data_table, perplexity, seed):
     """
     Perform dimension reduction on the input data.
 
@@ -235,14 +235,25 @@ def embbedings_dimension_reductions(data_table, perplexity):
     mean = np.mean(data_table, axis=0)
     std = np.std(data_table, axis=0)
     norm_data = (data_table - mean) / std
+    
+    ## PCA 
     pca_obj = decomposition.PCA(n_components=2)
     pca = pca_obj.fit_transform(norm_data)
-    tsne = manifold.TSNE(perplexity=perplexity, learning_rate='auto', init='pca').fit_transform(norm_data)
+
+    ## tSNE
+    tsne = manifold.TSNE(
+        perplexity=perplexity,
+        random_state=seed,
+        learning_rate='auto',
+        method="exact",
+        init=pca
+    ).fit_transform(norm_data)
+    
     names = ['PC1', 'PC2', 'tSNE-0', 'tSNE-1']
     return np.hstack((pca, tsne)), names, pca_obj
 
 
-def bokeh_plot(df, out_path='plot.html', color_map="jet1", color_classes=None, plot_style=1, 
+def bokeh_plot(df, out_path='plot.html', color_map="jet", color_classes=None, plot_style=1, 
                point_size=10, **kwargs):
     """
     Plot a scatter plot of the PCA and t-SNE dimensions of the data using bokeh.
@@ -264,7 +275,7 @@ class labels of the images).
         raise ValueError("The dataframe must have columns 'paths' and 'class'")      
    
     unique_classes = df['class'].unique()
-    unique_datasets = df['dataset'].unique()
+    unique_datasets = df['dataset'].astype(str).unique()
     markers = ['circle', 'square']  # Define markers for each group
 
     ## Color management
@@ -273,10 +284,9 @@ class labels of the images).
             f"Number of classes is {len(unique_classes)}, but only {len(color_classes)} colors provided."
         )
 
-        # Convert dict to DataFrame and merge colors
-        df_col = pd.DataFrame.from_dict(color_classes.items())
-        df_col.columns = ["class_str", "color"]
-        df = df.merge(df_col, how="left", left_on="class", right_on="class_str").drop(columns=["class_str"])
+        # Convert dict to DataFrame and merge colors by class_str (deterministic, no row reordering)
+        df_col = pd.DataFrame(list(color_classes.items()), columns=["class_str", "color"])
+        df = df.merge(df_col, how="left", on="class_str")
 
     else:
         num_classes = len(unique_classes)
@@ -285,7 +295,6 @@ class labels of the images).
         colors_str = ['#%02x%02x%02x' % tuple(c[:3]) for c in colors_raw]
         df['color'] = colors_str
         
-        
     source = ColumnDataSource(df)
     bplot.output_file(out_path)
     
@@ -335,19 +344,22 @@ class labels of the images).
     pca = bplot.figure(tools=tools0, title="PCA", match_aspect=True)
     tsne = bplot.figure(tools=tools1, title="t-SNE", match_aspect=True)
     
-    # Store renderers for dataset legend
-    legend_items_dataset = []
-    
-    # Scatter plots with different markers for datasets
-    for dataset, marker in zip(unique_datasets, markers):
-        dataset_source = ColumnDataSource(df[df['dataset'].astype(str) == dataset])  # Filter dataset-specific data
-        r = pca.scatter('PC1', 'PC2', source=dataset_source, color='color', size=point_size, marker=marker)
-        tsne.scatter('tSNE-0', 'tSNE-1', source=dataset_source, color='color', size=point_size, marker=marker)
-        legend_items_dataset.append(LegendItem(label=str(dataset), renderers=[r]))
-    
-    # Create and add horizontal legend for dataset markers
-    legend_dataset = Legend(items=legend_items_dataset, orientation="horizontal")
-    pca.add_layout(legend_dataset, 'below')
+    # Single source scatter with per-point markers mapped from dataset; no reordering
+    from itertools import cycle, islice
+    dataset_factors = list(pd.unique(df['dataset'].astype(str)))
+    marker_factors = list(islice(cycle(markers), len(dataset_factors)))
+    marker_map = factor_mark('dataset', marker_factors, dataset_factors)
+
+    pca.scatter('PC1', 'PC2', source=source, color='color', size=point_size, marker=marker_map, legend_field='dataset')
+    tsne.scatter('tSNE-0', 'tSNE-1', source=source, color='color', size=point_size, marker=marker_map)
+
+    # Single legend below the PCA plot
+    if getattr(pca, 'legend', None) and len(pca.legend) > 0:
+        pca.legend[0].orientation = "horizontal"
+        pca.legend[0].location = "left"
+        pca.add_layout(pca.legend[0], 'below')
+    if getattr(tsne, 'legend', None):
+        tsne.legend.visible = False
     
     # Display plots
     p = bplot.gridplot([[pca, tsne]])
diff --git a/bioencoder_configs/plot_stage1.yml b/bioencoder_configs/plot_stage1.yml
@@ -18,5 +18,7 @@ color_map: 'Set1' # Default color map; see https://matplotlib.org/stable/users/e
 #color_classes: # overrides color_map
   #class1: "#FFD467"
   #class2: "#4DC9F2"
-  
+
+return_results: False
+