|
| 1 | +import difflib |
1 | 2 | import io |
2 | 3 | import json |
| 4 | +import re |
3 | 5 | from itertools import cycle |
4 | 6 | from typing import ( |
5 | 7 | Any, |
@@ -784,12 +786,65 @@ def sample_metadata( |
784 | 786 | if prepared_sample_query is not None: |
785 | 787 | # Assume a pandas query string. |
786 | 788 | sample_query_options = sample_query_options or {} |
| 789 | + |
| 790 | + # Save a reference to the pre-query DataFrame so we can detect |
| 791 | + # zero-result queries and provide a helpful warning. |
| 792 | + df_before_query = df_samples |
| 793 | + |
787 | 794 | # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean. |
788 | 795 | df_samples = df_samples.query( |
789 | 796 | prepared_sample_query, **sample_query_options, engine="python" |
790 | 797 | ) |
791 | 798 | df_samples = df_samples.reset_index(drop=True) |
792 | 799 |
|
| 800 | + # Warn if query returned zero results on a non-empty dataset. |
| 801 | + # Provide fuzzy-match suggestions so users can spot typos, |
| 802 | + # case mismatches, or partial-value issues. |
| 803 | + if len(df_samples) == 0 and len(df_before_query) > 0: |
| 804 | + hint_lines = [ |
| 805 | + f"sample_metadata() returned 0 samples for query: {prepared_sample_query!r}.", |
| 806 | + ] |
| 807 | + |
| 808 | + # Extract column == 'value' pairs from the query. |
| 809 | + col_val_pairs = re.findall( |
| 810 | + r"\b(\w+)\s*==\s*['\"]([^'\"]+)['\"]", |
| 811 | + prepared_sample_query, |
| 812 | + ) |
| 813 | + |
| 814 | + for col_name, queried_val in col_val_pairs: |
| 815 | + # If the column name is not recognised, suggest |
| 816 | + # close column names. |
| 817 | + if col_name not in df_before_query.columns: |
| 818 | + close_cols = difflib.get_close_matches( |
| 819 | + col_name, |
| 820 | + df_before_query.columns.tolist(), |
| 821 | + n=3, |
| 822 | + cutoff=0.6, |
| 823 | + ) |
| 824 | + if close_cols: |
| 825 | + hint_lines.append( |
| 826 | + f"Column {col_name!r} not found. " |
| 827 | + f"Did you mean: {close_cols}?" |
| 828 | + ) |
| 829 | + continue |
| 830 | + |
| 831 | + # For string columns, suggest close values. |
| 832 | + if df_before_query[col_name].dtype == object: |
| 833 | + valid_vals = ( |
| 834 | + df_before_query[col_name].dropna().unique().tolist() |
| 835 | + ) |
| 836 | + close_vals = difflib.get_close_matches( |
| 837 | + queried_val, valid_vals, n=5, cutoff=0.6 |
| 838 | + ) |
| 839 | + if close_vals: |
| 840 | + hint_lines.append( |
| 841 | + f"Value {queried_val!r} not found in " |
| 842 | + f"column {col_name!r}. " |
| 843 | + f"Did you mean: {close_vals}?" |
| 844 | + ) |
| 845 | + |
| 846 | + warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2) |
| 847 | + |
793 | 848 | # Apply the sample_indices, if there are any. |
794 | 849 | # Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`. |
795 | 850 | if sample_indices is not None: |
|
0 commit comments