|
| 1 | +import difflib |
1 | 2 | import io |
2 | 3 | import json |
| 4 | +import re |
3 | 5 | from itertools import cycle |
4 | 6 | from typing import ( |
5 | 7 | Any, |
@@ -705,6 +707,17 @@ def clear_extra_metadata(self): |
705 | 707 | @doc( |
706 | 708 | summary="Access sample metadata for one or more sample sets.", |
707 | 709 | returns="A dataframe of sample metadata, one row per sample.", |
| 710 | + notes=""" |
| 711 | + Some samples in the dataset are lab crosses — mosquitoes bred in |
| 712 | + the laboratory that have no real collection date. These samples |
| 713 | + use ``year=-1`` and ``month=-1`` as sentinel values. They may |
| 714 | + cause unexpected results in date-based analyses (e.g., |
| 715 | + ``pd.to_datetime`` will fail on negative year values). |
| 716 | +
|
| 717 | + To exclude lab cross samples, use:: |
| 718 | +
|
| 719 | + df = api.sample_metadata(sample_query="year >= 0") |
| 720 | + """, |
708 | 721 | ) |
709 | 722 | def sample_metadata( |
710 | 723 | self, |
@@ -784,12 +797,65 @@ def sample_metadata( |
784 | 797 | if prepared_sample_query is not None: |
785 | 798 | # Assume a pandas query string. |
786 | 799 | sample_query_options = sample_query_options or {} |
| 800 | + |
| 801 | + # Save a reference to the pre-query DataFrame so we can detect |
| 802 | + # zero-result queries and provide a helpful warning. |
| 803 | + df_before_query = df_samples |
| 804 | + |
787 | 805 | # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean. |
788 | 806 | df_samples = df_samples.query( |
789 | 807 | prepared_sample_query, **sample_query_options, engine="python" |
790 | 808 | ) |
791 | 809 | df_samples = df_samples.reset_index(drop=True) |
792 | 810 |
|
| 811 | + # Warn if query returned zero results on a non-empty dataset. |
| 812 | + # Provide fuzzy-match suggestions so users can spot typos, |
| 813 | + # case mismatches, or partial-value issues. |
| 814 | + if len(df_samples) == 0 and len(df_before_query) > 0: |
| 815 | + hint_lines = [ |
| 816 | + f"sample_metadata() returned 0 samples for query: {prepared_sample_query!r}.", |
| 817 | + ] |
| 818 | + |
| 819 | + # Extract column == 'value' pairs from the query. |
| 820 | + col_val_pairs = re.findall( |
| 821 | + r"\b(\w+)\s*==\s*['\"]([^'\"]+)['\"]", |
| 822 | + prepared_sample_query, |
| 823 | + ) |
| 824 | + |
| 825 | + for col_name, queried_val in col_val_pairs: |
| 826 | + # If the column name is not recognised, suggest |
| 827 | + # close column names. |
| 828 | + if col_name not in df_before_query.columns: |
| 829 | + close_cols = difflib.get_close_matches( |
| 830 | + col_name, |
| 831 | + df_before_query.columns.tolist(), |
| 832 | + n=3, |
| 833 | + cutoff=0.6, |
| 834 | + ) |
| 835 | + if close_cols: |
| 836 | + hint_lines.append( |
| 837 | + f"Column {col_name!r} not found. " |
| 838 | + f"Did you mean: {close_cols}?" |
| 839 | + ) |
| 840 | + continue |
| 841 | + |
| 842 | + # For object-dtype columns (typically strings), suggest close values; columns of any other dtype are skipped. |
| 843 | + if df_before_query[col_name].dtype == object: |
| 844 | + valid_vals = ( |
| 845 | + df_before_query[col_name].dropna().unique().tolist() |
| 846 | + ) |
| 847 | + close_vals = difflib.get_close_matches( |
| 848 | + queried_val, valid_vals, n=5, cutoff=0.6 |
| 849 | + ) |
| 850 | + if close_vals: |
| 851 | + hint_lines.append( |
| 852 | + f"Value {queried_val!r} not found in " |
| 853 | + f"column {col_name!r}. " |
| 854 | + f"Did you mean: {close_vals}?" |
| 855 | + ) |
| 856 | + |
| 857 | + warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2) |
| 858 | + |
793 | 859 | # Apply the sample_indices, if there are any. |
794 | 860 | # Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`. |
795 | 861 | if sample_indices is not None: |
|
0 commit comments