|
1 | 1 | # For general helpers, regexes, or shared logic (e.g. phone/address parsing functions). |
2 | | -import copy |
3 | 2 | import logging |
4 | 3 | import os |
5 | 4 | import polars |
@@ -96,18 +95,11 @@ def convert_to_dataframe(d: dict) -> polars.DataFrame: |
96 | 95 | """internal dict to dataframe""" |
97 | 96 | flatdata = [_flatdict(f) for f in d.values()] |
98 | 97 | """ |
99 | | - Field names should find the _longest_ set of keys, not just the first one |
100 | | - to avoid dropping data by accident from some rows (with things like additional inspections) |
| 98 | + Ideally we'd look for the longest row to use as our schema, |
| 99 | + but dataframes are picky about services missing those extra keys, |
| 100 | + so for simpler logic, we'll just use the first row |
101 | 101 | """ |
102 | | - longest: list = list(flatdata[0].keys()) |
103 | | - longest_len: int = len(longest) |
104 | | - for dobj in flatdata[1:]: |
105 | | - keys = list(dobj.keys()) |
106 | | - if len(keys) > longest_len: |
107 | | - longest = copy.deepcopy(keys) |
108 | | - longest_len = len(longest) |
109 | | - logger.info("Key list is: %s", longest) |
110 | | - fieldnames = [k for k in longest if k not in flatdata_filtered_keys] |
| 102 | + fieldnames = [k for k in flatdata[0].keys() if k not in flatdata_filtered_keys] |
111 | 103 | # https://docs.pola.rs/api/python/stable/reference/api/polars.from_dicts.html |
112 | 104 | df = polars.from_dicts(flatdata, schema=fieldnames) |
113 | 105 | # logger.debug("Dataframe: %s", df) |
|
0 commit comments