
Commit b35bd24

BSchilperoort and Yang authored
Add checks for missing data (#71)
* Add LAI to global data selection, tests
* Correct test data generation
* Add instructions, download script & parsing nb
* Move global data operations to separate folder
* Modify nearest_non_nan to check max_distance
* Fix typing issue
* Start split of global data, add verification
* Finish refactoring global data
* Fix bugs
* Add checks for tile existance for DEM and h_canopy
* Compress txt assets
* Improve err msgs. Add tests for raises
* Fix linter issues
* Apply suggestions from code review

Co-authored-by: Yang <y.liu@esciencecenter.nl>
1 parent 69c0ad1 commit b35bd24

15 files changed

Lines changed: 1210 additions & 339 deletions

PyStemmusScope/forcing_io.py

Lines changed: 13 additions & 68 deletions
@@ -1,14 +1,12 @@
 """Module for forcing data input and output operations."""
 from pathlib import Path
-from typing import Any
 from typing import Dict
 import hdf5storage
 import numpy as np
-import pandas as pd
 import xarray as xr
-from . import global_data_selection as gds
-from . import utils
-from . import variable_conversion as vc
+from PyStemmusScope import global_data
+from PyStemmusScope import utils
+from PyStemmusScope import variable_conversion as vc


 def _write_matlab_ascii(fname, data, ncols):
@@ -131,67 +129,12 @@ def read_forcing_data_global( # noqa:PLR0913 (too many arguments)
     Returns:
         Dictionary containing the forcing data.
     """
-    files_cams = list((global_data_dir / "co2").glob("*.nc"))
-    file_canopy_height = (
-        global_data_dir / "canopy_height" / gds.get_filename_canopy_height(lat, lon)
+    return global_data.collect_datasets(
+        global_data_dir=global_data_dir,
+        latlon=(lat, lon),
+        time_range=(start_time, end_time),
+        timestep=timestep,
     )
-    file_dem = global_data_dir / "dem" / gds.get_filename_dem(lat, lon)
-    files_era5 = list((global_data_dir / "era5").glob("*.nc"))
-    files_era5land = list((global_data_dir / "era5-land").glob("*.nc"))
-    files_lai = list((global_data_dir / "lai").glob("*.nc"))
-
-    data: Dict[str, Any] = {
-        "time": xr.DataArray(
-            pd.date_range(str(start_time), str(end_time), freq=timestep).rename("time")
-        )
-    }
-    era5_data = gds.extract_era5_data(
-        files_era5, files_era5land, lat, lon, start_time, end_time, timestep
-    )
-    data = {**data, **era5_data}
-
-    data["co2_conv"] = (
-        vc.co2_mass_fraction_to_kg_per_m3(
-            gds.extract_cams_data(files_cams, lat, lon, start_time, end_time, timestep)
-        )
-        * 1e6
-    )  # kg/m3 -> mg/m3
-
-    data["lai"] = vc.mask_data(
-        gds.extract_lai_data(files_lai, lat, lon, start_time, end_time, timestep),
-        min_value=0.01,
-    )
-
-    data["elevation"] = gds.extract_prism_dem_data(file_dem, lat, lon)
-
-    data["canopy_height"] = gds.extract_canopy_height_data(file_canopy_height, lat, lon)
-    # Height of measurement. Data has no actual equivalent. Set to a temp. value. see
-    # issue #145.
-    data["reference_height"] = 10.0
-
-    data["sitename"] = "global"
-
-    # Expected time format is days (as floating point) since Jan 1st 00:00.
-    data["doy_float"] = (
-        data["time"].dt.dayofyear
-        - 1
-        + data["time"].dt.hour / 24
-        + data["time"].dt.minute / 60 / 24
-    )
-    data["year"] = data["time"].dt.year.astype(float)
-
-    data["DELT"] = (
-        (data["time"].values[1] - data["time"].values[0]) / np.timedelta64(1, "s")
-    ).astype(float)
-    data["total_timesteps"] = data["time"].size
-
-    data["latitude"] = lat
-    data["longitude"] = lon
-
-    # TODO: Add land cover data retrieval.
-    data["IGBP_veg_long"] = "Evergreen Needleleaf Forests"
-
-    return data


 def write_dat_files(data: dict, input_dir: Path):
@@ -305,7 +248,9 @@ def prepare_forcing(config: dict) -> None:
     if fmt == "site":
         forcing_file = utils.get_forcing_file(config)
         data = read_forcing_data_plumber2(
-            forcing_file, config["StartTime"], config["EndTime"]
+            forcing_file=forcing_file,
+            start_time=config["StartTime"],
+            end_time=config["EndTime"],
         )

     elif fmt == "latlon":
@@ -319,8 +264,8 @@ def prepare_forcing(config: dict) -> None:
             global_data_dir=Path(config["ForcingPath"]),
             lat=loc[0],  # type: ignore
             lon=loc[1],  # type: ignore
-            start_time=config["StartTime"],
-            end_time=config["EndTime"],
+            start_time=np.datetime64(config["StartTime"]),
+            end_time=np.datetime64(config["EndTime"]),
         )
     else:
         raise NotImplementedError
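With this change, read_forcing_data_global becomes a thin wrapper around the new global_data.collect_datasets entry point, and prepare_forcing converts the configured start and end times to np.datetime64 before passing them on. Below is a minimal sketch of calling the new entry point directly; the data directory, coordinates, and dates are placeholders, and the directory is assumed to contain the co2/, lai/, dem/, era5/ (etc.) subfolders that the retrieval modules look for.

from pathlib import Path

import numpy as np
from PyStemmusScope import global_data

# Placeholder inputs for illustration only.
forcing = global_data.collect_datasets(
    global_data_dir=Path("/data/global"),  # must contain co2/, lai/, dem/, era5/, ...
    latlon=(52.0, 4.9),                    # (latitude, longitude) of the site
    time_range=(
        np.datetime64("2014-07-01T00:00"),
        np.datetime64("2014-07-15T00:00"),
    ),
    timestep="1800S",                      # pandas-timedelta compatible string
)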
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+"""Module for operations related to the 'global' datasets."""
+from PyStemmusScope.global_data import cams_co2
+from PyStemmusScope.global_data import copernicus_lai
+from PyStemmusScope.global_data import era5
+from PyStemmusScope.global_data import eth_canopy_height
+from PyStemmusScope.global_data import prism_dem
+from PyStemmusScope.global_data import utils
+from PyStemmusScope.global_data.global_data_selection import collect_datasets
+
+
+__all__ = [
+    "collect_datasets",
+    "utils",
+    "era5",
+    "eth_canopy_height",
+    "prism_dem",
+    "cams_co2",
+    "copernicus_lai",
+]
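Assuming the package is installed, the names listed in __all__ above are reachable directly from the global_data namespace; a small sketch:

from PyStemmusScope import global_data

# Aggregate entry point used by forcing_io.read_forcing_data_global:
print(global_data.collect_datasets)

# Per-dataset modules and the shared validation helpers are re-exported too:
print(global_data.cams_co2)
print(global_data.copernicus_lai)
print(global_data.utils.MissingDataError)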
Binary file not shown.
Binary file not shown.
Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
+"""Module for loading and validating the CAMS CO2 dataset."""
+from pathlib import Path
+from typing import List
+from typing import Tuple
+from typing import Union
+import numpy as np
+import xarray as xr
+from PyStemmusScope.global_data import utils
+
+
+RESOLUTION_CAMS = 0.75  # Resolution of the dataset in degrees
+
+
+def retrieve_co2_data(
+    global_data_dir: Path,
+    latlon: Union[Tuple[int, int], Tuple[float, float]],
+    time_range: Tuple[np.datetime64, np.datetime64],
+    timestep: str,
+) -> np.ndarray:
+    """Check for availability and retrieve the CAMS CO2 data.
+
+    Args:
+        global_data_dir: Path to the directory containing the global datasets.
+        latlon: Latitude and longitude of the site.
+        time_range: Start and end time of the model run.
+        timestep: Desired timestep of the model, this is derived from the forcing data.
+            In a pandas-timedelta compatible format. For example: "1800S"
+
+    Returns:
+        DataArray containing the CO2 at the specified site for the given time range.
+    """
+    files = list((global_data_dir / "co2").glob("*.nc"))
+
+    if len(files) == 0:
+        raise FileNotFoundError(
+            f"No netCDF files found in the folder '{global_data_dir / 'co2'}'"
+        )
+
+    return extract_cams_data(
+        files_cams=files,
+        latlon=latlon,
+        time_range=time_range,
+        timestep=timestep,
+    )
+
+
+def extract_cams_data(
+    files_cams: List[Path],
+    latlon: Union[Tuple[int, int], Tuple[float, float]],
+    time_range: Tuple[np.datetime64, np.datetime64],
+    timestep: str,
+) -> np.ndarray:
+    """Extract and convert the required variables from the CAMS CO2 dataset.
+
+    Args:
+        files_cams: List of CAMS files.
+        latlon: Latitude and longitude of the site.
+        time_range: Start and end time of the model run.
+        timestep: Desired timestep of the model, this is derived from the forcing data.
+            In a pandas-timedelta compatible format. For example: "1800S"
+
+    Returns:
+        DataArray containing the CO2 concentration.
+    """
+    ds = xr.open_mfdataset(files_cams)
+
+    check_cams_dataset(cams_data=ds, latlon=latlon, time_range=time_range)
+
+    try:
+        ds = ds.sel(
+            latitude=latlon[0],
+            longitude=latlon[1],
+            method="nearest",
+            tolerance=RESOLUTION_CAMS,
+        )
+    except KeyError as err:
+        if "not all values found" in str(err):
+            raise utils.MissingDataError(
+                f"\nNo data point was found within {RESOLUTION_CAMS} degrees"
+                f"\nof the specified location {latlon}."
+                f"\nPlease check the netCDF files or select a different location"
+            ) from err
+        else:
+            raise err
+
+    ds = ds.drop_vars(["latitude", "longitude"])
+    ds = ds.compute()
+    ds = ds.resample(time=timestep).interpolate("linear")
+    ds = ds.sel(time=slice(time_range[0], time_range[1]))
+
+    return ds["co2"].values
+
+
+def check_cams_dataset(
+    cams_data: xr.Dataset,
+    latlon: Union[Tuple[int, int], Tuple[float, float]],
+    time_range: Tuple[np.datetime64, np.datetime64],
+) -> None:
+    """Validate the CAMS CO2 dataset (variable name, location & time range).
+
+    Args:
+        cams_data: Dataset containing the CAMS CO2 data.
+        latlon: Latitude and longitude of the site.
+        time_range: Start and end time of the model run.
+    """
+    try:
+        utils.assert_variables_present(cams_data, ["co2"])
+    except utils.MissingDataError as err:
+        raise utils.MissingDataError(
+            "\nCould not find the variable 'co2' in the CAMS dataset."
+            "\nPlease check the netCDF files or the path."
+        ) from err
+
+    try:
+        utils.assert_location_within_bounds(
+            cams_data, x=latlon[1], y=latlon[0], xdim="longitude", ydim="latitude"
+        )
+    except utils.MissingDataError as err:
+        raise utils.MissingDataError(
+            "\nThe CO2 data does not cover the given location."
+            "\nPlease check the CAMS CO2 netCDF files, or select"
+            "\na different location."
+        ) from err
+
+    try:
+        utils.assert_time_within_bounds(cams_data, time_range[0], time_range[1])
+    except utils.MissingDataError as err:
+        raise utils.MissingDataError(
+            "\nThe CO2 data does not cover the given start and end time. "
+            "\nPlease check the CAMS CO2 netCDF files, or modify the model"
+            "\nstart and end time."
+        ) from err
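retrieve_co2_data raises FileNotFoundError when the co2/ folder contains no netCDF files, and check_cams_dataset turns a missing variable, location, or time range into utils.MissingDataError with an actionable message. A hedged sketch of a call site that surfaces both error types; the path, coordinates, and dates are placeholders:

from pathlib import Path

import numpy as np
from PyStemmusScope.global_data import cams_co2, utils

try:
    co2 = cams_co2.retrieve_co2_data(
        global_data_dir=Path("/data/global"),  # placeholder path
        latlon=(52.0, 4.9),                    # placeholder site
        time_range=(np.datetime64("2014-07-01"), np.datetime64("2014-07-08")),
        timestep="1800S",
    )
except FileNotFoundError as err:
    # No *.nc files were found under <global_data_dir>/co2/.
    print(err)
except utils.MissingDataError as err:
    # Variable, location, or time range not covered by the CAMS files.
    print(err)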
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+"""Module for loading and validating the Copernicus LAI dataset."""
+from pathlib import Path
+from typing import List
+from typing import Tuple
+from typing import Union
+import numpy as np
+import xarray as xr
+from PyStemmusScope.global_data import utils
+
+
+RESOLUTION_LAI = 1 / 112  # Resolution of the LAI dataset in degrees
+
+
+def retrieve_lai_data(
+    global_data_dir: Path,
+    latlon: Union[Tuple[int, int], Tuple[float, float]],
+    time_range: Tuple[np.datetime64, np.datetime64],
+    timestep: str,
+) -> np.ndarray:
+    """Check for availability and retrieve the Copernicus LAI data.
+
+    Args:
+        global_data_dir: Path to the directory containing the global datasets.
+        latlon: Latitude and longitude of the site.
+        time_range: Start and end time of the model run.
+        timestep: Desired timestep of the model, this is derived from the forcing data.
+            In a pandas-timedelta compatible format. For example: "1800S"
+
+    Returns:
+        DataArray containing the LAI of the specified site for the given time range.
+    """
+    files = list((global_data_dir / "lai").glob("*.nc"))
+
+    if len(files) == 0:
+        raise FileNotFoundError(
+            f"No netCDF files found in the folder '{global_data_dir / 'lai'}'"
+        )
+
+    return extract_lai_data(
+        files_lai=files,
+        latlon=latlon,
+        time_range=time_range,
+        timestep=timestep,
+    )
+
+
+def extract_lai_data(
+    files_lai: List[Path],
+    latlon: Union[Tuple[int, int], Tuple[float, float]],
+    time_range: Tuple[np.datetime64, np.datetime64],
+    timestep: str,
+) -> np.ndarray:
+    """Generate LAI values, until a dataset is chosen.
+
+    Args:
+        files_lai: List of paths to the *.nc files.
+        latlon: Latitude and longitude of the site.
+        time_range: Start and end time of the model run.
+        timestep: Desired timestep of the model, this is derived from the forcing data.
+            In a pandas-timedelta compatible format. For example: "1800S"
+
+    Returns:
+        DataArray containing the LAI of the specified site for the given time range.
+    """
+    ds = xr.open_mfdataset(files_lai)
+
+    check_lai_dataset(ds, latlon, time_range)
+
+    ds = ds.drop_vars(["crs", "LAI_ERR", "retrieval_flag"])
+
+    try:
+        ds = ds.sel(
+            lat=latlon[0],
+            lon=latlon[1],
+            method="nearest",
+            tolerance=RESOLUTION_LAI,
+        )
+    except KeyError as err:
+        if "not all values found" in str(err):
+            raise utils.MissingDataError(
+                f"\nNo data point was found within {RESOLUTION_LAI} degrees"
+                f"\nof the specified location {latlon}."
+                f"\nPlease check the netCDF files or select a different location"
+            ) from err
+        else:
+            raise err
+
+    ds = ds.drop_vars(["lat", "lon"])
+    ds = ds.compute()  # Load into memory before resampling
+    ds = ds.resample(time=timestep).interpolate("linear")
+    ds = ds.sel(time=slice(time_range[0], time_range[1]))
+
+    return ds["LAI"].values
+
+
+def check_lai_dataset(
+    lai_data: xr.Dataset,
+    latlon: Union[Tuple[int, int], Tuple[float, float]],
+    time_range: Tuple[np.datetime64, np.datetime64],
+) -> None:
+    """Validate the LAI dataset (variables name, location & time range).
+
+    Args:
+        lai_data: Dataset containing the LAI data.
+        latlon: Latitude and longitude of the site.
+        time_range: Start and end time of the model run.
+    """
+    try:
+        utils.assert_variables_present(lai_data, ["LAI"])
+    except utils.MissingDataError as err:
+        raise utils.MissingDataError(
+            "\nCould not find the variable 'LAI' in the LAI dataset. "
+            "\nPlease check the netCDF files or the path."
+        ) from err
+
+    try:
+        utils.assert_location_within_bounds(
+            lai_data, x=latlon[1], y=latlon[0], xdim="lon", ydim="lat"
+        )
+    except utils.MissingDataError as err:
+        raise utils.MissingDataError(
+            "\nThe LAI data does not cover the given location."
+            "\nPlease check the LAI netCDF files, or select a different location."
+        ) from err
+
+    try:
+        utils.assert_time_within_bounds(lai_data, time_range[0], time_range[1])
+    except utils.MissingDataError as err:
+        raise utils.MissingDataError(
+            "\nThe LAI data does not cover the given start and end time. "
+            "\nPlease check the LAI netCDF files, or modify the model"
+            "\nstart and end time."
+        ) from err
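Both extract_cams_data and extract_lai_data end with the same pattern: load the selected point into memory, resample to the forcing timestep with linear interpolation, then slice to the run period. A self-contained xarray sketch of that pattern on synthetic data (the values, dates, and variable name are made up):

import numpy as np
import pandas as pd
import xarray as xr

# Synthetic 10-daily series standing in for the coarse LAI/CO2 time axis.
times = pd.date_range("2014-07-01", "2014-08-10", freq="10D")
ds = xr.Dataset(
    {"LAI": ("time", np.linspace(1.0, 3.0, times.size))},
    coords={"time": times},
)

ds = ds.compute()                                     # load before resampling
ds = ds.resample(time="1800S").interpolate("linear")  # upsample to the forcing timestep
ds = ds.sel(time=slice(np.datetime64("2014-07-05"), np.datetime64("2014-07-10")))

print(ds["LAI"].values[:5])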
