-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeature_engineering.py
More file actions
35 lines (26 loc) · 1.11 KB
/
feature_engineering.py
File metadata and controls
35 lines (26 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
import numpy as np
def add_features(df: pd.DataFrame, window_size='7D') -> pd.DataFrame:
    """Return a copy of *df* with derived streamflow feature columns.

    The input frame is not modified. The result is sorted by ``date_time``
    and carries these additional columns:

    * ``log_streamflow`` -- ``log1p`` of ``streamflow_cfs``.
    * ``streamflow_p10`` / ``streamflow_p50`` / ``streamflow_p90`` --
      rolling 10th / 50th / 90th percentiles of ``streamflow_cfs``.
    * ``streamflow_diff`` -- difference from the previous row.
    * ``streamflow_pct_change`` -- fractional change from the previous row.

    Args:
        df: Frame with a ``date_time`` column (converted with
            ``pd.to_datetime``) and a ``streamflow_cfs`` column (converted
            with ``pd.to_numeric``; unparseable values become NaN).
        window_size: Rolling window forwarded to ``Series.rolling(window=...)``.
            Defaults to ``'7D'``.

    Returns:
        A new DataFrame with ``date_time`` restored as a regular column.
        Every +/-inf and NaN in the frame is replaced by 0, so no rows are
        dropped.
    """
    df = df.copy()

    # Ensure correct dtypes; invalid flow readings are coerced to NaN.
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['streamflow_cfs'] = pd.to_numeric(df['streamflow_cfs'], errors='coerce')

    # Log transform for better scaling.
    df['log_streamflow'] = np.log1p(df['streamflow_cfs'])

    # Index by timestamp and sort before computing the rolling statistics.
    df = df.set_index('date_time').sort_index()
    flow = df['streamflow_cfs']

    # Rolling percentiles: build the window once and reuse it per quantile.
    rolling_flow = flow.rolling(window=window_size)
    percentile_columns = (
        ('streamflow_p10', 0.10),
        ('streamflow_p50', 0.50),
        ('streamflow_p90', 0.90),
    )
    for column, quantile in percentile_columns:
        df[column] = rolling_flow.quantile(quantile)

    # Temporal gradients.
    df['streamflow_diff'] = flow.diff()
    df['streamflow_pct_change'] = flow.pct_change()

    # Map +/-inf to NaN, then zero-fill every NaN. After this step no missing
    # values remain, so a subsequent dropna() would never remove a row.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)

    return df.reset_index()