|
2 | 2 |
|
3 | 3 | import pandas as pd |
4 | 4 | import numpy as np |
| 5 | +from pathlib import Path |
5 | 6 | from sklearn.model_selection import train_test_split |
6 | 7 | from sklearn.preprocessing import StandardScaler |
7 | 8 |
|
# Directory containing this module; the default dataset path is resolved
# against it so the loader does not depend on the current working directory.
BASE_DIR = Path(__file__).resolve().parent


def load_and_preprocess(
    csv_path=BASE_DIR / "dataset" / "kaggle_diabetes.csv",
    test_size=0.2,
    random_state=0
):
    """Load the diabetes CSV, clean it, split it and standardise the features.

    Steps, in order:
      1. Read the CSV and rename ``DiabetesPedigreeFunction`` to ``DPF``.
      2. Treat literal zeros in the measurement columns as missing values.
      3. Split into train/test partitions.
      4. Impute missing values using statistics computed on the training
         partition only (mean for Glucose/BloodPressure, median for
         SkinThickness/Insulin/BMI).
      5. Fit a ``StandardScaler`` on the training partition and apply it to
         both partitions.

    Args:
        csv_path: Path (str or ``pathlib.Path``) of the CSV file to read.
            The file must contain an ``Outcome`` column and the five
            measurement columns listed above.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        A 5-tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``
        where the first two items are the outputs of the scaler, the next two
        are the matching slices of the ``Outcome`` column, and ``scaler`` is
        the fitted ``StandardScaler``.
    """
    # Load dataset
    df = pd.read_csv(csv_path)

    # Rename column (IMPORTANT: the renamed label is part of the feature
    # columns handed to the scaler, so it should be kept stable).
    df = df.rename(columns={'DiabetesPedigreeFunction': 'DPF'})

    # Column -> name of the pandas Series method used to compute its fill
    # value. A zero in any of these columns is treated as "not recorded".
    impute_strategy = {
        'Glucose': 'mean',
        'BloodPressure': 'mean',
        'SkinThickness': 'median',
        'Insulin': 'median',
        'BMI': 'median',
    }
    cols_with_zero = list(impute_strategy)
    df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

    # Features & target
    X = df.drop(columns='Outcome')
    y = df['Outcome']

    # Split BEFORE imputing. Row count and random_state are unchanged, so the
    # partition membership is the same as when the split happened afterwards.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Compute fill values from the training rows only, so that no information
    # from the held-out rows leaks into the training features; then apply the
    # same values to both partitions. fillna() returns new frames, which also
    # avoids assigning into slices of the original DataFrame.
    fill_values = {
        column: getattr(X_train[column], statistic)()
        for column, statistic in impute_strategy.items()
    }
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    # Scaling (fitted on the training partition only, like the imputation)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
0 commit comments