Skip to content

Commit 15e3398

Browse files
committed
Phase C: Fix data leakage using sklearn pipelines
1 parent 95338e0 commit 15e3398

4 files changed

Lines changed: 52 additions & 22 deletions

File tree

Diabetes Prediction [END 2 END]/diabetes_pipeline/data_preprocessing.py

Lines changed: 4 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -4,44 +4,26 @@
44
import numpy as np
55
from pathlib import Path
66
from sklearn.model_selection import train_test_split
7-
from sklearn.preprocessing import StandardScaler
87

9-
BASE_DIR = Path(__file__).resolve().parent
10-
11-
def load_and_preprocess(
12-
csv_path=BASE_DIR / "dataset" / "kaggle_diabetes.csv",
13-
test_size=0.2,
14-
random_state=0
15-
):
16-
# Load dataset
8+
def load_and_preprocess(test_size=0.2, random_state=0, csv_path=None):
    """Load the diabetes CSV, clean it, and return a leakage-free train/test split.

    Zeros in the physiological measurement columns are treated as missing
    values. The missing values are imputed AFTER the split, using statistics
    computed on the training rows only, so no information from the test set
    influences the training features.

    Args:
        test_size: Fraction (or absolute number) of rows held out for testing;
            forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` for a
            reproducible split.
        csv_path: Optional path to the dataset. Defaults to
            ``dataset/kaggle_diabetes.csv`` next to this module.

    Returns:
        list: ``[X_train, X_test, y_train, y_test]``. Features are unscaled;
        scaling is expected to happen inside a downstream pipeline.
    """
    if csv_path is None:
        base_dir = Path(__file__).resolve().parent
        csv_path = base_dir / "dataset" / "kaggle_diabetes.csv"
    df = pd.read_csv(csv_path)

    # This rename defines the feature name seen by every downstream consumer.
    df = df.rename(columns={'DiabetesPedigreeFunction': 'DPF'})

    # A literal 0 in these columns is an invalid reading; mark it as missing.
    cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
    df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)

    X = df.drop(columns='Outcome')
    y = df['Outcome']

    # Split BEFORE imputing: the split depends only on row positions and the
    # seed, so the partition is identical to splitting after imputation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Imputation statistics come from the training rows only (mean for the
    # first two columns, median for the rest, as before) and are then applied
    # to both splits.
    fill_values = {
        'Glucose': X_train['Glucose'].mean(),
        'BloodPressure': X_train['BloodPressure'].mean(),
        'SkinThickness': X_train['SkinThickness'].median(),
        'Insulin': X_train['Insulin'].median(),
        'BMI': X_train['BMI'].median(),
    }
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    return [X_train, X_test, y_train, y_test]
41-
42-
# Scaling
43-
scaler = StandardScaler()
44-
X_train_scaled = scaler.fit_transform(X_train)
45-
X_test_scaled = scaler.transform(X_test)
46-
47-
return X_train_scaled, X_test_scaled, y_train, y_test, scaler

Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/__init__.py

Whitespace-only changes.

Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/experiment_runner.py

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
# diabetes_pipeline/experiments/experiment_runner.py
2+
3+
import pandas as pd
4+
from sklearn.pipeline import Pipeline
5+
from sklearn.preprocessing import StandardScaler
6+
from sklearn.linear_model import LogisticRegression
7+
from sklearn.tree import DecisionTreeClassifier
8+
from sklearn.ensemble import RandomForestClassifier
9+
from sklearn.svm import SVC
10+
from sklearn.metrics import accuracy_score, f1_score
11+
12+
from diabetes_pipeline.data_preprocessing import load_and_preprocess
13+
14+
# Local import: only the results path at the bottom of this script needs it.
from pathlib import Path

# Features arrive unscaled; scaling is done per model inside the Pipeline
# below so the scaler is fitted on the training split only.
X_train, X_test, y_train, y_test = load_and_preprocess()

# Candidate classifiers, keyed by the name written to the results table.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(random_state=0),
    "RandomForest": RandomForestClassifier(n_estimators=50, random_state=0),
    "SVM": SVC(),
}

results = []

for name, model in models.items():
    # A fresh scaler per model: fit() learns the scaling from X_train, and
    # predict() re-applies that same scaling to X_test.
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model),
    ])

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "F1 Score": f1_score(y_test, preds),
    })

df = pd.DataFrame(results)
print(df)

# Write next to this script rather than to a path relative to the current
# working directory, so the run works no matter where it is launched from.
# When launched from the repository root this is the same file as before.
results_path = Path(__file__).resolve().parent / "results.csv"
df.to_csv(results_path, index=False)

Diabetes Prediction [END 2 END]/diabetes_pipeline/experiments/results.csv

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
Model,Accuracy,F1 Score
2+
LogisticRegression,0.7875,0.6320346320346321
3+
DecisionTree,0.9875,0.980544747081712
4+
RandomForest,0.995,0.9921259842519685
5+
SVM,0.845,0.7327586206896551

0 commit comments

Comments
 (0)