Commit a36936c
Add files via upload
1 parent 0df4bb4

1 file changed: 373 additions, 0 deletions

import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc
)
from sklearn.model_selection import train_test_split

# ========================
# MLP Model Implementation
# ========================

class MLP:
    def __init__(self, layer_sizes, activations=None, seed=42, dropout_rates=None):
        np.random.seed(seed)
        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes) - 1

        # Hidden-layer activations default to ReLU. Note that forward() below
        # hard-codes ReLU for hidden layers, so this list is informational only.
        self.activations = ['relu'] * (self.num_layers - 1) if activations is None else activations
        self.dropout_rates = [0.0] * (self.num_layers - 1) if dropout_rates is None else dropout_rates

        # Glorot/Xavier uniform initialization for each weight matrix.
        self.params = {}
        for i in range(self.num_layers):
            input_size = layer_sizes[i]
            output_size = layer_sizes[i + 1]
            limit = np.sqrt(6.0 / (input_size + output_size))
            self.params['W' + str(i)] = np.random.uniform(-limit, limit, (input_size, output_size))
            self.params['b' + str(i)] = np.zeros(output_size)

        # Adam optimizer state: first/second moment estimates and step counter.
        self.m = {}
        self.v = {}
        for key in self.params:
            self.m[key] = np.zeros_like(self.params[key])
            self.v[key] = np.zeros_like(self.params[key])
        self.t = 0

    def relu(self, z):
        return np.maximum(0, z)

    def relu_derivative(self, z):
        return (z > 0).astype(float)

    def sigmoid(self, z):
        # Clip the logits to avoid overflow in np.exp for large |z|.
        return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))

    def forward(self, X, training=True):
        cache = {}
        cache['A0'] = X
        A = X
        for i in range(self.num_layers):
            W = self.params['W' + str(i)]
            b = self.params['b' + str(i)]
            Z = np.dot(A, W) + b
            cache['Z' + str(i)] = Z
            if i == self.num_layers - 1:
                # Sigmoid output for binary classification.
                A = self.sigmoid(Z)
            else:
                A = self.relu(Z)
                # Inverted dropout: scale surviving activations by 1/(1-p) at
                # train time so inference needs no rescaling.
                if training and self.dropout_rates[i] > 0:
                    mask = np.random.rand(*A.shape) > self.dropout_rates[i]
                    A = A * mask / (1.0 - self.dropout_rates[i])
                    cache['dropout_mask' + str(i)] = mask
            cache['A' + str(i + 1)] = A
        return A, cache

    def backward(self, y_pred, y_true, cache, class_weights=None):
        m = y_true.shape[0]
        grads = {}
        # For sigmoid + binary cross-entropy, the gradient w.r.t. the output
        # pre-activation simplifies to (y_pred - y_true); optionally weight
        # each sample by its class weight.
        if class_weights is not None:
            sample_weights = np.array([class_weights[int(y)] for y in y_true]).reshape(-1, 1)
            dA = (y_pred.reshape(-1, 1) - y_true.reshape(-1, 1)) * sample_weights / m
        else:
            dA = (y_pred.reshape(-1, 1) - y_true.reshape(-1, 1)) / m
        for i in range(self.num_layers - 1, -1, -1):
            A_prev = cache['A' + str(i)]
            grads['W' + str(i)] = np.dot(A_prev.T, dA)
            grads['b' + str(i)] = np.sum(dA, axis=0)
            if i > 0:
                W = self.params['W' + str(i)]
                dA = np.dot(dA, W.T)
                Z_prev = cache['Z' + str(i - 1)]
                dA = dA * self.relu_derivative(Z_prev)
                # Propagate the same dropout mask used in the forward pass.
                if 'dropout_mask' + str(i - 1) in cache:
                    mask = cache['dropout_mask' + str(i - 1)]
                    dA = dA * mask / (1.0 - self.dropout_rates[i - 1])
        return grads

    def update_params(self, grads, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.t += 1
        for key in self.params:
            # Adam: exponential moving averages of the gradient and its square,
            # with bias correction before the parameter step.
            self.m[key] = beta1 * self.m[key] + (1 - beta1) * grads[key]
            self.v[key] = beta2 * self.v[key] + (1 - beta2) * (grads[key] ** 2)
            m_hat = self.m[key] / (1 - beta1 ** self.t)
            v_hat = self.v[key] / (1 - beta2 ** self.t)
            self.params[key] -= learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
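
    # For reference, the update above follows Adam (Kingma & Ba, 2015):
    #   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
    #   v_t = beta2 * v_{t-1} + (1 - beta2) * g_t**2
    #   m_hat = m_t / (1 - beta1**t),  v_hat = v_t / (1 - beta2**t)
    #   theta_t = theta_{t-1} - lr * m_hat / (sqrt(v_hat) + eps)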

    def save_model(self, filepath):
        with open(filepath, 'wb') as f:
            pickle.dump(self.params, f)

    def load_model(self, filepath):
        # Note: only the weights are persisted; Adam state (m, v, t) is not
        # saved, so it starts fresh if training resumes after a load.
        with open(filepath, 'rb') as f:
            self.params = pickle.load(f)
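
# A quick sanity check of the MLP class on synthetic data could look like the
# sketch below (illustrative only; the shapes, seed, and learning rate are
# arbitrary assumptions, kept commented out so the module has no import-time
# side effects):
#
#     rng = np.random.RandomState(0)
#     X_demo = rng.randn(200, 20)
#     y_demo = (X_demo[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)
#     demo = MLP(layer_sizes=[20, 16, 1])
#     for _ in range(100):
#         p, cache = demo.forward(X_demo, training=True)
#         grads = demo.backward(p, y_demo, cache)
#         demo.update_params(grads, learning_rate=0.01)
#     p, _ = demo.forward(X_demo, training=False)
#     print("demo accuracy:", np.mean((p.flatten() >= 0.5) == y_demo))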

# ========================
# Data Preprocessing Utils
# ========================

class SimpleScaler:
    def __init__(self):
        self.mean = None
        self.std = None

    def fit(self, X):
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        # Guard against division by zero for constant features.
        self.std[self.std == 0] = 1.0

    def transform(self, X):
        return (X - self.mean) / self.std

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

def load_and_preprocess(csv_path, target_col='LUNG_CANCER', scaler=None):
    df = pd.read_csv(csv_path)
    # Normalize column names by stripping surrounding whitespace.
    df.columns = [col.strip() for col in df.columns]
    # Map YES/NO target labels to 1/0.
    if df[target_col].dtype == 'object':
        df[target_col] = df[target_col].apply(lambda x: 1 if str(x).upper() == 'YES' else 0)
    y = df[target_col].values
    X = df.drop(columns=[target_col])
    # Caveat: one-hot encoding train and test independently can produce
    # mismatched columns if a categorical level appears in only one split.
    X = pd.get_dummies(X, drop_first=True)
    X = X.fillna(X.median())
    X = X.values.astype(float)
    # Fit the scaler on training data; reuse it unchanged for test data.
    if scaler is None:
        scaler = SimpleScaler()
        X = scaler.fit_transform(X)
    else:
        X = scaler.transform(X)
    return X, y, scaler

def get_batches(X, y, batch_size=32, shuffle=True):
    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch_indices = indices[start_idx:end_idx]
        yield X[batch_indices], y[batch_indices]

def binary_crossentropy_loss(y_pred, y_true):
    # Clip predictions away from 0 and 1 to keep log() finite.
    y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
    loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

def calculate_class_weights(y):
    # Inverse-frequency weighting, matching sklearn's class_weight='balanced'
    # scheme: total / (n_classes * count).
    classes, counts = np.unique(y, return_counts=True)
    total = len(y)
    weights = {}
    for cls, count in zip(classes, counts):
        weights[int(cls)] = total / (len(classes) * count)
    return weights
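
# Worked example (illustrative numbers, not from the dataset): for 100 samples
# with 90 positives and 10 negatives,
#   weights = {0: 100 / (2 * 10) = 5.0, 1: 100 / (2 * 90) ≈ 0.56},
# so the minority class contributes proportionally more to the gradient.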

def compute_metrics(model, X, y, threshold=0.5):
    predictions, _ = model.forward(X, training=False)
    y_prob = predictions.flatten()
    y_pred = (y_prob >= threshold).astype(int)
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, zero_division=0)
    rec = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)
    cm = confusion_matrix(y, y_pred)
    fpr, tpr, _ = roc_curve(y, y_prob)
    roc_auc = auc(fpr, tpr)
    results = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'auc': roc_auc,
        'confusion_matrix': cm,
        'y_prob': y_prob,
        'y_pred': y_pred,
        'fpr': fpr,
        'tpr': tpr
    }
    return results

def print_evaluation_results(results):
    print("\n" + "=" * 50)
    print("EVALUATION RESULTS")
    print("=" * 50)
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"F1 Score: {results['f1_score']:.4f}")
    print(f"AUC: {results['auc']:.4f}")
    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])
    print("=" * 50 + "\n")

def save_roc_curve(fpr, tpr, roc_auc, filename='roc_curve.png'):
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"ROC curve saved to {filename}")

def save_confusion_matrix(cm, filename='confusion_matrix.png'):
    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=['Negative', 'Positive'],
           yticklabels=['Negative', 'Positive'],
           title='Confusion Matrix',
           ylabel='True label',
           xlabel='Predicted label')
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Confusion matrix saved to {filename}")

# =================
# Data Splitting
# =================

def split_data(input_csv):
    df = pd.read_csv(input_csv)
    print(f"Total samples: {len(df)}")
    # Stratified 70/15/15 train/val/test split.
    train_df, temp_df = train_test_split(
        df,
        test_size=0.30,
        stratify=df['LUNG_CANCER'],
        random_state=42
    )
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.50,
        stratify=temp_df['LUNG_CANCER'],
        random_state=42
    )
    train_df.to_csv('train.csv', index=False)
    val_df.to_csv('val.csv', index=False)
    test_df.to_csv('test.csv', index=False)
    print("\nSplit completed:")
    print(f"  train.csv: {len(train_df)} samples")
    print(f"  val.csv: {len(val_df)} samples")
    print(f"  test.csv: {len(test_df)} samples")
    print("\nFiles saved successfully!")

# =================
# Training Routine
# =================

def train_model(model, X_train, y_train, X_val, y_val,
                epochs=100, batch_size=32, learning_rate=0.001,
                class_weights=None):
    print("\nStarting training...")
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        epoch_losses = []
        for X_batch, y_batch in get_batches(X_train, y_train, batch_size, shuffle=True):
            y_pred, cache = model.forward(X_batch, training=True)
            y_pred_flat = y_pred.flatten()
            loss = binary_crossentropy_loss(y_pred_flat, y_batch)
            epoch_losses.append(loss)
            grads = model.backward(y_pred, y_batch, cache, class_weights)
            model.update_params(grads, learning_rate)
        avg_train_loss = np.mean(epoch_losses)
        train_losses.append(avg_train_loss)
        # Validation pass with dropout disabled.
        val_pred, _ = model.forward(X_val, training=False)
        val_loss = binary_crossentropy_loss(val_pred.flatten(), y_val)
        val_losses.append(val_loss)
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print("Training completed!\n")
    return train_losses, val_losses

def main(train_csv, test_csv):
    print("=" * 60)
    print("MLP LUNG CANCER PREDICTION")
    print("=" * 60)
    print("\nLoading training data...")
    X_train, y_train, scaler = load_and_preprocess(train_csv, target_col='LUNG_CANCER')
    print("Loading test data...")
    X_test, y_test, _ = load_and_preprocess(test_csv, target_col='LUNG_CANCER', scaler=scaler)
    print("\nDataset Info:")
    print(f"  Training samples: {X_train.shape[0]}")
    print(f"  Test samples: {X_test.shape[0]}")
    print(f"  Number of features: {X_train.shape[1]}")
    print("\nClass distribution in training set:")
    unique, counts = np.unique(y_train, return_counts=True)
    for cls, count in zip(unique, counts):
        print(f"  Class {cls}: {count} samples ({count / len(y_train) * 100:.1f}%)")
    class_weights = calculate_class_weights(y_train)
    print(f"\nClass weights: {class_weights}")
    input_dim = X_train.shape[1]
    hidden_dim1 = 128
    hidden_dim2 = 64
    output_dim = 1
    layer_sizes = [input_dim, hidden_dim1, hidden_dim2, output_dim]
    activations = ['relu', 'relu']
    dropout_rates = [0.2, 0.1]
    print("\nModel Architecture:")
    print(f"  Input layer: {input_dim} neurons")
    print(f"  Hidden layer 1: {hidden_dim1} neurons (ReLU, Dropout=0.2)")
    print(f"  Hidden layer 2: {hidden_dim2} neurons (ReLU, Dropout=0.1)")
    print(f"  Output layer: {output_dim} neuron (Sigmoid)")
    model = MLP(layer_sizes=layer_sizes, activations=activations, dropout_rates=dropout_rates, seed=42)
    model_path = 'final_model.pkl'
    if os.path.exists(model_path):
        print(f"\nFound existing model at {model_path}")
        print("Loading trained model...")
        model.load_model(model_path)
    else:
        print("\nNo existing model found. Training new model...")
        epochs = 100
        batch_size = 32
        learning_rate = 0.001
        # Note: the training set doubles as the validation set here; for a
        # true held-out validation loss, load val.csv (from split_data) instead.
        train_losses, val_losses = train_model(
            model, X_train, y_train, X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            class_weights=class_weights
        )
        print(f"Saving model to {model_path}...")
        model.save_model(model_path)
    print("\nEvaluating model on test set...")
    results = compute_metrics(model, X_test, y_test, threshold=0.5)
    print_evaluation_results(results)
    print("Generating and saving plots...")
    save_roc_curve(results['fpr'], results['tpr'], results['auc'], 'test_roc_curve.png')
    save_confusion_matrix(results['confusion_matrix'], 'test_confusion_matrix.png')
    print("\nAll done! Check the generated plots.")
    return results

if __name__ == "__main__":
    # To regenerate the splits, uncomment:
    # split_data('survey lung cancer.csv')
    if len(sys.argv) == 3:
        train_csv_path = sys.argv[1]
        test_csv_path = sys.argv[2]
    else:
        train_csv_path = 'train.csv'
        test_csv_path = 'test.csv'
        print("Usage: python lung_cancer_mlp.py <train_csv> <test_csv>")
        print(f"Using default files: {train_csv_path}, {test_csv_path}\n")
    main(train_csv_path, test_csv_path)
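
A minimal end-to-end run might look like the following sketch (hypothetical;
it assumes this file is saved as lung_cancer_mlp.py and that
'survey lung cancer.csv' is present to regenerate the splits):

    from lung_cancer_mlp import split_data, main

    split_data('survey lung cancer.csv')   # writes train.csv / val.csv / test.csv
    results = main('train.csv', 'test.csv')
    print(f"Test AUC: {results['auc']:.3f}")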
