import numpy as np
import pandas as pd
import pickle
import sys
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc
)

# ========================
# MLP Model Implementation
# ========================

class MLP:
    def __init__(self, layer_sizes, activations=None, seed=42, dropout_rates=None):
        np.random.seed(seed)
        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes) - 1

        # Hidden layers use ReLU and the output layer uses sigmoid in forward();
        # the `activations` list is stored for reference but not consulted there.
        self.activations = ['relu'] * (self.num_layers - 1) if activations is None else activations
        self.dropout_rates = [0.0] * (self.num_layers - 1) if dropout_rates is None else dropout_rates

        # Glorot/Xavier uniform initialization: limit = sqrt(6 / (fan_in + fan_out)).
        self.params = {}
        for i in range(self.num_layers):
            input_size = layer_sizes[i]
            output_size = layer_sizes[i + 1]
            limit = np.sqrt(6.0 / (input_size + output_size))
            self.params['W' + str(i)] = np.random.uniform(-limit, limit, (input_size, output_size))
            self.params['b' + str(i)] = np.zeros(output_size)

        # Adam optimizer state: first (m) and second (v) moment estimates per parameter.
        self.m = {}
        self.v = {}
        for key in self.params:
            self.m[key] = np.zeros_like(self.params[key])
            self.v[key] = np.zeros_like(self.params[key])
        self.t = 0

    def relu(self, z):
        return np.maximum(0, z)

    def relu_derivative(self, z):
        return (z > 0).astype(float)

    def sigmoid(self, z):
        # Clip the input so np.exp cannot overflow for large negative values.
        return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
    def forward(self, X, training=True):
        cache = {}
        cache['A0'] = X
        A = X
        for i in range(self.num_layers):
            W = self.params['W' + str(i)]
            b = self.params['b' + str(i)]
            Z = np.dot(A, W) + b
            cache['Z' + str(i)] = Z
            if i == self.num_layers - 1:
                A = self.sigmoid(Z)
            else:
                A = self.relu(Z)
                # Inverted dropout: scale kept activations by 1/(1 - p) during
                # training so no rescaling is needed at inference time.
                if training and self.dropout_rates[i] > 0:
                    mask = np.random.rand(*A.shape) > self.dropout_rates[i]
                    A = A * mask / (1.0 - self.dropout_rates[i])
                    cache['dropout_mask' + str(i)] = mask
            cache['A' + str(i + 1)] = A
        return A, cache

    def backward(self, y_pred, y_true, cache, class_weights=None):
        m = y_true.shape[0]
        grads = {}
        # For a sigmoid output trained with binary cross-entropy, the gradient
        # with respect to the output pre-activation simplifies to (y_pred - y_true).
        if class_weights is not None:
            sample_weights = np.array([class_weights[int(y)] for y in y_true]).reshape(-1, 1)
            dA = (y_pred.reshape(-1, 1) - y_true.reshape(-1, 1)) * sample_weights / m
        else:
            dA = (y_pred.reshape(-1, 1) - y_true.reshape(-1, 1)) / m
        for i in range(self.num_layers - 1, -1, -1):
            A_prev = cache['A' + str(i)]
            grads['W' + str(i)] = np.dot(A_prev.T, dA)
            grads['b' + str(i)] = np.sum(dA, axis=0)
            if i > 0:
                W = self.params['W' + str(i)]
                dA = np.dot(dA, W.T)
                Z_prev = cache['Z' + str(i - 1)]
                dA = dA * self.relu_derivative(Z_prev)
                if 'dropout_mask' + str(i - 1) in cache:
                    mask = cache['dropout_mask' + str(i - 1)]
                    dA = dA * mask / (1.0 - self.dropout_rates[i - 1])
        return grads

    def update_params(self, grads, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # Adam update with bias-corrected moment estimates (m_hat, v_hat).
        self.t += 1
        for key in self.params:
            self.m[key] = beta1 * self.m[key] + (1 - beta1) * grads[key]
            self.v[key] = beta2 * self.v[key] + (1 - beta2) * (grads[key] ** 2)
            m_hat = self.m[key] / (1 - beta1 ** self.t)
            v_hat = self.v[key] / (1 - beta2 ** self.t)
            self.params[key] -= learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

    def save_model(self, filepath):
        with open(filepath, 'wb') as f:
            pickle.dump(self.params, f)

    def load_model(self, filepath):
        with open(filepath, 'rb') as f:
            self.params = pickle.load(f)
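
# A minimal usage sketch for the class above (hypothetical toy shapes, not part
# of the pipeline below):
#   model = MLP(layer_sizes=[4, 8, 1], dropout_rates=[0.1])
#   probs, _ = model.forward(np.random.rand(16, 4), training=False)
#   # probs has shape (16, 1), each entry a probability in (0, 1)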

# ========================
# Data Preprocessing Utils
# ========================

class SimpleScaler:
    def __init__(self):
        self.mean = None
        self.std = None

    def fit(self, X):
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        # Guard against division by zero for constant features.
        self.std[self.std == 0] = 1.0

    def transform(self, X):
        return (X - self.mean) / self.std

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)
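
# SimpleScaler follows the familiar fit/transform contract (fit statistics on the
# training set, reuse them on test data), e.g. with hypothetical arrays:
#   scaler = SimpleScaler()
#   X_train_scaled = scaler.fit_transform(X_train_raw)
#   X_test_scaled = scaler.transform(X_test_raw)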

def load_and_preprocess(csv_path, target_col='LUNG_CANCER', scaler=None):
    df = pd.read_csv(csv_path)
    df.columns = [col.strip() for col in df.columns]
    # Map YES/NO labels to 1/0 if the target column is stored as strings.
    if df[target_col].dtype == 'object':
        df[target_col] = df[target_col].apply(lambda x: 1 if str(x).upper() == 'YES' else 0)
    y = df[target_col].values
    X = df.drop(columns=[target_col])
    # NOTE: pd.get_dummies is applied per file, so this assumes the train and
    # test CSVs contain the same categorical levels (otherwise columns differ).
    X = pd.get_dummies(X, drop_first=True)
    X = X.fillna(X.median())
    X = X.values.astype(float)
    if scaler is None:
        scaler = SimpleScaler()
        X = scaler.fit_transform(X)
    else:
        X = scaler.transform(X)
    return X, y, scaler

def get_batches(X, y, batch_size=32, shuffle=True):
    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)
    for start_idx in range(0, n_samples, batch_size):
        end_idx = min(start_idx + batch_size, n_samples)
        batch_indices = indices[start_idx:end_idx]
        yield X[batch_indices], y[batch_indices]
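
# Usage sketch (hypothetical arrays): iterate mini-batches once per epoch.
#   for X_batch, y_batch in get_batches(X_train, y_train, batch_size=64):
#       ...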

def binary_crossentropy_loss(y_pred, y_true):
    # Clip predictions away from 0 and 1 so np.log never receives an exact zero.
    y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
    loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

def calculate_class_weights(y):
    # "Balanced" weighting: weight[c] = n_samples / (n_classes * count[c]).
    classes, counts = np.unique(y, return_counts=True)
    total = len(y)
    weights = {}
    for cls, count in zip(classes, counts):
        weights[int(cls)] = total / (len(classes) * count)
    return weights
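
# For example, with a hypothetical 270/30 negative/positive split this gives
# weights {0: 300 / (2 * 270) ~= 0.56, 1: 300 / (2 * 30) = 5.0}, so minority-class
# errors contribute more to the gradient in MLP.backward.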

def compute_metrics(model, X, y, threshold=0.5):
    predictions, _ = model.forward(X, training=False)
    y_prob = predictions.flatten()
    y_pred = (y_prob >= threshold).astype(int)
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, zero_division=0)
    rec = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)
    cm = confusion_matrix(y, y_pred)
    fpr, tpr, _ = roc_curve(y, y_prob)
    roc_auc = auc(fpr, tpr)
    results = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'auc': roc_auc,
        'confusion_matrix': cm,
        'y_prob': y_prob,
        'y_pred': y_pred,
        'fpr': fpr,
        'tpr': tpr
    }
    return results

def print_evaluation_results(results):
    print("\n" + "=" * 50)
    print("EVALUATION RESULTS")
    print("=" * 50)
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    print(f"Recall: {results['recall']:.4f}")
    print(f"F1 Score: {results['f1_score']:.4f}")
    print(f"AUC: {results['auc']:.4f}")
    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])
    print("=" * 50 + "\n")

def save_roc_curve(fpr, tpr, roc_auc, filename='roc_curve.png'):
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"ROC curve saved to {filename}")

def save_confusion_matrix(cm, filename='confusion_matrix.png'):
    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=['Negative', 'Positive'],
           yticklabels=['Negative', 'Positive'],
           title='Confusion Matrix',
           ylabel='True label',
           xlabel='Predicted label')
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Confusion matrix saved to {filename}")

# =================
# Data Splitting
# =================

def split_data(input_csv):
    # Stratified 70/15/15 train/val/test split on the LUNG_CANCER label.
    df = pd.read_csv(input_csv)
    print(f"Total samples: {len(df)}")
    train_df, temp_df = train_test_split(
        df,
        test_size=0.30,
        stratify=df['LUNG_CANCER'],
        random_state=42
    )
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.50,
        stratify=temp_df['LUNG_CANCER'],
        random_state=42
    )
    train_df.to_csv('train.csv', index=False)
    val_df.to_csv('val.csv', index=False)
    test_df.to_csv('test.csv', index=False)
    print("\nSplit completed:")
    print(f"  train.csv: {len(train_df)} samples")
    print(f"  val.csv: {len(val_df)} samples")
    print(f"  test.csv: {len(test_df)} samples")
    print("\nFiles saved successfully!")

# =================
# Training Routine
# =================

def train_model(model, X_train, y_train, X_val, y_val,
                epochs=100, batch_size=32, learning_rate=0.001,
                class_weights=None):
    print("\nStarting training...")
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        epoch_losses = []
        for X_batch, y_batch in get_batches(X_train, y_train, batch_size, shuffle=True):
            y_pred, cache = model.forward(X_batch, training=True)
            y_pred_flat = y_pred.flatten()
            loss = binary_crossentropy_loss(y_pred_flat, y_batch)
            epoch_losses.append(loss)
            grads = model.backward(y_pred, y_batch, cache, class_weights)
            model.update_params(grads, learning_rate)
        avg_train_loss = np.mean(epoch_losses)
        train_losses.append(avg_train_loss)
        # Validation loss is computed with dropout disabled (training=False).
        val_pred, _ = model.forward(X_val, training=False)
        val_loss = binary_crossentropy_loss(val_pred.flatten(), y_val)
        val_losses.append(val_loss)
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print("Training completed!\n")
    return train_losses, val_losses
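
# Usage sketch (assuming preprocessed arrays and a constructed model):
#   train_losses, val_losses = train_model(model, X_tr, y_tr, X_va, y_va,
#                                          epochs=50, batch_size=32)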

def main(train_csv, test_csv):
    print("=" * 60)
    print("MLP LUNG CANCER PREDICTION")
    print("=" * 60)
    print("\nLoading training data...")
    X_train, y_train, scaler = load_and_preprocess(train_csv, target_col='LUNG_CANCER')
    print("Loading test data...")
    # Reuse the scaler fitted on the training data for the test data.
    X_test, y_test, _ = load_and_preprocess(test_csv, target_col='LUNG_CANCER', scaler=scaler)
    print("\nDataset Info:")
    print(f"  Training samples: {X_train.shape[0]}")
    print(f"  Test samples: {X_test.shape[0]}")
    print(f"  Number of features: {X_train.shape[1]}")
    print("\nClass distribution in training set:")
    unique, counts = np.unique(y_train, return_counts=True)
    for cls, count in zip(unique, counts):
        print(f"  Class {cls}: {count} samples ({count / len(y_train) * 100:.1f}%)")
    class_weights = calculate_class_weights(y_train)
    print(f"\nClass weights: {class_weights}")
    input_dim = X_train.shape[1]
    hidden_dim1 = 128
    hidden_dim2 = 64
    output_dim = 1
    layer_sizes = [input_dim, hidden_dim1, hidden_dim2, output_dim]
    activations = ['relu', 'relu']
    dropout_rates = [0.2, 0.1]
    print("\nModel Architecture:")
    print(f"  Input layer: {input_dim} neurons")
    print(f"  Hidden layer 1: {hidden_dim1} neurons (ReLU, Dropout=0.2)")
    print(f"  Hidden layer 2: {hidden_dim2} neurons (ReLU, Dropout=0.1)")
    print(f"  Output layer: {output_dim} neuron (Sigmoid)")
    model = MLP(layer_sizes=layer_sizes, activations=activations, dropout_rates=dropout_rates, seed=42)
    model_path = 'final_model.pkl'
    if os.path.exists(model_path):
        print(f"\nFound existing model at {model_path}")
        print("Loading trained model...")
        model.load_model(model_path)
    else:
        print("\nNo existing model found. Training new model...")
        epochs = 100
        batch_size = 32
        learning_rate = 0.001
        # NOTE: no separate validation file is loaded here, so the training set
        # also serves as the validation set for loss monitoring.
        train_losses, val_losses = train_model(
            model, X_train, y_train, X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            class_weights=class_weights
        )
        print(f"Saving model to {model_path}...")
        model.save_model(model_path)
    print("\nEvaluating model on test set...")
    results = compute_metrics(model, X_test, y_test, threshold=0.5)
    print_evaluation_results(results)
    print("Generating and saving plots...")
    save_roc_curve(results['fpr'], results['tpr'], results['auc'], 'test_roc_curve.png')
    save_confusion_matrix(results['confusion_matrix'], 'test_confusion_matrix.png')
    print("\nAll done! Check the generated plots.")
    return results

if __name__ == "__main__":
    # To regenerate train.csv / val.csv / test.csv, uncomment the next line:
    # split_data('survey lung cancer.csv')
    if len(sys.argv) == 3:
        train_csv_path = sys.argv[1]
        test_csv_path = sys.argv[2]
    else:
        train_csv_path = 'train.csv'
        test_csv_path = 'test.csv'
        print("Usage: python lung_cancer_mlp.py <train_csv> <test_csv>")
        print(f"Using default files: {train_csv_path}, {test_csv_path}\n")
    main(train_csv_path, test_csv_path)