
Commit 255cf13

added python files for the autoencoder
1 parent 73e2833 commit 255cf13

File tree

7 files changed: +216 -0 lines changed


ML_pipeline/convert_to_tflite.py

Lines changed: 13 additions & 0 deletions
import tensorflow as tf

# Load the trained Keras autoencoder
model = tf.keras.models.load_model("C:/Users/Marco/autoencoder_model.keras")

# Convert the model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write the converted model to disk
tflite_model_path = "C:/Users/Marco/autoencoder_model.tflite"
with open(tflite_model_path, "wb") as f:
    f.write(tflite_model)

print("Model successfully converted to TensorFlow Lite")
print(f"TFLite model saved at: {tflite_model_path}")
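For reference, a minimal sketch of running the converted model through the TFLite interpreter, assuming the output path above; the zero-filled sample is purely illustrative and stands in for a real scaled feature vector:

import numpy as np
import tensorflow as tf

# Load the converted model and allocate its tensors
interpreter = tf.lite.Interpreter(model_path="C:/Users/Marco/autoencoder_model.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Illustrative input: a zero vector with the model's expected shape
sample = np.zeros(input_details[0]["shape"], dtype=np.float32)
interpreter.set_tensor(input_details[0]["index"], sample)
interpreter.invoke()

reconstruction = interpreter.get_tensor(output_details[0]["index"])
print("Reconstruction shape:", reconstruction.shape)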

ML_pipeline/prepare_data.py

Lines changed: 23 additions & 0 deletions
import pandas as pd
import numpy as np

file_path = "C:/Users/Marco/processed_prometheus_data_scaled_copy.csv"
df = pd.read_csv(file_path, parse_dates=['timestamp'])

# All columns except the timestamp are model features
features = df.drop(columns=['timestamp']).columns

# Train on 2025-02-21 through 2025-02-23; hold out everything before
# 2025-02-21 for validation
train_df = df[(df['timestamp'] >= '2025-02-21') & (df['timestamp'] < '2025-02-24')]
test_df = df[df['timestamp'] < '2025-02-21']

X_train = train_df[features].values
X_test = test_df[features].values

train_file = "C:/Users/Marco/train_data.npy"
test_file = "C:/Users/Marco/test_data.npy"
np.save(train_file, X_train)
np.save(test_file, X_test)

print(f"Data split completed! Training samples: {X_train.shape[0]}, Validation samples: {X_test.shape[0]}")
print(f"Training data saved at: {train_file}")
print(f"Validation data saved at: {test_file}")
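A quick sanity check on the saved splits, reloading the arrays and asserting a consistent feature count and the absence of NaNs; the checks are illustrative and not part of the original pipeline:

import numpy as np

X_train = np.load("C:/Users/Marco/train_data.npy")
X_test = np.load("C:/Users/Marco/test_data.npy")

# Both splits must share a feature count, and scaled data should be NaN-free
assert X_train.shape[1] == X_test.shape[1], "feature count mismatch between splits"
assert not np.isnan(X_train).any() and not np.isnan(X_test).any(), "NaNs in saved data"
print(f"train: {X_train.shape}, test: {X_test.shape}")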
Lines changed: 53 additions & 0 deletions
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from keras.models import load_model

# Load the trained autoencoder and the held-out validation data
autoencoder = load_model("C:/Users/Marco/autoencoder_model.keras")
val_data = np.load("C:/Users/Marco/test_data.npy")

reconstructions = autoencoder.predict(val_data)

# Mean absolute reconstruction error per sample
errors = np.mean(np.abs(val_data - reconstructions), axis=1)

plt.figure(figsize=(10, 5))
plt.hist(errors, bins=30, alpha=0.7, color="blue", edgecolor="black", density=True)

# Overlay a normal distribution fitted to the errors
mu, std = np.mean(errors), np.std(errors)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)
plt.plot(x, p, "r", linewidth=2, label=f"Normal Dist (mean={mu:.4f}, std={std:.4f})")

plt.axvline(mu + 3 * std, color="red", linestyle="dashed", linewidth=2, label="Mean + 3 Std Dev Threshold")
plt.axvline(mu + 2 * std, color="orange", linestyle="dashed", linewidth=2, label="Mean + 2 Std Dev Threshold")

plt.title("Histogram of Reconstruction Errors")
plt.xlabel("Reconstruction Error")
plt.ylabel("Density")
plt.legend()
plt.grid()
plt.show()

# Test the errors for normality
shapiro_test = stats.shapiro(errors)
ks_test = stats.kstest(errors, "norm", args=(mu, std))

print(f"Shapiro-Wilk Test p-value: {shapiro_test.pvalue:.4f}")
print(f"Kolmogorov-Smirnov Test p-value: {ks_test.pvalue:.4f}")

# Decision rule: if the p-value < 0.05, the data is not normally distributed
if shapiro_test.pvalue < 0.05 or ks_test.pvalue < 0.05:
    print("Since the reconstruction errors don't follow a normal distribution, we have to use the percentile-based approach.")
    threshold = np.percentile(errors, 95)  # 95th percentile
    print(f"* Using 95th Percentile Threshold: {threshold:.4f}")
else:
    print("The reconstruction errors follow a normal distribution, therefore mean + 3 std dev is a valid thresholding method.")
    threshold = mu + 3 * std
    print(f"* Using Mean + 3 Std Dev Threshold: {threshold:.4f}")

np.save("C:/Users/Marco/reconstruction_errors.npy", errors)
print("Reconstruction errors saved to reconstruction_errors.npy")
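Whichever branch fires, the chosen threshold is what turns errors into anomaly flags. A minimal sketch of that final step, assuming the saved errors file and, for illustration, the 95th-percentile threshold:

import numpy as np

errors = np.load("C:/Users/Marco/reconstruction_errors.npy")
threshold = np.percentile(errors, 95)  # stand-in: reuse the threshold chosen above

anomaly_mask = errors > threshold
print(f"Flagged {anomaly_mask.sum()} of {errors.size} samples as anomalous")
print("First anomalous indices:", np.where(anomaly_mask)[0][:10])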
Lines changed: 26 additions & 0 deletions
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

reconstruction_errors = np.load("C:/Users/Marco/reconstruction_errors.npy")

# Candidate thresholds at the 95th, 99th, and 99.5th percentiles
percentiles = [95, 99, 99.5]
thresholds = {p: np.percentile(reconstruction_errors, p) for p in percentiles}

# Count how many samples each candidate threshold would flag
anomaly_counts = {p: np.sum(reconstruction_errors > t) for p, t in thresholds.items()}

plt.figure(figsize=(10, 5))
sns.histplot(reconstruction_errors, bins=30, kde=True, color="blue")

for p, t in thresholds.items():
    plt.axvline(t, color="red" if p >= 99 else "orange", linestyle="--", label=f"{p}th Percentile ({t:.4f})")

plt.xlabel("Reconstruction Error")
plt.ylabel("Density")
plt.title("Histogram of Reconstruction Errors with Percentile-Based Thresholds")
plt.legend()
plt.show()

for p in percentiles:
    print(f"{p}th Percentile Threshold: {thresholds[p]:.4f}")
    print(f"Number of Anomalies Detected: {anomaly_counts[p]}")
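The script only prints the candidate thresholds; a live pipeline would need the chosen cut-off persisted alongside the other artifacts. A sketch assuming the 99th percentile is selected, with anomaly_threshold.npy as a hypothetical output path:

import numpy as np

errors = np.load("C:/Users/Marco/reconstruction_errors.npy")
threshold = np.percentile(errors, 99)  # illustrative choice among the candidates

# Hypothetical output path, mirroring the other files in this pipeline
np.save("C:/Users/Marco/anomaly_threshold.npy", np.array([threshold]))
print(f"Persisted 99th percentile threshold: {threshold:.4f}")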

ML_pipeline/train_autoencoder.py

Lines changed: 28 additions & 0 deletions
import numpy as np
from tensorflow import keras

train_data = np.load("C:/Users/Marco/train_data.npy")
test_data = np.load("C:/Users/Marco/test_data.npy")

# Symmetric autoencoder: the input is compressed to a 4-unit bottleneck
# and then reconstructed back to the original dimensionality
input_dim = train_data.shape[1]
model = keras.Sequential([
    keras.Input(shape=(input_dim,)),
    keras.layers.Dense(14, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(4, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(14, activation='relu'),
    keras.layers.Dense(input_dim, activation='sigmoid')
])

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# The autoencoder learns to reproduce its input, so the data serves as both x and y
history = model.fit(
    train_data, train_data,
    epochs=100,
    batch_size=32,
    validation_data=(test_data, test_data)
)

model.save("C:/Users/Marco/autoencoder_model.keras")
print("Autoencoder Model Trained and Saved")
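Training for a fixed 100 epochs risks overfitting the reconstruction task. A variant sketch of the same fit call with Keras's EarlyStopping callback, written as a continuation of the script above; the patience value is an illustrative choice:

from tensorflow import keras

# Continuation of train_autoencoder.py: assumes `model`, `train_data`,
# and `test_data` are defined as above
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',       # watch the held-out reconstruction loss
    patience=10,              # illustrative: stop after 10 stagnant epochs
    restore_best_weights=True
)

history = model.fit(
    train_data, train_data,
    epochs=100,
    batch_size=32,
    validation_data=(test_data, test_data),
    callbacks=[early_stop]
)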

ML_pipeline/train_scaler.py

Lines changed: 12 additions & 0 deletions
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the raw (unscaled) feature columns
file_path = "C:/Users/Marco/processed_prometheus_data_copy.csv"
df = pd.read_csv(file_path).drop(columns=["timestamp"])

scaler = MinMaxScaler()
scaler.fit(df)

joblib.dump(scaler, "C:/Users/Marco/scaler.pkl")
print(f"New scaler trained on {scaler.n_features_in_} features and saved!")
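To reuse the scaler at inference time, joblib.load restores the fitted MinMaxScaler so new rows are transformed with the training-time ranges. A sketch assuming the path above; new_prometheus_data.csv is a hypothetical export with the same columns:

import pandas as pd
import joblib

scaler = joblib.load("C:/Users/Marco/scaler.pkl")

# Hypothetical new export with the same feature columns as the training CSV
new_df = pd.read_csv("C:/Users/Marco/new_prometheus_data.csv").drop(columns=["timestamp"])
scaled = scaler.transform(new_df)
print("Scaled batch shape:", scaled.shape)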

ML_pipeline/tune_autoencoder.py

Lines changed: 61 additions & 0 deletions
import keras_tuner as kt
from tensorflow import keras
import numpy as np

train_data = np.load("C:/Users/Marco/train_data.npy")
test_data = np.load("C:/Users/Marco/test_data.npy")

def build_model(hp):
    input_dim = train_data.shape[1]
    model = keras.Sequential()
    model.add(keras.Input(shape=(input_dim,)))

    # Encoder: two tunable layers narrowing toward the bottleneck
    model.add(keras.layers.Dense(
        units=hp.Int('units1', min_value=8, max_value=32, step=4),
        activation='relu'
    ))
    model.add(keras.layers.Dense(
        units=hp.Int('units2', min_value=4, max_value=16, step=2),
        activation='relu'
    ))

    # Fixed 4-unit bottleneck
    model.add(keras.layers.Dense(4, activation='relu'))

    # Decoder: mirrors the encoder with independently tuned widths
    model.add(keras.layers.Dense(
        units=hp.Int('units3', min_value=4, max_value=16, step=2),
        activation='relu'
    ))
    model.add(keras.layers.Dense(
        units=hp.Int('units4', min_value=8, max_value=32, step=4),
        activation='relu'
    ))
    model.add(keras.layers.Dense(input_dim, activation='sigmoid'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='mse')
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=2,
    directory='tuner_dir',
    project_name='autoencoder_tuning'
)

tuner.search(train_data, train_data, epochs=50, validation_data=(test_data, test_data))

best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
print(best_hp.values)

best_model = tuner.get_best_models(num_models=1)[0]
print("Best Model Summary:")
best_model.summary()
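The model returned by get_best_models has only seen the 50 search epochs. A common follow-up, sketched here as a continuation of the script above, is to rebuild the winning configuration and train it fully before saving; the output filename is a hypothetical choice:

# Continuation of tune_autoencoder.py: rebuild a fresh model from the best
# hyperparameters and train it longer than the 50 search epochs
model = tuner.hypermodel.build(best_hp)
model.fit(train_data, train_data, epochs=100, batch_size=32,
          validation_data=(test_data, test_data))
model.save("C:/Users/Marco/tuned_autoencoder_model.keras")  # hypothetical output path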
