
Commit 255cf13

added python files for the autoencoder
1 parent 73e2833 commit 255cf13

File tree

7 files changed: +216 -0 lines changed


ML_pipeline/convert_to_tflite.py

Lines changed: 13 additions & 0 deletions
import tensorflow as tf

# Load the trained Keras autoencoder
model = tf.keras.models.load_model("C:/Users/Marco/autoencoder_model.keras")

# Convert the model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write the converted model to disk
tflite_model_path = "C:/Users/Marco/autoencoder_model.tflite"
with open(tflite_model_path, "wb") as f:
    f.write(tflite_model)

print("Model successfully converted to TensorFlow Lite")
print(f"TFLite model saved at: {tflite_model_path}")
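For reference, a minimal sketch of running the converted model through the TFLite interpreter, assuming the output path above; the zero-filled sample is purely illustrative and stands in for a real scaled feature vector:

import numpy as np
import tensorflow as tf

# Load the converted model and allocate its tensors
interpreter = tf.lite.Interpreter(model_path="C:/Users/Marco/autoencoder_model.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Illustrative input: a zero vector with the model's expected shape
sample = np.zeros(input_details[0]["shape"], dtype=np.float32)
interpreter.set_tensor(input_details[0]["index"], sample)
interpreter.invoke()

reconstruction = interpreter.get_tensor(output_details[0]["index"])
print("Reconstruction shape:", reconstruction.shape)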

ML_pipeline/prepare_data.py

Lines changed: 23 additions & 0 deletions
import pandas as pd
import numpy as np

file_path = "C:/Users/Marco/processed_prometheus_data_scaled_copy.csv"
df = pd.read_csv(file_path, parse_dates=['timestamp'])

# All columns except the timestamp are model features
features = df.drop(columns=['timestamp']).columns

# Train on 2025-02-21 through 2025-02-23; hold out everything before
# 2025-02-21 for validation
train_df = df[(df['timestamp'] >= '2025-02-21') & (df['timestamp'] < '2025-02-24')]
test_df = df[df['timestamp'] < '2025-02-21']

X_train = train_df[features].values
X_test = test_df[features].values

train_file = "C:/Users/Marco/train_data.npy"
test_file = "C:/Users/Marco/test_data.npy"
np.save(train_file, X_train)
np.save(test_file, X_test)

print(f"Data split completed! Training samples: {X_train.shape[0]}, Validation samples: {X_test.shape[0]}")
print(f"Training data saved at: {train_file}")
print(f"Validation data saved at: {test_file}")
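A quick sanity check on the saved splits, reloading the arrays and asserting a consistent feature count and the absence of NaNs; the checks are illustrative and not part of the original pipeline:

import numpy as np

X_train = np.load("C:/Users/Marco/train_data.npy")
X_test = np.load("C:/Users/Marco/test_data.npy")

# Both splits must share a feature count, and scaled data should be NaN-free
assert X_train.shape[1] == X_test.shape[1], "feature count mismatch between splits"
assert not np.isnan(X_train).any() and not np.isnan(X_test).any(), "NaNs in saved data"
print(f"train: {X_train.shape}, test: {X_test.shape}")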
Lines changed: 53 additions & 0 deletions
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from keras.models import load_model

# Load the trained autoencoder and the held-out validation data
autoencoder = load_model("C:/Users/Marco/autoencoder_model.keras")
val_data = np.load("C:/Users/Marco/test_data.npy")

reconstructions = autoencoder.predict(val_data)

# Mean absolute reconstruction error per sample
errors = np.mean(np.abs(val_data - reconstructions), axis=1)

plt.figure(figsize=(10, 5))
plt.hist(errors, bins=30, alpha=0.7, color="blue", edgecolor="black", density=True)

# Overlay a normal distribution fitted to the errors
mu, std = np.mean(errors), np.std(errors)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)
plt.plot(x, p, "r", linewidth=2, label=f"Normal Dist (mean={mu:.4f}, std={std:.4f})")

plt.axvline(mu + 3 * std, color="red", linestyle="dashed", linewidth=2, label="Mean + 3 Std Dev Threshold")
plt.axvline(mu + 2 * std, color="orange", linestyle="dashed", linewidth=2, label="Mean + 2 Std Dev Threshold")

plt.title("Histogram of Reconstruction Errors")
plt.xlabel("Reconstruction Error")
plt.ylabel("Density")
plt.legend()
plt.grid()
plt.show()

# Test the errors for normality
shapiro_test = stats.shapiro(errors)
ks_test = stats.kstest(errors, "norm", args=(mu, std))

print(f"Shapiro-Wilk Test p-value: {shapiro_test.pvalue:.4f}")
print(f"Kolmogorov-Smirnov Test p-value: {ks_test.pvalue:.4f}")

# Decision rule: if the p-value < 0.05, the data is not normally distributed
if shapiro_test.pvalue < 0.05 or ks_test.pvalue < 0.05:
    print("Since the reconstruction errors don't follow a normal distribution, we have to use the percentile-based approach.")
    threshold = np.percentile(errors, 95)  # 95th percentile
    print(f"* Using 95th Percentile Threshold: {threshold:.4f}")
else:
    print("The reconstruction errors follow a normal distribution, therefore mean + 3 std dev is a valid thresholding method.")
    threshold = mu + 3 * std
    print(f"* Using Mean + 3 Std Dev Threshold: {threshold:.4f}")

np.save("C:/Users/Marco/reconstruction_errors.npy", errors)
print("Reconstruction errors saved to reconstruction_errors.npy")
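Whichever branch fires, the chosen threshold is what turns errors into anomaly flags. A minimal sketch of that final step, assuming the saved errors file and, for illustration, the 95th-percentile threshold:

import numpy as np

errors = np.load("C:/Users/Marco/reconstruction_errors.npy")
threshold = np.percentile(errors, 95)  # stand-in: reuse the threshold chosen above

anomaly_mask = errors > threshold
print(f"Flagged {anomaly_mask.sum()} of {errors.size} samples as anomalous")
print("First anomalous indices:", np.where(anomaly_mask)[0][:10])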
Lines changed: 26 additions & 0 deletions
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

reconstruction_errors = np.load("C:/Users/Marco/reconstruction_errors.npy")

# Candidate thresholds at the 95th, 99th, and 99.5th percentiles
percentiles = [95, 99, 99.5]
thresholds = {p: np.percentile(reconstruction_errors, p) for p in percentiles}

# Count how many samples each candidate threshold would flag
anomaly_counts = {p: np.sum(reconstruction_errors > t) for p, t in thresholds.items()}

plt.figure(figsize=(10, 5))
sns.histplot(reconstruction_errors, bins=30, kde=True, color="blue")

for p, t in thresholds.items():
    plt.axvline(t, color="red" if p >= 99 else "orange", linestyle="--", label=f"{p}th Percentile ({t:.4f})")

plt.xlabel("Reconstruction Error")
plt.ylabel("Density")
plt.title("Histogram of Reconstruction Errors with Percentile-Based Thresholds")
plt.legend()
plt.show()

for p in percentiles:
    print(f"{p}th Percentile Threshold: {thresholds[p]:.4f}")
    print(f"Number of Anomalies Detected: {anomaly_counts[p]}")
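The script only prints the candidate thresholds; a live pipeline would need the chosen cut-off persisted alongside the other artifacts. A sketch assuming the 99th percentile is selected, with anomaly_threshold.npy as a hypothetical output path:

import numpy as np

errors = np.load("C:/Users/Marco/reconstruction_errors.npy")
threshold = np.percentile(errors, 99)  # illustrative choice among the candidates

# Hypothetical output path, mirroring the other files in this pipeline
np.save("C:/Users/Marco/anomaly_threshold.npy", np.array([threshold]))
print(f"Persisted 99th percentile threshold: {threshold:.4f}")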

ML_pipeline/train_autoencoder.py

Lines changed: 28 additions & 0 deletions
import numpy as np
from tensorflow import keras

train_data = np.load("C:/Users/Marco/train_data.npy")
test_data = np.load("C:/Users/Marco/test_data.npy")

# Symmetric autoencoder: the input is compressed to a 4-unit bottleneck
# and then reconstructed back to the original dimensionality
input_dim = train_data.shape[1]
model = keras.Sequential([
    keras.Input(shape=(input_dim,)),
    keras.layers.Dense(14, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(4, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(14, activation='relu'),
    keras.layers.Dense(input_dim, activation='sigmoid')
])

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# The autoencoder learns to reproduce its input, so the data serves as both x and y
history = model.fit(
    train_data, train_data,
    epochs=100,
    batch_size=32,
    validation_data=(test_data, test_data)
)

model.save("C:/Users/Marco/autoencoder_model.keras")
print("Autoencoder Model Trained and Saved")
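Training for a fixed 100 epochs risks overfitting the reconstruction task. A variant sketch of the same fit call with Keras's EarlyStopping callback, written as a continuation of the script above; the patience value is an illustrative choice:

from tensorflow import keras

# Continuation of train_autoencoder.py: assumes `model`, `train_data`,
# and `test_data` are defined as above
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',       # watch the held-out reconstruction loss
    patience=10,              # illustrative: stop after 10 stagnant epochs
    restore_best_weights=True
)

history = model.fit(
    train_data, train_data,
    epochs=100,
    batch_size=32,
    validation_data=(test_data, test_data),
    callbacks=[early_stop]
)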

ML_pipeline/train_scaler.py

Lines changed: 12 additions & 0 deletions
import pandas as pd
import joblib
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the raw (unscaled) feature columns
file_path = "C:/Users/Marco/processed_prometheus_data_copy.csv"
df = pd.read_csv(file_path).drop(columns=["timestamp"])

scaler = MinMaxScaler()
scaler.fit(df)

joblib.dump(scaler, "C:/Users/Marco/scaler.pkl")
print(f"New scaler trained on {scaler.n_features_in_} features and saved!")
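To reuse the scaler at inference time, joblib.load restores the fitted MinMaxScaler so new rows are transformed with the training-time ranges. A sketch assuming the path above; new_prometheus_data.csv is a hypothetical export with the same columns:

import pandas as pd
import joblib

scaler = joblib.load("C:/Users/Marco/scaler.pkl")

# Hypothetical new export with the same feature columns as the training CSV
new_df = pd.read_csv("C:/Users/Marco/new_prometheus_data.csv").drop(columns=["timestamp"])
scaled = scaler.transform(new_df)
print("Scaled batch shape:", scaled.shape)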

ML_pipeline/tune_autoencoder.py

Lines changed: 61 additions & 0 deletions
import keras_tuner as kt
from tensorflow import keras
import numpy as np

train_data = np.load("C:/Users/Marco/train_data.npy")
test_data = np.load("C:/Users/Marco/test_data.npy")

def build_model(hp):
    input_dim = train_data.shape[1]
    model = keras.Sequential()
    model.add(keras.Input(shape=(input_dim,)))

    # Encoder: two tunable layers narrowing toward the bottleneck
    model.add(keras.layers.Dense(
        units=hp.Int('units1', min_value=8, max_value=32, step=4),
        activation='relu'
    ))
    model.add(keras.layers.Dense(
        units=hp.Int('units2', min_value=4, max_value=16, step=2),
        activation='relu'
    ))

    # Fixed 4-unit bottleneck
    model.add(keras.layers.Dense(4, activation='relu'))

    # Decoder: mirrors the encoder with independently tuned widths
    model.add(keras.layers.Dense(
        units=hp.Int('units3', min_value=4, max_value=16, step=2),
        activation='relu'
    ))
    model.add(keras.layers.Dense(
        units=hp.Int('units4', min_value=8, max_value=32, step=4),
        activation='relu'
    ))
    model.add(keras.layers.Dense(input_dim, activation='sigmoid'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='mse')
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=2,
    directory='tuner_dir',
    project_name='autoencoder_tuning'
)

tuner.search(train_data, train_data, epochs=50, validation_data=(test_data, test_data))

best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
print(best_hp.values)

best_model = tuner.get_best_models(num_models=1)[0]
print("Best Model Summary:")
best_model.summary()
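The model returned by get_best_models has only seen the 50 search epochs. A common follow-up, sketched here as a continuation of the script above, is to rebuild the winning configuration and train it fully before saving; the output filename is a hypothetical choice:

# Continuation of tune_autoencoder.py: rebuild a fresh model from the best
# hyperparameters and train it longer than the 50 search epochs
model = tuner.hypermodel.build(best_hp)
model.fit(train_data, train_data, epochs=100, batch_size=32,
          validation_data=(test_data, test_data))
model.save("C:/Users/Marco/tuned_autoencoder_model.keras")  # hypothetical output path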
