diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7540599..ef5f923 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,6 +53,22 @@ add_executable(RandomForestClassifier tests/tree/RandomForestClassifierTest.cpp)
 target_compile_definitions(RandomForestClassifier PRIVATE TEST_RANDOM_FOREST_CLASSIFIER)
 target_link_libraries(RandomForestClassifier cpp_ml_library)
 
+add_executable(KMeansClustering tests/clustering/KMeansClusteringTest.cpp)
+target_compile_definitions(KMeansClustering PRIVATE TEST_KMEANS_CLUSTERING)
+target_link_libraries(KMeansClustering cpp_ml_library)
+
+add_executable(KNNClassifier tests/clustering/KNNClassifierTest.cpp)
+target_compile_definitions(KNNClassifier PRIVATE TEST_KNN_CLASSIFIER)
+target_link_libraries(KNNClassifier cpp_ml_library)
+
+add_executable(KNNRegressor tests/clustering/KNNRegressorTest.cpp)
+target_compile_definitions(KNNRegressor PRIVATE TEST_KNN_REGRESSOR)
+target_link_libraries(KNNRegressor cpp_ml_library)
+
+add_executable(HierarchicalClustering tests/clustering/HierarchicalClusteringTest.cpp)
+target_compile_definitions(HierarchicalClustering PRIVATE TEST_HIERARCHICAL_CLUSTERING)
+target_link_libraries(HierarchicalClustering cpp_ml_library)
+
 # Register individual tests
 add_test(NAME LogisticRegressionTest COMMAND LogisticRegressionTest)
 add_test(NAME PolynomialRegressionTest COMMAND PolynomialRegressionTest)
@@ -61,6 +77,11 @@ add_test(NAME DecisionTreeRegressor COMMAND DecisionTreeRegressor)
 add_test(NAME RandomForestRegressor COMMAND RandomForestRegressor)
 add_test(NAME DecisionTreeClassifier COMMAND DecisionTreeClassifier)
 add_test(NAME RandomForestClassifier COMMAND RandomForestClassifier)
+add_test(NAME KMeansClustering COMMAND KMeansClustering)
+add_test(NAME KNNClassifier COMMAND KNNClassifier)
+add_test(NAME KNNRegressor COMMAND KNNRegressor)
+add_test(NAME HierarchicalClustering COMMAND HierarchicalClustering)
+
 
 # Add example executables if BUILD_EXAMPLES is ON
 if(BUILD_EXAMPLES)
@@ -87,6 +108,14 @@ if(BUILD_EXAMPLES)
         target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_DECISION_TREE_CLASSIFIER)
     elseif(EXAMPLE_NAME STREQUAL "RandomForestClassifierExample")
         target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_RANDOM_FOREST_CLASSIFIER)
+    elseif(EXAMPLE_NAME STREQUAL "KMeansExample")
+        target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_KMEANS_CLUSTERING)
+    elseif(EXAMPLE_NAME STREQUAL "KNNClassifierExample")
+        target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_KNN_CLASSIFIER)
+    elseif(EXAMPLE_NAME STREQUAL "KNNRegressorExample")
+        target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_KNN_REGRESSOR)
+    elseif(EXAMPLE_NAME STREQUAL "HierarchicalClusteringExample")
+        target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_HIERARCHICAL_CLUSTERING)
     endif()
 endforeach()
 endif()
\ No newline at end of file
diff --git a/README.md b/README.md
index fcb6593..7b0394c 100644
--- a/README.md
+++ b/README.md
@@ -61,17 +61,19 @@ The following machine learning algorithms are planned, inspired by concepts and
    - [x] Polynomial Regression
    - [x] Multi-Linear Regression
    - [x] Logistic Regression
-   - [ ] Decision Tree Regression
-   - [ ] Random Forest Regression
+   - [x] Decision Tree Regression
+   - [x] Random Forest Regression
+   - [ ] K-Nearest Neighbors
 2. **Classification**
-   - [ ] Decision Tree Classifier
-   - [ ] Random Forest Classifier
+   - [x] Decision Tree Classifier
+   - [x] Random Forest Classifier
    - [ ] K-Nearest Neighbors
 3. **Clustering**
    - [ ] K-Means Clustering
+   - [ ] Hierarchical Clustering
 4. **Neural Networks**
    - [ ] Neural Network (NN)
diff --git a/examples/HierarchicalClusteringExample.cpp b/examples/HierarchicalClusteringExample.cpp
new file mode 100644
index 0000000..d476cf4
--- /dev/null
+++ b/examples/HierarchicalClusteringExample.cpp
@@ -0,0 +1,36 @@
+#include "../ml_library_include/ml/clustering/HierarchicalClustering.hpp"
+#include <iostream>
+
+int testHierarchicalClustering() {
+    // Sample data
+    std::vector<std::vector<double>> data = {
+        {1.0, 2.0},
+        {1.5, 1.8},
+        {5.0, 8.0},
+        {6.0, 9.0},
+        {1.0, 0.6},
+        {9.0, 11.0},
+        {8.0, 2.0},
+        {10.0, 2.0},
+        {9.0, 3.0}
+    };
+
+    // Create and fit the model
+    HierarchicalClustering hc(3, HierarchicalClustering::Linkage::AVERAGE);
+    hc.fit(data);
+
+    // Get cluster labels
+    std::vector<int> labels = hc.predict();
+
+    // Output cluster labels
+    for (size_t i = 0; i < labels.size(); ++i) {
+        std::cout << "Data point " << i << " is in cluster " << labels[i] << std::endl;
+    }
+
+    return 0;
+}
+
+int main(){
+    testHierarchicalClustering();
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/KMeansExample.cpp b/examples/KMeansExample.cpp
new file mode 100644
index 0000000..a2a87ca
--- /dev/null
+++ b/examples/KMeansExample.cpp
@@ -0,0 +1,45 @@
+#include "../ml_library_include/ml/clustering/KMeans.hpp"
+#include <iostream>
+
+int testKMeansClustering() {
+    // Sample data
+    std::vector<std::vector<double>> X = {
+        {1.0, 2.0},
+        {1.5, 1.8},
+        {5.0, 8.0},
+        {8.0, 8.0},
+        {1.0, 0.6},
+        {9.0, 11.0},
+        {8.0, 2.0},
+        {10.0, 2.0},
+        {9.0, 3.0},
+    };
+
+    // Create and fit the model
+    KMeans kmeans(3);
+    kmeans.fit(X);
+
+    // Predict cluster labels
+    std::vector<int> labels = kmeans.predict(X);
+
+    // Output results
+    for (size_t i = 0; i < labels.size(); ++i) {
+        std::cout << "Point: (" << X[i][0] << ", " << X[i][1] << ") - Cluster: " << labels[i] << std::endl;
+    }
+
+    // Get cluster centers
+    const auto& centers = kmeans.get_cluster_centers();
+    for (size_t k = 0; k < centers.size(); ++k) {
+        std::cout << "Cluster " << k << " center: (" << centers[k][0] << ", " << centers[k][1] << ")" << std::endl;
+    }
+
+    return 0;
+}
+
+// Only include main if TEST_KMEANS_CLUSTERING is defined
+//#ifdef TEST_KMEANS_CLUSTERING
+int main() {
+    testKMeansClustering();
+    return 0;
+}
+//#endif
\ No newline at end of file
diff --git a/examples/KNNClassifierExample.cpp b/examples/KNNClassifierExample.cpp
new file mode 100644
index 0000000..d703ff1
--- /dev/null
+++ b/examples/KNNClassifierExample.cpp
@@ -0,0 +1,41 @@
+#include "../ml_library_include/ml/clustering/KNNClassifier.hpp"
+#include <iostream>
+
+int testKNNClassifier() {
+    // Training data
+    std::vector<std::vector<double>> X_train = {
+        {1.0, 2.0},
+        {1.5, 1.8},
+        {5.0, 8.0},
+        {8.0, 8.0},
+        {1.0, 0.6},
+        {9.0, 11.0}
+    };
+    std::vector<int> y_train = {0, 0, 1, 1, 0, 1};
+
+    // Test data
+    std::vector<std::vector<double>> X_test = {
+        {1.0, 1.0},
+        {8.0, 9.0},
+        {0.0, 0.0}
+    };
+
+    // Create and train the classifier
+    KNNClassifier knn(3);
+    knn.fit(X_train, y_train);
+
+    // Make predictions
+    std::vector<int> predictions = knn.predict(X_test);
+
+    // Output predictions
+    for (size_t i = 0; i < predictions.size(); ++i) {
+        std::cout << "Sample " << i << " predicted class: " << predictions[i] << std::endl;
+    }
+
+    return 0;
+}
+
+int main(){
+    testKNNClassifier();
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/KNNRegressorExample.cpp b/examples/KNNRegressorExample.cpp
new file mode 100644
index 0000000..015bdba
--- /dev/null
+++ b/examples/KNNRegressorExample.cpp
@@ -0,0 +1,40 @@
+#include "../ml_library_include/ml/clustering/KNNRegressor.hpp"
+#include <iostream>
+
+int testKNNRegressor() {
+    // Training data
+    std::vector<std::vector<double>> X_train = {
+        {1.0},
+        {2.0},
+        {3.0},
+        {4.0},
+        {5.0}
+    };
+    std::vector<double> y_train = {2.0, 3.0, 4.0, 5.0, 6.0};
+
+    // Test data
+    std::vector<std::vector<double>> X_test = {
+        {1.5},
+        {2.5},
+        {3.5}
+    };
+
+    // Create and train the regressor
+    KNNRegressor knn(2);
+    knn.fit(X_train, y_train);
+
+    // Make predictions
+    std::vector<double> predictions = knn.predict(X_test);
+
+    // Output predictions
+    for (size_t i = 0; i < predictions.size(); ++i) {
+        std::cout << "Sample " << i << " predicted value: " << predictions[i] << std::endl;
+    }
+
+    return 0;
+}
+
+int main(){
+    testKNNRegressor();
+    return 0;
+}
\ No newline at end of file
diff --git a/ml_library_include/ml/clustering/HierarchicalClustering.hpp b/ml_library_include/ml/clustering/HierarchicalClustering.hpp
new file mode 100644
index 0000000..ac78003
--- /dev/null
+++ b/ml_library_include/ml/clustering/HierarchicalClustering.hpp
@@ -0,0 +1,238 @@
+#ifndef HIERARCHICAL_CLUSTERING_HPP
+#define HIERARCHICAL_CLUSTERING_HPP
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <utility>
+
+/**
+ * @file HierarchicalClustering.hpp
+ * @brief Implementation of Agglomerative Hierarchical Clustering.
+ */
+
+/**
+ * @class HierarchicalClustering
+ * @brief Agglomerative Hierarchical Clustering for clustering tasks.
+ */
+class HierarchicalClustering {
+public:
+    /**
+     * @brief Linkage criteria for clustering.
+     */
+    enum class Linkage {
+        SINGLE,
+        COMPLETE,
+        AVERAGE
+    };
+
+    /**
+     * @brief Constructs a HierarchicalClustering instance.
+     * @param n_clusters The number of clusters to form.
+     * @param linkage The linkage criterion to use.
+     */
+    HierarchicalClustering(int n_clusters = 2, Linkage linkage = Linkage::AVERAGE);
+
+    /**
+     * @brief Destructor for HierarchicalClustering.
+     */
+    ~HierarchicalClustering();
+
+    /**
+     * @brief Fits the clustering algorithm to the data.
+     * @param X A vector of feature vectors (data points).
+     */
+    void fit(const std::vector<std::vector<double>>& X);
+
+    /**
+     * @brief Predicts the cluster labels for the data.
+     * @return A vector of cluster labels.
+     */
+    std::vector<int> predict() const;
+
+    /**
+     * @brief Retrieves the cluster centers (centroids) after fitting.
+     * @return A vector of cluster centroids.
+     */
+    std::vector<std::vector<double>> get_cluster_centers() const;
+
+private:
+    int n_clusters;                        ///< Number of clusters to form.
+    Linkage linkage;                       ///< Linkage criterion.
+    std::vector<std::vector<double>> data; ///< Data points.
+
+    struct Cluster {
+        int id;                  ///< Unique identifier for the cluster.
+        std::vector<int> points; ///< Indices of data points in this cluster.
+    };
+
+    std::vector<std::shared_ptr<Cluster>> clusters; ///< Current clusters.
+
+    /**
+     * @brief Computes the Euclidean distance between two data points.
+     * @param a Index of the first data point.
+     * @param b Index of the second data point.
+     * @return The Euclidean distance.
+     */
+    double euclidean_distance(int a, int b) const;
+
+    /**
+     * @brief Computes the distance between two clusters based on the linkage criterion.
+     * @param cluster_a The first cluster.
+     * @param cluster_b The second cluster.
+     * @return The distance between the two clusters.
+     */
+    double cluster_distance(const Cluster& cluster_a, const Cluster& cluster_b) const;
+
+    /**
+     * @brief Merges the two closest clusters.
+     */
+    void merge_clusters();
+
+    /**
+     * @brief Finds the pair of clusters with the minimum distance.
+     * @return A pair of indices representing the clusters to merge.
+     */
+    std::pair<int, int> find_closest_clusters() const;
+};
+
+HierarchicalClustering::HierarchicalClustering(int n_clusters, Linkage linkage)
+    : n_clusters(n_clusters), linkage(linkage) {}
+
+HierarchicalClustering::~HierarchicalClustering() {}
+
+void HierarchicalClustering::fit(const std::vector<std::vector<double>>& X) {
+    data = X;
+
+    // Initialize each data point as a separate cluster
+    clusters.clear();
+    for (size_t i = 0; i < data.size(); ++i) {
+        auto cluster = std::make_shared<Cluster>();
+        cluster->id = static_cast<int>(i);
+        cluster->points.push_back(static_cast<int>(i));
+        clusters.push_back(cluster);
+    }
+
+    // Agglomerative clustering: keep merging until n_clusters remain
+    while (static_cast<int>(clusters.size()) > n_clusters) {
+        merge_clusters();
+    }
+}
+
+std::vector<int> HierarchicalClustering::predict() const {
+    std::vector<int> labels(data.size(), -1);
+    for (size_t i = 0; i < clusters.size(); ++i) {
+        for (int point_idx : clusters[i]->points) {
+            labels[point_idx] = static_cast<int>(i);
+        }
+    }
+    return labels;
+}
+
+std::vector<std::vector<double>> HierarchicalClustering::get_cluster_centers() const {
+    std::vector<std::vector<double>> centers;
+    centers.reserve(clusters.size());
+
+    for (const auto& cluster : clusters) {
+        std::vector<double> centroid(data[0].size(), 0.0);
+        for (int idx : cluster->points) {
+            const auto& point = data[idx];
+            for (size_t i = 0; i < point.size(); ++i) {
+                centroid[i] += point[i];
+            }
+        }
+        // Divide by the number of points to get the mean
+        for (double& val : centroid) {
+            val /= cluster->points.size();
+        }
+        centers.push_back(centroid);
+    }
+
+    return centers;
+}
+
+double HierarchicalClustering::euclidean_distance(int a, int b) const {
+    const auto& point_a = data[a];
+    const auto& point_b = data[b];
+    double distance = 0.0;
+    for (size_t i = 0; i < point_a.size(); ++i) {
+        double diff = point_a[i] - point_b[i];
+        distance += diff * diff;
+    }
+    return std::sqrt(distance);
+}
+
+double HierarchicalClustering::cluster_distance(const Cluster& cluster_a, const Cluster& cluster_b) const {
+    double distance = 0.0;
+
+    if (linkage == Linkage::SINGLE) {
+        // Minimum distance between any two points in the clusters
+        distance = std::numeric_limits<double>::max();
+        for (int idx_a : cluster_a.points) {
+            for (int idx_b : cluster_b.points) {
+                double dist = euclidean_distance(idx_a, idx_b);
+                if (dist < distance) {
+                    distance = dist;
+                }
+            }
+        }
+    } else if (linkage == Linkage::COMPLETE) {
+        // Maximum distance between any two points in the clusters
+        distance = 0.0;
+        for (int idx_a : cluster_a.points) {
+            for (int idx_b : cluster_b.points) {
+                double dist = euclidean_distance(idx_a, idx_b);
+                if (dist > distance) {
+                    distance = dist;
+                }
+            }
+        }
+    } else if (linkage == Linkage::AVERAGE) {
+        // Average distance over all pairs of points in the clusters
+        distance = 0.0;
+        int count = 0;
+        for (int idx_a : cluster_a.points) {
+            for (int idx_b : cluster_b.points) {
+                distance += euclidean_distance(idx_a, idx_b);
+                count++;
+            }
+        }
+        distance /= count;
+    }
+
+    return distance;
+}
+
+void HierarchicalClustering::merge_clusters() {
+    auto [idx_a, idx_b] = find_closest_clusters();
+
+    // Merge cluster b into cluster a
+    clusters[idx_a]->points.insert(clusters[idx_a]->points.end(),
+                                   clusters[idx_b]->points.begin(),
+                                   clusters[idx_b]->points.end());
+
+    // Remove cluster b
+    clusters.erase(clusters.begin() + idx_b);
+}
+
+std::pair<int, int> HierarchicalClustering::find_closest_clusters() const {
+    double min_distance = std::numeric_limits<double>::max();
+    int idx_a = -1;
+    int idx_b = -1;
+
+    for (size_t i = 0; i < clusters.size(); ++i) {
+        for (size_t j = i + 1; j < clusters.size(); ++j) {
+            double dist = cluster_distance(*clusters[i], *clusters[j]);
+            if (dist < min_distance) {
+                min_distance = dist;
+                idx_a = static_cast<int>(i);
+                idx_b = static_cast<int>(j);
+            }
+        }
+    }
+
+    return {idx_a, idx_b};
+}
+
+#endif // HIERARCHICAL_CLUSTERING_HPP
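Note: find_closest_clusters() above recomputes every point-to-point distance on each merge, which makes fitting roughly cubic in the number of points for average and complete linkage. One possible refinement, sketched below under the assumption that an n-by-n matrix fits in memory (this helper is illustrative and not part of the diff), is to precompute the pairwise distances once and index into the table from cluster_distance():

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Sketch: build a symmetric matrix of point-to-point Euclidean distances
    // once, so linkage computations become table lookups instead of repeated
    // calls to euclidean_distance().
    std::vector<std::vector<double>> pairwise_distances(
        const std::vector<std::vector<double>>& data) {
        const std::size_t n = data.size();
        std::vector<std::vector<double>> dist(n, std::vector<double>(n, 0.0));
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = i + 1; j < n; ++j) {
                double sum = 0.0;
                for (std::size_t d = 0; d < data[i].size(); ++d) {
                    const double diff = data[i][d] - data[j][d];
                    sum += diff * diff;
                }
                dist[i][j] = dist[j][i] = std::sqrt(sum);
            }
        }
        return dist;
    }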
diff --git a/ml_library_include/ml/clustering/KMeans.hpp b/ml_library_include/ml/clustering/KMeans.hpp
index e69de29..dea1bac 100644
--- a/ml_library_include/ml/clustering/KMeans.hpp
+++ b/ml_library_include/ml/clustering/KMeans.hpp
@@ -0,0 +1,239 @@
+#ifndef KMEANS_HPP
+#define KMEANS_HPP
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include <random>
+#include <cstddef>
+
+/**
+ * @file KMeans.hpp
+ * @brief An implementation of the K-Means clustering algorithm with K-Means++ initialization.
+ */
+
+/**
+ * @class KMeans
+ * @brief Implements the K-Means clustering algorithm with K-Means++ initialization.
+ */
+class KMeans {
+public:
+    /**
+     * @brief Constructs a KMeans object.
+     * @param n_clusters The number of clusters to form.
+     * @param max_iter The maximum number of iterations.
+     * @param tol The tolerance to declare convergence.
+     * @param random_state Seed for the random number generator (0 seeds from std::random_device).
+     */
+    KMeans(int n_clusters = 8, int max_iter = 300, double tol = 1e-4, unsigned int random_state = 0);
+
+    /**
+     * @brief Destructor for KMeans.
+     */
+    ~KMeans();
+
+    /**
+     * @brief Fits the KMeans model to the data.
+     * @param X A vector of feature vectors.
+     */
+    void fit(const std::vector<std::vector<double>>& X);
+
+    /**
+     * @brief Predicts the closest cluster each sample in X belongs to.
+     * @param X A vector of feature vectors.
+     * @return A vector of cluster labels.
+     */
+    std::vector<int> predict(const std::vector<std::vector<double>>& X) const;
+
+    /**
+     * @brief Returns the cluster centers.
+     * @return A vector of cluster centers.
+     */
+    const std::vector<std::vector<double>>& get_cluster_centers() const;
+
+private:
+    int n_clusters;
+    int max_iter;
+    double tol;
+    std::vector<std::vector<double>> cluster_centers;
+    std::vector<int> labels;
+
+    mutable std::mt19937 rng; ///< Random number generator; mutable so const members can draw from it.
+
+    /**
+     * @brief Computes the Euclidean distance between two points.
+     * @param a First point.
+     * @param b Second point.
+     * @return The Euclidean distance.
+     */
+    double euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const;
+
+    /**
+     * @brief Assigns each sample to the nearest cluster center.
+     * @param X A vector of feature vectors.
+     * @return A vector of cluster labels.
+     */
+    std::vector<int> assign_labels(const std::vector<std::vector<double>>& X) const;
+
+    /**
+     * @brief Computes the cluster centers given the current labels.
+     * @param X A vector of feature vectors.
+     * @param labels A vector of cluster labels.
+     * @return A vector of new cluster centers.
+     */
+    std::vector<std::vector<double>> compute_cluster_centers(const std::vector<std::vector<double>>& X, const std::vector<int>& labels) const;
+
+    /**
+     * @brief Initializes cluster centers using the K-Means++ algorithm.
+     * @param X A vector of feature vectors.
+     */
+    void initialize_centers(const std::vector<std::vector<double>>& X);
+};
+
+KMeans::KMeans(int n_clusters, int max_iter, double tol, unsigned int random_state)
+    : n_clusters(n_clusters), max_iter(max_iter), tol(tol), rng(random_state) {
+    if (random_state == 0) {
+        std::random_device rd;
+        rng.seed(rd());
+    }
+}
+
+KMeans::~KMeans() {}
+
+void KMeans::fit(const std::vector<std::vector<double>>& X) {
+    size_t n_samples = X.size();
+
+    // Initialize cluster centers using K-Means++ initialization
+    initialize_centers(X);
+
+    labels.resize(n_samples);
+    std::vector<std::vector<double>> old_cluster_centers;
+
+    for (int iter = 0; iter < max_iter; ++iter) {
+        // Assign labels to each point
+        labels = assign_labels(X);
+
+        // Save old centers
+        old_cluster_centers = cluster_centers;
+
+        // Compute new centers
+        cluster_centers = compute_cluster_centers(X, labels);
+
+        // Check for convergence
+        double max_center_shift = 0.0;
+        for (int i = 0; i < n_clusters; ++i) {
+            double shift = euclidean_distance(cluster_centers[i], old_cluster_centers[i]);
+            if (shift > max_center_shift) {
+                max_center_shift = shift;
+            }
+        }
+        if (max_center_shift <= tol) {
+            break;
+        }
+    }
+}
+
+std::vector<int> KMeans::predict(const std::vector<std::vector<double>>& X) const {
+    return assign_labels(X);
+}
+
+const std::vector<std::vector<double>>& KMeans::get_cluster_centers() const {
+    return cluster_centers;
+}
+
+double KMeans::euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const {
+    double sum = 0.0;
+    for (size_t i = 0; i < a.size(); ++i) {
+        double diff = a[i] - b[i];
+        sum += diff * diff;
+    }
+    return std::sqrt(sum);
+}
+
+std::vector<int> KMeans::assign_labels(const std::vector<std::vector<double>>& X) const {
+    std::vector<int> labels(X.size());
+    for (size_t i = 0; i < X.size(); ++i) {
+        double min_dist = std::numeric_limits<double>::max();
+        int label = -1;
+        for (int k = 0; k < n_clusters; ++k) {
+            double dist = euclidean_distance(X[i], cluster_centers[k]);
+            if (dist < min_dist) {
+                min_dist = dist;
+                label = k;
+            }
+        }
+        labels[i] = label;
+    }
+    return labels;
+}
+
+std::vector<std::vector<double>> KMeans::compute_cluster_centers(const std::vector<std::vector<double>>& X, const std::vector<int>& labels) const {
+    size_t n_features = X[0].size();
+    std::vector<std::vector<double>> new_centers(n_clusters, std::vector<double>(n_features, 0.0));
+    std::vector<int> counts(n_clusters, 0);
+
+    for (size_t i = 0; i < X.size(); ++i) {
+        int label = labels[i];
+        counts[label]++;
+        for (size_t j = 0; j < n_features; ++j) {
+            new_centers[label][j] += X[i][j];
+        }
+    }
+
+    for (int k = 0; k < n_clusters; ++k) {
+        if (counts[k] == 0) {
+            // If a cluster lost all its members, re-seed its center with a randomly chosen data point
+            std::uniform_int_distribution<size_t> dist(0, X.size() - 1);
+            new_centers[k] = X[dist(rng)];
+        } else {
+            for (size_t j = 0; j < n_features; ++j) {
+                new_centers[k][j] /= counts[k];
+            }
+        }
+    }
+
+    return new_centers;
+}
+
+void KMeans::initialize_centers(const std::vector<std::vector<double>>& X) {
+    size_t n_samples = X.size();
+    cluster_centers.clear();
+    cluster_centers.reserve(n_clusters);
+
+    // Step 1: Choose one center uniformly at random from the data points
+    std::uniform_int_distribution<size_t> dist(0, n_samples - 1);
+    size_t first_center_idx = dist(rng);
+    cluster_centers.push_back(X[first_center_idx]);
+
+    // Step 2: Track each data point's squared distance to its nearest chosen center
+    std::vector<double> distances(n_samples, std::numeric_limits<double>::max());
+
+    for (int k = 1; k < n_clusters; ++k) {
+        double total_distance = 0.0;
+        for (size_t i = 0; i < n_samples; ++i) {
+            double dist_to_center = euclidean_distance(X[i], cluster_centers.back());
+            double sq_dist = dist_to_center * dist_to_center;
+            if (sq_dist < distances[i]) {
+                distances[i] = sq_dist;
+            }
+            total_distance += distances[i];
+        }
+
+        // Step 3: Choose the next center with probability proportional to the squared distance
+        std::uniform_real_distribution<double> uniform_dist(0.0, total_distance);
+        double random_value = uniform_dist(rng);
+
+        double cumulative_distance = 0.0;
+        size_t next_center_idx = 0;
+        for (size_t i = 0; i < n_samples; ++i) {
+            cumulative_distance += distances[i];
+            if (cumulative_distance >= random_value) {
+                next_center_idx = i;
+                break;
+            }
+        }
+        cluster_centers.push_back(X[next_center_idx]);
+    }
+}
+
+#endif // KMEANS_HPP
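The seeding loop in initialize_centers() implements the K-Means++ weighted draw by hand with a cumulative-sum scan. An equivalent formulation (a sketch under the same squared-distance weights, not the library's own API) can delegate the draw to std::discrete_distribution, which normalizes the weights internally:

    #include <cstddef>
    #include <random>
    #include <vector>

    // Sketch: choose the next K-Means++ center index with probability
    // proportional to each point's squared distance to its nearest center.
    std::size_t sample_next_center(const std::vector<double>& sq_distances,
                                   std::mt19937& rng) {
        std::discrete_distribution<std::size_t> pick(sq_distances.begin(),
                                                     sq_distances.end());
        return pick(rng);
    }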
diff --git a/ml_library_include/ml/clustering/KNNClassifier.hpp b/ml_library_include/ml/clustering/KNNClassifier.hpp
new file mode 100644
index 0000000..4209aea
--- /dev/null
+++ b/ml_library_include/ml/clustering/KNNClassifier.hpp
@@ -0,0 +1,130 @@
+#ifndef KNN_CLASSIFIER_HPP
+#define KNN_CLASSIFIER_HPP
+
+#include <vector>
+#include <cmath>
+#include <algorithm>
+#include <utility>
+#include <unordered_map>
+
+/**
+ * @file KNNClassifier.hpp
+ * @brief Implementation of the K-Nearest Neighbors Classifier.
+ */
+
+/**
+ * @class KNNClassifier
+ * @brief K-Nearest Neighbors Classifier for classification tasks.
+ */
+class KNNClassifier {
+public:
+    /**
+     * @brief Constructs a KNNClassifier.
+     * @param k The number of neighbors to consider.
+     */
+    explicit KNNClassifier(int k = 3);
+
+    /**
+     * @brief Destructor for KNNClassifier.
+     */
+    ~KNNClassifier();
+
+    /**
+     * @brief Fits the classifier to the training data.
+     * @param X A vector of feature vectors (training data).
+     * @param y A vector of target class labels (training labels).
+     */
+    void fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y);
+
+    /**
+     * @brief Predicts class labels for the given input data.
+     * @param X A vector of feature vectors (test data).
+     * @return A vector of predicted class labels.
+     */
+    std::vector<int> predict(const std::vector<std::vector<double>>& X) const;
+
+private:
+    int k; ///< Number of neighbors to consider.
+    std::vector<std::vector<double>> X_train; ///< Training data features.
+    std::vector<int> y_train;                 ///< Training data labels.
+
+    /**
+     * @brief Computes the Euclidean distance between two feature vectors.
+     * @param a The first feature vector.
+     * @param b The second feature vector.
+     * @return The Euclidean distance.
+     */
+    double euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const;
+
+    /**
+     * @brief Predicts the class label for a single sample.
+     * @param x The feature vector of the sample.
+     * @return The predicted class label.
+     */
+    int predict_sample(const std::vector<double>& x) const;
+};
+
+KNNClassifier::KNNClassifier(int k) : k(k) {}
+
+KNNClassifier::~KNNClassifier() {}
+
+void KNNClassifier::fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y) {
+    X_train = X;
+    y_train = y;
+}
+
+std::vector<int> KNNClassifier::predict(const std::vector<std::vector<double>>& X) const {
+    std::vector<int> predictions;
+    predictions.reserve(X.size());
+    for (const auto& x : X) {
+        predictions.push_back(predict_sample(x));
+    }
+    return predictions;
+}
+
+double KNNClassifier::euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const {
+    double distance = 0.0;
+    for (size_t i = 0; i < a.size(); ++i) {
+        double diff = a[i] - b[i];
+        distance += diff * diff;
+    }
+    return std::sqrt(distance);
+}
+
+int KNNClassifier::predict_sample(const std::vector<double>& x) const {
+    // Vector to store distances and corresponding labels
+    std::vector<std::pair<double, int>> distances;
+    distances.reserve(X_train.size());
+
+    // Compute distances to all training samples
+    for (size_t i = 0; i < X_train.size(); ++i) {
+        double dist = euclidean_distance(x, X_train[i]);
+        distances.emplace_back(dist, y_train[i]);
+    }
+
+    // Guard against k exceeding the number of training samples
+    const int k_eff = std::min(k, static_cast<int>(distances.size()));
+
+    // Partially sort so the k_eff smallest distances come first
+    std::nth_element(distances.begin(), distances.begin() + k_eff, distances.end(),
+                     [](const std::pair<double, int>& a, const std::pair<double, int>& b) {
+                         return a.first < b.first;
+                     });
+
+    // Count the labels of the k_eff nearest neighbors
+    std::unordered_map<int, int> class_counts;
+    for (int i = 0; i < k_eff; ++i) {
+        int label = distances[i].second;
+        class_counts[label]++;
+    }
+
+    // Determine the majority class
+    int max_count = 0;
+    int majority_class = -1;
+    for (const auto& [label, count] : class_counts) {
+        if (count > max_count) {
+            max_count = count;
+            majority_class = label;
+        }
+    }
+
+    return majority_class;
+}
+
+#endif // KNN_CLASSIFIER_HPP
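One caveat worth knowing: iteration order over an unordered_map is unspecified, so when two labels tie in frequency, predict_sample() may resolve the tie differently across standard-library implementations. A deterministic variant (a sketch, not part of this diff) breaks ties toward the smaller label:

    #include <unordered_map>

    // Sketch: majority vote that resolves frequency ties toward the smaller
    // label, making predictions reproducible across runs and toolchains.
    int majority_label(const std::unordered_map<int, int>& class_counts) {
        int best_label = -1;
        int best_count = 0;
        for (const auto& [label, count] : class_counts) {
            if (count > best_count || (count == best_count && label < best_label)) {
                best_count = count;
                best_label = label;
            }
        }
        return best_label;
    }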
diff --git a/ml_library_include/ml/clustering/KNNRegressor.hpp b/ml_library_include/ml/clustering/KNNRegressor.hpp
new file mode 100644
index 0000000..094945b
--- /dev/null
+++ b/ml_library_include/ml/clustering/KNNRegressor.hpp
@@ -0,0 +1,117 @@
+#ifndef KNN_REGRESSOR_HPP
+#define KNN_REGRESSOR_HPP
+
+#include <vector>
+#include <cmath>
+#include <algorithm>
+#include <utility>
+
+/**
+ * @file KNNRegressor.hpp
+ * @brief Implementation of the K-Nearest Neighbors Regressor.
+ */
+
+/**
+ * @class KNNRegressor
+ * @brief K-Nearest Neighbors Regressor for regression tasks.
+ */
+class KNNRegressor {
+public:
+    /**
+     * @brief Constructs a KNNRegressor.
+     * @param k The number of neighbors to consider.
+     */
+    explicit KNNRegressor(int k = 3);
+
+    /**
+     * @brief Destructor for KNNRegressor.
+     */
+    ~KNNRegressor();
+
+    /**
+     * @brief Fits the regressor to the training data.
+     * @param X A vector of feature vectors (training data).
+     * @param y A vector of target values (training labels).
+     */
+    void fit(const std::vector<std::vector<double>>& X, const std::vector<double>& y);
+
+    /**
+     * @brief Predicts target values for the given input data.
+     * @param X A vector of feature vectors (test data).
+     * @return A vector of predicted target values.
+     */
+    std::vector<double> predict(const std::vector<std::vector<double>>& X) const;
+
+private:
+    int k; ///< Number of neighbors to consider.
+    std::vector<std::vector<double>> X_train; ///< Training data features.
+    std::vector<double> y_train;              ///< Training data target values.
+
+    /**
+     * @brief Computes the Euclidean distance between two feature vectors.
+     * @param a The first feature vector.
+     * @param b The second feature vector.
+     * @return The Euclidean distance.
+     */
+    double euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const;
+
+    /**
+     * @brief Predicts the target value for a single sample.
+     * @param x The feature vector of the sample.
+     * @return The predicted target value.
+     */
+    double predict_sample(const std::vector<double>& x) const;
+};
+
+KNNRegressor::KNNRegressor(int k) : k(k) {}
+
+KNNRegressor::~KNNRegressor() {}
+
+void KNNRegressor::fit(const std::vector<std::vector<double>>& X, const std::vector<double>& y) {
+    X_train = X;
+    y_train = y;
+}
+
+std::vector<double> KNNRegressor::predict(const std::vector<std::vector<double>>& X) const {
+    std::vector<double> predictions;
+    predictions.reserve(X.size());
+    for (const auto& x : X) {
+        predictions.push_back(predict_sample(x));
+    }
+    return predictions;
+}
+
+double KNNRegressor::euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const {
+    double distance = 0.0;
+    for (size_t i = 0; i < a.size(); ++i) {
+        double diff = a[i] - b[i];
+        distance += diff * diff;
+    }
+    return std::sqrt(distance);
+}
+
+double KNNRegressor::predict_sample(const std::vector<double>& x) const {
+    // Vector to store distances and corresponding target values
+    std::vector<std::pair<double, double>> distances;
+    distances.reserve(X_train.size());
+
+    // Compute distances to all training samples
+    for (size_t i = 0; i < X_train.size(); ++i) {
+        double dist = euclidean_distance(x, X_train[i]);
+        distances.emplace_back(dist, y_train[i]);
+    }
+
+    // Guard against k exceeding the number of training samples
+    const int k_eff = std::min(k, static_cast<int>(distances.size()));
+
+    // Partially sort so the k_eff nearest neighbors come first
+    std::nth_element(distances.begin(), distances.begin() + k_eff, distances.end(),
+                     [](const std::pair<double, double>& a, const std::pair<double, double>& b) {
+                         return a.first < b.first;
+                     });
+
+    // Compute the average of the target values of the k_eff nearest neighbors
+    double sum = 0.0;
+    for (int i = 0; i < k_eff; ++i) {
+        sum += distances[i].second;
+    }
+    return sum / k_eff;
+}
+
+#endif // KNN_REGRESSOR_HPP
diff --git a/ml_library_include/ml/regression/LogisticRegression.hpp b/ml_library_include/ml/regression/LogisticRegression.hpp
index 98fefd7..3c26dae 100644
--- a/ml_library_include/ml/regression/LogisticRegression.hpp
+++ b/ml_library_include/ml/regression/LogisticRegression.hpp
@@ -5,6 +5,11 @@
 #include <vector>
 
 /**
+ * @file LogisticRegression.hpp
+ * @brief A simple implementation of Logistic Regression.
+ */
+/**
+ * @class LogisticRegression
  * @brief Logistic Regression model for binary classification tasks.
  */
 class LogisticRegression {
diff --git a/ml_library_include/ml/regression/MultiLinearRegression.hpp b/ml_library_include/ml/regression/MultiLinearRegression.hpp
index 814ec06..5dfcf4b 100644
--- a/ml_library_include/ml/regression/MultiLinearRegression.hpp
+++ b/ml_library_include/ml/regression/MultiLinearRegression.hpp
@@ -7,6 +7,12 @@
 #include <vector>
 
 /**
+ * @file MultiLinearRegression.hpp
+ * @brief A simple implementation of Multi Linear Regression.
+ */
+
+/**
+ * @class MultilinearRegression
  * @brief A class that implements Multilinear Regression for predicting values
  *        based on multiple features.
  */
 class MultilinearRegression {
diff --git a/ml_library_include/ml/regression/PolynomialRegression.hpp b/ml_library_include/ml/regression/PolynomialRegression.hpp
index 15eb395..92176f8 100644
--- a/ml_library_include/ml/regression/PolynomialRegression.hpp
+++ b/ml_library_include/ml/regression/PolynomialRegression.hpp
@@ -6,6 +6,11 @@
 #include <vector>
 
 /**
+ * @file PolynomialRegression.hpp
+ * @brief A simple implementation of Polynomial Regression.
+ */
+/**
+ * @class PolynomialRegression
  * @brief Polynomial Regression model for fitting polynomial curves.
  */
 class PolynomialRegression {
diff --git a/tests/clustering/HierarchicalClusteringTest.cpp b/tests/clustering/HierarchicalClusteringTest.cpp
new file mode 100644
index 0000000..0460975
--- /dev/null
+++ b/tests/clustering/HierarchicalClusteringTest.cpp
@@ -0,0 +1,61 @@
+#include "../ml_library_include/ml/clustering/HierarchicalClustering.hpp"
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include "../TestUtils.hpp" // Utility file for approxEqual or similar functions
+
+int main() {
+    // Sample dataset with three distinct groups
+    std::vector<std::vector<double>> data = {
+        {1.0, 2.0},   {1.5, 1.8},   {1.0, 0.6},   // Group 1
+        {5.0, 10.0},  {5.5, 10.8},  {5.0, 10.6},  // Group 2
+        {25.0, 72.0}, {24.5, 71.8}, {26.0, 70.6}, // Group 3
+    };
+
+    // Initialize HierarchicalClustering with 3 clusters
+    HierarchicalClustering hc(3, HierarchicalClustering::Linkage::AVERAGE);
+    hc.fit(data);
+
+    // Predict cluster labels
+    std::vector<int> labels = hc.predict();
+
+    // Ensure there are three unique clusters
+    std::vector<size_t> actual_cluster_counts(3, 0);
+    for (const int label : labels) {
+        assert(label >= 0 && label < 3 && "Cluster label out of expected range.");
+        actual_cluster_counts[label]++;
+    }
+
+    // Check that no cluster is empty
+    for (size_t count : actual_cluster_counts) {
+        assert(count > 0 && "One of the clusters is empty.");
+    }
+
+    // Expected cluster centers (the group means, for approximate validation)
+    std::vector<std::vector<double>> expected_centers = {
+        {1.17, 1.47}, {5.17, 10.47}, {25.17, 71.47} // Approximate expected values
+    };
+
+    // Get actual centers and validate against expected centers
+    const auto& centers = hc.get_cluster_centers();
+    bool centers_match = true;
+    std::cout << "Hierarchical Clustering Centers:" << std::endl;
+    for (const auto& center : centers) {
+        std::cout << "Cluster center: (" << center[0] << ", " << center[1] << ")" << std::endl;
+        bool matched = false;
+        for (const auto& expected : expected_centers) {
+            if (approxEqual(center[0], expected[0], 3.0) && approxEqual(center[1], expected[1], 3.0)) {
+                matched = true;
+                break;
+            }
+        }
+        centers_match &= matched;
+    }
+
+    assert(centers_match && "Cluster centers do not match expected locations within tolerance.");
+
+    // Inform user of successful test
+    std::cout << "Hierarchical Clustering Test passed." << std::endl;
+
+    return 0;
+}
diff --git a/tests/clustering/KMeansClusteringTest.cpp b/tests/clustering/KMeansClusteringTest.cpp
new file mode 100644
index 0000000..a79f0e9
--- /dev/null
+++ b/tests/clustering/KMeansClusteringTest.cpp
@@ -0,0 +1,62 @@
+#include "../ml_library_include/ml/clustering/KMeans.hpp"
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cstddef>
+#include "../TestUtils.hpp"
+
+
+int main() {
+    // Sample dataset with three distinct groups
+    std::vector<std::vector<double>> X = {
+        {1.0, 1.1},   {1.0, 1.2},   {1.0, 1.3},   // group 1
+        {11.0, 12.0}, {12.0, 12.0}, {13.0, 12.0}, // group 2
+        {22.0, 22.0}, {21.0, 22.0}, {23.0, 22.0}  // group 3
+    };
+
+    // Initialize KMeans with 3 clusters
+    KMeans kmeans(3);
+    kmeans.fit(X);
+
+    // Predict cluster labels
+    std::vector<int> labels = kmeans.predict(X);
+
+    // Ensure there are three unique clusters
+    std::vector<size_t> actual_cluster_counts(3, 0);
+    for (const int label : labels) {
+        assert(label >= 0 && label < 3 && "Cluster label out of expected range.");
+        actual_cluster_counts[label]++;
+    }
+
+    // Check that no cluster is empty
+    for (size_t count : actual_cluster_counts) {
+        assert(count > 0 && "One of the clusters is empty.");
+    }
+
+    // Expected cluster centers for reference
+    std::vector<std::vector<double>> expected_centers = {
+        {1.0, 1.2}, {12.0, 12.0}, {22.0, 22.0}
+    };
+
+    // Get actual centers and check each center against any expected center
+    const auto& centers = kmeans.get_cluster_centers();
+    std::cout << "K-Means Cluster Centers:" << std::endl;
+    bool centers_match = true;
+    for (const auto& center : centers) {
+        std::cout << "Cluster center: (" << center[0] << ", " << center[1] << ")" << std::endl;
+        bool matched = false;
+        for (const auto& expected : expected_centers) {
+            if (approxEqual(center[0], expected[0], 1.5) && approxEqual(center[1], expected[1], 1.5)) {
+                matched = true;
+                break;
+            }
+        }
+        centers_match &= matched;
+    }
+
+    assert(centers_match && "Cluster centers do not match expected locations within tolerance.");
+
+    // Inform user of successful test
+    std::cout << "K-Means Clustering Basic Test passed." << std::endl;
+    return 0;
+}
diff --git a/tests/clustering/KNNClassifierTest.cpp b/tests/clustering/KNNClassifierTest.cpp
new file mode 100644
index 0000000..0534709
--- /dev/null
+++ b/tests/clustering/KNNClassifierTest.cpp
@@ -0,0 +1,48 @@
+#include "../ml_library_include/ml/clustering/KNNClassifier.hpp"
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cstddef>
+#include "../TestUtils.hpp"
+
+int main() {
+    // Training data
+    std::vector<std::vector<double>> X_train = {
+        {1.0, 2.0},
+        {1.5, 1.8},
+        {5.0, 8.0},
+        {8.0, 8.0},
+        {1.0, 0.6},
+        {9.0, 11.0}
+    };
+    std::vector<int> y_train = {0, 0, 1, 1, 0, 1};
+
+    // Test data
+    std::vector<std::vector<double>> X_test = {
+        {1.0, 1.0}, // Expected class: 0
+        {8.0, 9.0}, // Expected class: 1
+        {0.0, 0.0}  // Expected class: 0
+    };
+
+    // Expected classes for test data
+    std::vector<int> expected_classes = {0, 1, 0};
+
+    // Create and train the KNN classifier with k = 3
+    KNNClassifier knn(3);
+    knn.fit(X_train, y_train);
+
+    // Make predictions
+    std::vector<int> predictions = knn.predict(X_test);
+
+    // Verify predictions by comparing them with expected values
+    for (size_t i = 0; i < predictions.size(); ++i) {
+        std::cout << "Sample " << i << " predicted class: " << predictions[i]
+                  << ", Expected class: " << expected_classes[i] << std::endl;
+        assert(predictions[i] == expected_classes[i] && "KNN prediction does not match expected class.");
+    }
+
+    // Inform user of successful test
+    std::cout << "KNN Classifier Basic Test passed." << std::endl;
+
+    return 0;
+}
diff --git a/tests/clustering/KNNRegressorTest.cpp b/tests/clustering/KNNRegressorTest.cpp
new file mode 100644
index 0000000..7ba1713
--- /dev/null
+++ b/tests/clustering/KNNRegressorTest.cpp
@@ -0,0 +1,47 @@
+#include "../ml_library_include/ml/clustering/KNNRegressor.hpp"
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <cstddef>
+#include "../TestUtils.hpp"
+
+
+int main() {
+    // Training data
+    std::vector<std::vector<double>> X_train = {
+        {1.0},
+        {2.0},
+        {3.0},
+        {4.0},
+        {5.0}
+    };
+    std::vector<double> y_train = {2.0, 3.0, 4.0, 5.0, 6.0};
+
+    // Test data
+    std::vector<std::vector<double>> X_test = {
+        {1.5}, // Expected output ~2.5
+        {2.5}, // Expected output ~3.5
+        {3.5}  // Expected output ~4.5
+    };
+    std::vector<double> expected_values = {2.5, 3.5, 4.5};
+
+    // Create and train the KNN regressor with k = 2
+    KNNRegressor knn(2);
+    knn.fit(X_train, y_train);
+
+    // Make predictions
+    std::vector<double> predictions = knn.predict(X_test);
+
+    // Verify predictions by comparing them with expected values
+    for (size_t i = 0; i < predictions.size(); ++i) {
+        std::cout << "Sample " << i << " predicted value: " << predictions[i]
+                  << ", Expected value: " << expected_values[i] << std::endl;
+        assert(approxEqual(predictions[i], expected_values[i], 0.1) &&
+               "KNN regression prediction does not match expected value.");
+    }
+
+    // Inform user of successful test
+    std::cout << "KNN Regressor Basic Test passed." << std::endl;
+
+    return 0;
+}
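The tests above include a shared ../TestUtils.hpp that is not part of this diff. Based on how approxEqual is called (two values plus an absolute tolerance), a minimal sketch of that helper might look like the following; the exact signature in the repository may differ:

    #ifndef TEST_UTILS_HPP
    #define TEST_UTILS_HPP

    #include <cmath>

    // Hypothetical helper matching the calls in the tests above: true when
    // a and b differ by no more than the given absolute tolerance.
    inline bool approxEqual(double a, double b, double tolerance = 1e-6) {
        return std::fabs(a - b) <= tolerance;
    }

    #endif // TEST_UTILS_HPP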