diff --git a/README.md b/README.md
index 5b6a0599ba..f527551438 100644
--- a/README.md
+++ b/README.md
@@ -94,13 +94,91 @@ A demo of computing graph kernels can be found on [Google Colab](https://colab.r
 
 ### 2 Graph Edit Distances
 
+We currently provide a `GEDModel` class compatible with the `scikit-learn` transformer interface,
+which can be used to compute the graph edit distance between attributed graphs.
+The `GEDModel` class is based on the extended [`GEDLIB`](https://github.com/dbblumenthal/gedlib) library. See Section
+[GEDLIB](#4-interface-to-gedlib) for more details.
+
+#### The following GED methods are supported:
+
+- BRANCH
+- BRANCH_FAST
+- BRANCH_TIGHT
+- BRANCH_UNIFORM
+- BRANCH_COMPACT
+- PARTITION
+- HYBRID
+- RING
+- ANCHOR_AWARE_GED
+- WALKS
+- IPFP
+- BIPARTITE
+- SUBGRAPH
+- NODE
+- RING_ML
+- BIPARTITE_ML
+- REFINE
+- BP_BEAM
+- SIMULATED_ANNEALING
+- HED
+- STAR
+
+With `GUROBI`, the following methods are also available:
+
+- F1
+- F2
+- COMPACT_MIP
+- BLP_NO_EDGE_LABELS
+
+#### The following GED cost functions are supported:
+
+- CHEM_1
+- CHEM_2
+- CMU
+- GREC_1
+- GREC_2
+- PROTEIN
+- FINGERPRINT
+- LETTER
+- LETTER2
+  - Similar to `LETTER`, but uses 6 cost constants instead of 3. See details [here](https://github.com/jajupmochi/gedlib/blob/master/src/edit_costs/letter_2.hpp).
+- NON_SYMBOLIC
+  - Edit costs for graphs containing only non-symbolic (numeric) node and edge
+    labels. These labels are used to compute relabeling (substitution) costs, using,
+    e.g., the Euclidean distance. See details [here](https://github.com/jajupmochi/gedlib/blob/master/src/edit_costs/non_symbolic.hpp#L35).
+- GEOMETRIC
+  - Edit costs for graphs containing mixed node and edge attributes (e.g., string (symbolic) and numeric (non-symbolic)).
+    Users can choose the (dis-)similarity measure for each label type, e.g.,
+    `cosine_distance` for numeric vectors. See details [here](https://github.com/jajupmochi/gedlib/blob/master/src/edit_costs/geometric.hpp#L42).
+- CONSTANT
+
+Detailed documentation can be found [here](https://dbblumenthal.github.io/gedlib/index.html).
+
 ### 3 Graph preimage methods
 
 A demo of generating graph preimages can be found on [Google Colab](https://colab.research.google.com/drive/1PIDvHOcmiLEQ5Np3bgBDdu0kLOquOMQK?usp=sharing) and in the [`examples`](https://github.com/jajupmochi/graphkit-learn/blob/master/gklearn/examples/median_preimege_generator.py) folder.
 
 ### 4 Interface to `GEDLIB`
 
-[`GEDLIB`](https://github.com/dbblumenthal/gedlib) is an easily extensible C++ library for (suboptimally) computing the graph edit distance between attributed graphs. [A Python interface](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/gedlib) for `GEDLIB` is integrated in this library, based on [`gedlibpy`](https://github.com/Ryurin/gedlibpy) library.
+[`GEDLIB`](https://github.com/dbblumenthal/gedlib) is an easily extensible C++ library for (suboptimally) computing the
+graph edit distance between attributed graphs. [A Python interface](https://github.com/jajupmochi/graphkit-learn/tree/master/gklearn/gedlib) for `GEDLIB` is
+integrated in this library, based on the [`gedlibpy`](https://github.com/Ryurin/gedlibpy) library. We also extended the
+library, adding the following features:
+
+- Support attributed graphs with the following node and edge label types:
+  - strings, integers, floats, and lists / `numpy` arrays of floats and integers. An
+    arbitrary number of features can be added.
+
+- Support fast vectorized computation between labels using `Eigen` (e.g., cosine or
+  Euclidean distances).
+  - To benefit from this, we recommend merging numeric labels into
+    a single label with a `numpy` array.
+
+- Support the following GED cost functions:
+  - `LETTER2`, `NON_SYMBOLIC`, `GEOMETRIC`.
+  - See Section [GED](#2-graph-edit-distances) for more details.
+
+- Use modern C++17 features, such as `std::optional`, `std::variant`, and `std::any`.
 
 ### 5 Computation optimization methods
 
diff --git a/gklearn/experiments/ged/check_results_of_ged_env.py b/gklearn/experiments/ged/check_results_of_ged_env.py
index 7c81c5d4af..bc7d5c0648 100644
--- a/gklearn/experiments/ged/check_results_of_ged_env.py
+++ b/gklearn/experiments/ged/check_results_of_ged_env.py
@@ -79,7 +79,7 @@ def compute_geds_by_GEDEnv(dataset):
 
 
 def compute_geds_by_GEDLIB(dataset):
-	from gklearn.gedlib import librariesImport, gedlibpy
+	from gklearn.gedlib import libraries_import, gedlibpy
 	from gklearn.ged.util import ged_options_to_string
 	import numpy as np
 
diff --git a/gklearn/experiments/ged/ged_model/compare_gedlib_with_coords_in_string_and_attr_format.py b/gklearn/experiments/ged/ged_model/compare_gedlib_with_coords_in_string_and_attr_format.py
index e90930c34d..f4a0f023b6 100644
--- a/gklearn/experiments/ged/ged_model/compare_gedlib_with_coords_in_string_and_attr_format.py
+++ b/gklearn/experiments/ged/ged_model/compare_gedlib_with_coords_in_string_and_attr_format.py
@@ -401,9 +401,9 @@ def compare_gedlib_with_coords_in_string_and_attr_format(
 	seed = 42
 	n_graphs = 500
 	n_emb_dim = 100
-	parellel = True
+	parallel = True
 
 	compare_gedlib_with_coords_in_string_and_attr_format(
-		seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parellel
+		seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel
 	)
 
 	# # Comparison of the two versions:
diff --git a/gklearn/experiments/ged/ged_model/ged_model_global_env.py b/gklearn/experiments/ged/ged_model/ged_model_global_env.py
new file mode 100644
index 0000000000..553db6dd4f
--- /dev/null
+++ b/gklearn/experiments/ged/ged_model/ged_model_global_env.py
@@ -0,0 +1,1649 @@
+"""
+ged_model_global_env.py
+
+The GEDModel class using a GEDEnv as a global environment inside the class, for testing purposes.
+
+@Author: jajupmochi
+@Date: June 4 2025
+"""
+import multiprocessing
+import os
+import sys
+import time
+from contextlib import contextmanager
+from functools import partial
+from itertools import combinations, product
+from multiprocessing import shared_memory
+
+import networkx as nx
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.exceptions import NotFittedError
+from sklearn.utils.validation import check_is_fitted
+from tqdm import tqdm
+
+from gklearn.experiments.ged.ged_model.profile_ged_model import INFO_TAG
+from gklearn.ged.model.distances import euclid_d
+from gklearn.ged.util.util import ged_options_to_string
+from gklearn.utils import get_iters
+
+
+class GEDModel(BaseEstimator):  # , ABC):
+	"""The graph edit distance model class compatible with `scikit-learn`.
+
+	Attributes
+	----------
+	_graphs : list
+		Stores the input graphs passed to `fit`.
+		Default format of the list objects is `NetworkX` graphs.
+		**We don't guarantee that the input graphs remain unchanged during the computation.**
+
+	Notes
+	-----
+	This class uses the `gedlibpy` module to compute the graph edit distance.
+
+	References
+	----------
+	https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
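+
+	Examples
+	--------
+	A minimal usage sketch (assuming `graphs` is a list of `networkx` graphs whose
+	labels match `node_labels` / `edge_labels`; the exact options may vary):
+
+	>>> model = GEDModel(
+	...     ed_method='BIPARTITE', edit_cost_fun='CONSTANT', optim_method='init',
+	...     init_edit_cost_constants=[3, 3, 1, 3, 3, 1]
+	... )
+	>>> dm_train = model.fit_transform(graphs[:50])  # shape (50, 50), symmetric
+	>>> dm_test = model.transform(graphs[50:60])     # shape (10, 50)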
+ """ + + + def __init__( + self, + env_type: str | None = None, + ed_method='BIPARTITE', + edit_cost_fun='CONSTANT', + init_edit_cost_constants=[3, 3, 1, 3, 3, 1], + edit_cost_config: dict = {}, + optim_method='init', + optim_options={'y_distance': euclid_d, 'mode': 'reg'}, + ged_init_options: dict | None = None, + node_labels=[], + edge_labels=[], + parallel=None, + n_jobs=None, + chunksize=None, + # normalize=True, + copy_graphs=True, # make sure it is a full deep copy. and faster! + verbose=2 + ): + """`__init__` for `GEDModel` object. + + Parameters + ---------- + env_type : str, optional + The type of the GED environment. Default is None. If None, try to determine + the type automatically based on the given graph node / edge labels. + + Available types are: + + - 'attr': Attribute-based environment (with complex node and edge labels). + Each node or edge can have multiple key-value label pairs, and each value can + be of the following types: int, float, str, list/np.ndarray of int or float. + This is the default type if no node or edge labels are provided. + + - 'gxl' or 'str': GXLLabel environment (with string labels). Each node or + edge can have multiple key-value label pairs, but all values must be strings. + The type will be set to GXL only if at least one node or edge label is + provided. + """ + # @todo: the default settings of the parameters are different from those in the self.compute method. + # self._graphs = None + self.env_type = env_type + self.ed_method = ed_method + self.edit_cost_fun = edit_cost_fun + self.init_edit_cost_constants = init_edit_cost_constants + self.edit_cost_config = edit_cost_config + self.optim_method = optim_method + self.optim_options = optim_options + self.ged_init_options = ged_init_options + self.node_labels = node_labels + self.edge_labels = edge_labels + self.parallel = parallel + self.n_jobs = ((multiprocessing.cpu_count() - 1) if n_jobs is None else n_jobs) + self.chunksize = chunksize + # self.normalize = normalize + self.copy_graphs = copy_graphs + self.verbose = verbose + + self._ged_env = None # The GED environment to use for the model. + self._graphs = None # The input graphs to the model. + self._is_transformed = False # Whether the model has been transformed. + self._run_time = 0 # The run time of the last computation. + self._Y = None # The target graphs for the model. + self._dm_train = None # The distance matrix of the training data. + self._dm_test = None # The distance matrix of the test data. + self._edit_cost_constants = None # The edit cost constants for the model. + self._X_diag = None # The diagonal of the metric matrix for the training data (0's in this case). + self._Y_diag = None # The diagonal of the metric matrix for the test data (0's in this case). + self._targets = None # The targets for the model, if any. + + self.env_stats = {} # Store environment stats for the model. + + + # self._run_time = 0 + # self._gram_matrix = None + # self._gram_matrix_unnorm = None + + ########################################################################## + # The following is the 1st paradigm to compute GED distance matrix, which is + # compatible with `scikit-learn`. + ########################################################################## + + def fit(self, X, y=None, **kwargs): + """Fit a graph dataset for a transformer. + + Parameters + ---------- + X : iterable + DESCRIPTION. + + y : None, optional + There is no need of a target in a transformer, yet the `scikit-learn` + pipeline API requires this parameter. 
+
+		kwargs : dict, optional
+			Additional parameters for the transformer. The following parameters can
+			be included: ``repeats`` (int, default 1): the number of times each GED
+			computation is repeated, keeping the best (smallest) distance.
+
+		Returns
+		-------
+		object
+			Returns self.
+
+		"""
+		# self._is_transformed = False
+
+		# Clear any prior attributes stored on the estimator,  # @todo: unless warm_start is used;
+		self.clear_attributes()
+
+		# Validate parameters for the transformer.
+		self.validate_parameters()
+
+		# Validate the input.
+		self._graphs = self.validate_input(X)
+		if y is not None:
+			self._targets = y
+		# self._targets = self.validate_input(y)
+
+		# Compute edit cost constants.
+		self.compute_edit_costs(**kwargs)
+
+		# Create the GED environment if not set:
+		# Only do this if no parallelization will be used. Otherwise, a separate GEDEnv will be
+		# created in each worker in transforming.
+		# todo: we plan to refactor this in the future for better performance.
+		if self.parallel is None:
+			# `self._edit_cost_constants` is needed from `self.compute_edit_costs` to initialize the
+			# GED environment.
+			self._ged_env, env_setting_time = self.create_and_setup_ged_env(
+				self.env_type, graph=X[0],
+				**{
+					'ed_method': self.ed_method,
+					'edit_cost_fun': self.edit_cost_fun,
+					'edit_cost_constants': self._edit_cost_constants,
+					'edit_cost_config': self.edit_cost_config,
+				}
+			)
+			self.env_stats['env_setting_time'] = env_setting_time
+			# Add graphs to the environment:
+			graphs_adding_time = self.add_graphs_to_ged_env(
+				self._graphs, self._ged_env, self.verbose, **{'copy_graphs': self.copy_graphs}
+			)
+			self.env_stats['graphs_adding_time'] = graphs_adding_time
+
+		# self._X = X
+		# self._kernel = self._get_kernel_instance()
+
+		# Return the transformer.
+		return self
+
+
+	def transform(
+			self,
+			X=None,
+			return_dm_train=False,
+			save_dm_test=False,
+			return_dm_test=False,
+			**kwargs
+	):
+		"""Compute the GED distance matrix between the given and the fitted data.
+
+		Parameters
+		----------
+		X : list of graphs
+			The target graphs.
+
+		Raises
+		------
+		ValueError
+			If the input is not a non-empty list of graphs.
+
+		Returns
+		-------
+		dis_matrix : numpy array, shape = [len(X), len(self._graphs)]
+			The computed distance matrix.
+
+		"""
+		# If `return_dm_train`, return the fitted GED distance matrix of training data.
+		if return_dm_train:
+			check_is_fitted(self, '_dm_train')
+			self._is_transformed = True
+			return self._dm_train  # @TODO: copy or not?
+
+		if return_dm_test:
+			check_is_fitted(self, '_dm_test')
+			return self._dm_test  # @TODO: copy or not?
+
+		# Check if method "fit" had been called.
+		check_is_fitted(self, '_graphs')
+
+		# Validate the input.
+		Y = self.validate_input(X)
+
+		# Transform: compute the GED distance matrix.
+		dis_matrix = self.compute_distance_matrix(Y, **kwargs)
+		self._Y = Y
+
+		# Self transform must appear before the diagonal call on normalization.
+		self._is_transformed = True  # @TODO: When to set this to True? When return dm test?
+		# if self.normalize:
+		# 	X_diag, Y_diag = self.diagonals()
+		# 	old_settings = np.seterr(invalid='raise')  # Catch FloatingPointError: invalid value encountered in sqrt.
+		# 	try:
+		# 		kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag))
+		# 	except:
+		# 		raise
+		# 	finally:
+		# 		np.seterr(**old_settings)
+
+		if save_dm_test:
+			self._dm_test = dis_matrix
+		# If the model is retransformed and the `save_dm_test` flag is not set,
+		# then remove the previously computed dm_test to prevent conflicts.
+		else:
+			if hasattr(self, '_dm_test'):
+				delattr(self, '_dm_test')
+
+		return dis_matrix
+
+
+	def fit_transform(
+			self,
+			X,
+			y=None,
+			save_dm_train=False,
+			save_mm_train: bool = False,
+			**kwargs
+	):
+		"""Fit and transform: compute GED distance matrix on the same data.
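+
+		Unlike calling ``fit`` followed by ``transform``, this computes the
+		symmetric self-distance matrix of X directly: each unordered pair of
+		graphs is computed once and mirrored.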
+
+		Parameters
+		----------
+		X : list of graphs
+			Input graphs.
+
+		Returns
+		-------
+		dis_matrix : numpy array, shape = [len(X), len(X)]
+			The distance matrix of X.
+
+		"""
+		self.fit(X, y, **kwargs)
+
+		# Transform: compute the GED distance matrix.
+		dis_matrix = self.compute_distance_matrix(**kwargs)
+
+		# # Normalize.
+		# if self.normalize:
+		# 	self._X_diag = np.diagonal(gram_matrix).copy()
+		# 	old_settings = np.seterr(invalid='raise')  # Catch FloatingPointError: invalid value encountered in sqrt.
+		# 	try:
+		# 		gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag))
+		# 	except:
+		# 		raise
+		# 	finally:
+		# 		np.seterr(**old_settings)
+
+		if save_mm_train or save_dm_train:
+			self._dm_train = dis_matrix
+		# If the model is refitted and the `save_dm_train` flag is not set, then
+		# remove the previously computed dm_train to prevent conflicts.
+		else:
+			if hasattr(self, '_dm_train'):
+				delattr(self, '_dm_train')
+
+		return dis_matrix
+
+
+	def get_params(self):
+		pass
+
+
+	def set_params(self):
+		pass
+
+
+	def clear_attributes(self):  # @todo: update
+		# if hasattr(self, '_X_diag'):
+		# 	delattr(self, '_X_diag')
+		if hasattr(self, '_graphs'):
+			delattr(self, '_graphs')
+		if hasattr(self, '_Y'):
+			delattr(self, '_Y')
+		if hasattr(self, '_run_time'):
+			self._run_time = 0
+		if hasattr(self, '_test_run_time'):
+			delattr(self, '_test_run_time')
+
+
+	def validate_parameters(self):
+		"""Validate all parameters for the transformer.
+
+		Returns
+		-------
+		None.
+
+		"""
+		if self.parallel == False:
+			self.parallel = None
+		elif self.parallel == True:
+			self.parallel = 'imap_unordered'
+		if self.parallel is not None and self.parallel not in [
+			'imap_unordered', 'multiprocessing', 'joblib', 'concurrent'
+		]:
+			raise ValueError('Parallel mode is not set correctly.')
+
+		if self.parallel == 'imap_unordered' and self.n_jobs is None:
+			self.n_jobs = multiprocessing.cpu_count()
+
+
+	def validate_input(self, X):
+		"""Validate the given input and raise errors if it is invalid.
+
+		Parameters
+		----------
+		X : list
+			The input to check. Should be a list of graphs.
+
+		Raises
+		------
+		ValueError
+			Raised if the input is not correct.
+
+		Returns
+		-------
+		X : list
+			The validated input: a list of graphs.
+
+		"""
+		if X is None:
+			raise ValueError('Please add graphs before computing.')
+		elif not isinstance(X, list):
+			raise ValueError('Cannot detect graphs. The input must be a list.')
+		elif len(X) == 0:
+			raise ValueError(
+				'The graph list given is empty. No computation will be performed.'
+			)
+
+		return X
+
+
+	def compute_distance_matrix(self, Y=None, **kwargs):
+		"""Compute the distance matrix between given target graphs (Y) and
+		the fitted graphs (X / self._graphs), or the distance matrix for the fitted
+		graphs (X / self._graphs) themselves.
+
+		Parameters
+		----------
+		Y : list of graphs, optional
+			The target graphs. The default is None. If None, the distance matrix
+			is computed between X and itself.
+
+		Returns
+		-------
+		dis_matrix : numpy array, shape = [n_targets, n_inputs]
+			The computed distance matrix.
+
+		"""
+		if Y is None:
+			# Compute metric matrix for self._graphs (X).
+			dis_matrix = self._compute_self_distance_matrix(**kwargs)
+
+		else:
+			# This will be done when loading the graphs into the GEDEnv.
+			# # Compute metric matrix between Y and self._graphs (X).
+ # Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y) + # graphs_copy = ( + # [g.copy() for g in self._graphs] + # if self.copy_graphs else self._graphs + # ) + + dis_matrix = self._compute_cross_distance_matrix(Y, **kwargs) + + return dis_matrix + + + def diagonals(self): + """Compute the kernel matrix diagonals of the fit/transformed data. + + Returns + ------- + X_diag : numpy array + The diagonal of the kernel matrix between the fitted data. + This consists of each element calculated with itself. + + Y_diag : numpy array + The diagonal of the kernel matrix, of the transform. + This consists of each element calculated with itself. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, ['_graphs']) + + # Check if the diagonals of X exist. + try: + check_is_fitted(self, ['_X_diag']) + except NotFittedError: + # Compute diagonals of X. + self._X_diag = np.empty(shape=(len(self._graphs),)) + graphs = ([g.copy() for g in + self._graphs] if self.copy_graphs else self._graphs) + for i, x in enumerate(graphs): + self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? + + try: + # If transform has happened, return both diagonals. + check_is_fitted(self, ['_Y']) + self._Y_diag = np.empty(shape=(len(self._Y),)) + Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) + for (i, y) in enumerate(Y): + self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? + + return self._X_diag, self._Y_diag + except NotFittedError: + # Else just return both X_diag + return self._X_diag + + + # @abstractmethod + def pairwise_distance(self, x, y): + """Compute pairwise kernel between two graphs. + + Parameters + ---------- + x, y : NetworkX Graph. + Graphs bewteen which the kernel is computed. + + Returns + ------- + kernel: float + The computed kernel. + +# Notes +# ----- +# This method is abstract and must be implemented by a subclass. + + """ + raise NotImplementedError( + 'Pairwise kernel computation is not implemented!' + ) + + + def compute_edit_costs(self, Y=None, Y_targets=None, **kwargs): + # todo: this function is not optimized to use global environment. + """Compute edit cost constants. When optimizing method is `fiited`, + apply Jia2021's metric learning method by using a given target graphs (Y) + the fitted graphs (X / self._graphs). + + Parameters + ---------- + Y : TYPE, optional + DESCRIPTION. The default is None. + + Returns + ------- + None. + + """ + # Get or compute. + if self.optim_method == 'random': + self._edit_cost_constants = np.random.rand(6) + + elif self.optim_method == 'init': + self._edit_cost_constants = self.init_edit_cost_constants + + elif self.optim_method == 'expert': + self._edit_cost_constants = [3, 3, 1, 3, 3, 1] + + elif self.optim_method == 'fitted': # Jia2021 method + # Get proper inputs. + if Y is None: + check_is_fitted(self, ['_graphs']) + check_is_fitted(self, ['_targets']) + graphs = ([g.copy() for g in + self._graphs] if self.copy_graphs else self._graphs) + targets = self._targets + else: + graphs = ([g.copy() for g in Y] if self.copy_graphs else Y) + targets = Y_targets + + # Get optimization options. 
+ node_labels = self.node_labels + edge_labels = self.edge_labels + unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0) + repeats = kwargs.get('repeats', 1) + from gklearn.ged.model.optim_costs import compute_optimal_costs + self._edit_cost_constants = compute_optimal_costs( + graphs, targets, + node_labels=node_labels, edge_labels=edge_labels, + unlabeled=unlabeled, + init_costs=self.init_edit_cost_constants, + ed_method=self.ed_method, + edit_cost_fun=self.edit_cost_fun, + repeats=repeats, + rescue_optim_failure=False, + verbose=(self.verbose >= 2), + **self.optim_options + ) + + + # %% Self distance matrix computation methods: + + def _compute_self_distance_matrix(self, **kwargs): + # # Useless if graphs were loaded into GEDEnv beforehand: + # graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + + start_time = time.time() + + # if self.parallel == 'imap_unordered': + # dis_matrix = self._compute_X_dm_imap_unordered(graphs, **kwargs) + if self.parallel in ['imap_unordered', 'joblib', 'concurrent', 'multiprocessing']: + dis_matrix = self._compute_self_distance_matrix_parallel(**kwargs) + elif self.parallel is None: + # dis_matrix = self._compute_X_dm_series(graphs, **kwargs) + dis_matrix = self._compute_self_distance_matrix_series(**kwargs) + else: + raise Exception('Parallel mode is not set correctly.') + + self._run_time += time.time() - start_time + + if self.verbose: + print( + 'Distance matrix of size %d built in %s seconds.' + % (len(self._graphs), self._run_time) + ) + + return dis_matrix + + + def _compute_self_distance_matrix_series(self, **kwargs): + # We put the initialization of the GED environment here for these reasons: + # 1. To process the computation of costs between labels separately for series and parallel mode. + # 2. To include the time of this initialization in the total run time. + # 3. For cross distance matrix, target graphs (Y) need to be added to the environment. + eager_label_cost_computing_time = self.init_ged_env_and_method( + self._ged_env, **{'ged_init_options': self.ged_init_options} + ) + if eager_label_cost_computing_time is not None: + self.env_stats['eger_label_cost_computing_time'] = eager_label_cost_computing_time + + graph_ids = self._ged_env.get_all_graph_ids() + n = len(graph_ids) + if n != len(self._graphs): + raise ValueError( + f'Number of graphs in the GEDEnv ({n}) does not match ' + f'number of input graphs in the GEDModel ({len(self._graphs)}).' + ) + + dis_matrix = np.zeros((n, n)) + iterator = combinations(range(n), 2) + len_itr = int(n * (n - 1) / 2) + if self.verbose: + print('Graphs in total: %d.' % n) + print('The total # of pairs is %d.' % len_itr) + self.env_stats['ged_computing_time'] = [] + for i, j in get_iters( + iterator, desc='Computing distance matrix', + file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + ): + gid1, gid2 = graph_ids[i], graph_ids[j] + dis_matrix[i, j], stats = GEDModel.pairwise_ged_with_gids( + gid1, gid2, self._ged_env, self._graphs, **kwargs + ) + dis_matrix[j, i] = dis_matrix[i, j] + self.env_stats['ged_computing_time'].append(stats['ged_computing_time']) + return dis_matrix + + + # todo: this is not refactored yet. + def _compute_self_distance_matrix_parallel(self, **kwargs): + """ + Highly optimized parallelized version of distance matrix computation between graphs. + + Parameters: + ----------- + graphs : list + List of graph objects to compute pairwise distances + n_jobs : int, default=-1 + Number of parallel jobs. -1 means using all available cores. 
+ chunk_size : int, default=None + Number of tasks per chunk. If None, will be auto-calculated. + memory_limit : str or int, default='auto' + Memory limit per worker in MB or 'auto' to determine automatically. + method : str, default='joblib' + Parallelization backend: 'joblib', 'concurrent', or 'multiprocessing' + + Returns: + -------- + np.ndarray + Distance matrix of shape (n, n) + """ + n = len(self._graphs) + + # Get all pairs of indices + pairs = list(combinations(range(n), 2)) + len_itr = len(pairs) + + n_jobs = self.n_jobs + chunksize = self.chunksize + method = self.parallel + memory_limit = kwargs.get('memory_limit', 'auto') + + if self.verbose: + print('Graphs in total: %d.' % n) + print('The total # of pairs is %d.' % len_itr) + + # Determine the number of processes: + if n_jobs == -1: + n_jobs = os.cpu_count() - 1 + n_jobs = min(n_jobs, os.cpu_count(), len_itr) + + # Auto-calculate optimal chunk size if not provided + if chunksize is None: + # # this seems to be slightly faster when using `test_ged_model.py` + # # with 100 graphs (0.0012 s vs 0.0016 s per pair). Yet gets slower with + # # larger number of graphs (e.g., 1000) (~ 31 mins vs ~ 40 mins in total). + # if len_itr < 100 * n_jobs: + # chunksize = int(len_itr / n_jobs) + 1 + # else: + # chunksize = 100 + + # Balancing chunk size: larger chunks reduce overhead but limit load balancing + # A good heuristic is sqrt(len_itr / n_jobs) * 4 + chunksize = max(1, int(np.sqrt(len_itr / n_jobs) * 4)) + + if self.verbose >= 2: + print( + f"Running with {n_jobs} parallel processes and chunk size of {chunksize}" + ) + + # # For networkx graphs, we need to use a Manager to share them between processes: + # with Manager() as manager: + # # Create a managed shared list for the graphs + # # todo: + # # 1. This operation will serialize the graphs, which will make a deep copy of each graph, + # # so it is not efficient. + # # + # # 2. When using multiprocessing.Manager to share graphs, a separate manager process is launched + # # to hold the shared objects. Accessing these shared graphs from other processes involves + # # serialization (pickling), inter-process communication (IPC), and deserialization (unpickling), + # # which can be very costly for large NetworkX graphs. + # # + # # In contrast, if we use per-process global variables initialized via init_worker(), + # # each process gets a local copy of the graph data, which avoids the IPC overhead, + # # but requires duplicating memory (one full copy per worker). + # # + # # To compare the overheads: + # # - Using a Manager: + # # -- Every access to a graph (e.g., shared_graphs[i]) involves: pickle → IPC → unpickle. + # # -- Graphs are not truly shared in memory; they are proxied through the manager process. + # # Since we then create GEDEnv graphs, so no more pickling is needed. + # # + # # - Using global variables in worker init: + # # -- Graphs are copied once to each worker during process start (via memory fork or pickle). + # # -- After that, all access is purely local (no IPC, no further serialization). + # # -- This is faster at runtime but uses more memory. + # # + # # 3. Since manager uses a proxy object, it may cause issues when trying to modify + # # the graphs. 
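+		# For these reasons, this implementation avoids a Manager altogether: each
+		# worker process builds its own GEDEnv (with its own copy of the graphs) in
+		# the pool initializer below, and only (i, j, distance, stats) tuples are
+		# exchanged between processes.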
+ # shared_graphs = manager.list(self._graphs) + + # Get a function reference to compute_ged that can be pickled + # Using a Python trick to make the instance method picklable + compute_ged_func = partial(GEDModel.pairwise_ged_with_gids_parallel, **kwargs) + + # Create a shared memory array for results + with numpy_shared_memory((n, n), dtype=np.float64) as (dis_matrix, shm_name): + + # Create a partial function with fixed arguments - must use module-level function + worker = partial( + self._process_pair_worker, + shm_name=shm_name, + matrix_shape=(n, n), + compute_ged_func=compute_ged_func, + **kwargs + ) + + try: + # Three different parallelization options for different scenarios + if method == 'joblib': + raise NotImplementedError( + 'Joblib parallelization is not implemented yet. ' + 'Please use "multiprocessing".' + ) + + elif method == 'concurrent': + # Option 2: ProcessPoolExecutor - cleaner API, slightly faster for CPU-bound tasks + # Use thread instead of the process to support shared memory for pre-created + # Cython objects: + raise NotImplementedError( + 'concurrent parallelization is not implemented yet. ' + 'Please use "multiprocessing".' + ) + # if self.verbose >= 2: + # print(f'Using ThreadPoolExecutor.') + # + # with ThreadPoolExecutor(max_workers=n_jobs) as executor: + # futures = [executor.submit(worker, pair) for pair in pairs] + # + # # Track progress if verbose + # if self.verbose >= 2: + # results = [] + # # When `as_completed` is used, the order of results is not guaranteed: + # for f in tqdm( + # as_completed(futures), total=len(futures), + # # futures, total=len(futures), + # desc='Computing distance matrix', file=sys.stdout + # ): + # results.append(f.result()) + # else: + # results = [f.result() for f in as_completed(futures)] + + # # This does not guarantee the order of results: + # self.env_stats['ged_computing_time'] = [ + # stats['ged_computing_time'] for _, _, _, stats in results + # ] + + elif method in ['imap_unordered' or 'multiprocessing']: + # Option 3: multiprocessing.Pool with imap_unordered - more control, classic approach + # Does not work with pre-created GEDEnv Cython objects: + # TypeError: no default __reduce__ due to non-trivial __cinit__ + # So create a GEDEnv for each worker during the initialization. + # todo: maybe it is better to + # parallelize directly in C++ with pybind with e.g., openmp + + if self.verbose >= 2: + print(f'Using multiprocessing imap_unordered.') + + init_kwargs = { + 'ed_method': self.ed_method, + 'edit_cost_fun': self.edit_cost_fun, + 'edit_cost_constants': self._edit_cost_constants, + 'edit_cost_config': self.edit_cost_config, + 'ged_init_options': self.ged_init_options, + 'copy_graphs': False, + # Do not copy graphs here, they are already copied in the worker + } + + + # todo: we can actually control the part of graphs that each worker will process, + # but it is not worth the effort for now. 
+ def init_worker_self_metric_matrix(graphs): + """Initialize each worker process with a GED environment""" + global g_ged_env # <- This will be created for each worker + global g_graphs + global g_env_stats + global g_stats_reported + g_graphs = graphs # Set the graphs for the worker + g_stats_reported = False # Reset the stats reported flag + ( + g_ged_env, env_setting_time, graphs_adding_time, + eager_label_cost_computing_time + ) = GEDModel.create_and_init_ged_env_for_parallel(g_graphs, **init_kwargs) + g_env_stats = { + 'env_setting_time': env_setting_time, + 'graphs_adding_time': graphs_adding_time + } + if eager_label_cost_computing_time is not None: + g_env_stats[ + 'eger_label_cost_computing_time'] = eager_label_cost_computing_time + + + with multiprocessing.Pool( + processes=n_jobs, initializer=init_worker_self_metric_matrix, + initargs=(self._graphs,) + ) as pool: + if self.verbose >= 2: + results = list( + tqdm( + pool.imap_unordered(worker, pairs, chunksize=chunksize), + total=len_itr, + desc='Computing distance matrix', + file=sys.stdout + ) + ) + else: + results = list( + pool.imap_unordered(worker, pairs, chunksize=chunksize) + ) + + stats = [stats for _, _, _, stats, _ in results] + init_stats = [init_stats for _, _, _, _, init_stats in results if + init_stats is not None] + if len(init_stats) != n_jobs: + raise ValueError( + f'Number of init_stats ({len(init_stats)}) does not match ' + f'number of workers ({n_jobs}).' + ) + else: + print(f'Number of init_stats: {len(init_stats)}.') + for s in init_stats: + for k, v in s.items(): + if f'{k}_parallel' not in self.env_stats: + self.env_stats[f'{k}_parallel'] = [] + self.env_stats[f'{k}_parallel'].append(v) + # print(stats) + for s in stats: + for k, v in s.items(): + if f'{k}_parallel' not in self.env_stats: + self.env_stats[f'{k}_parallel'] = [] + self.env_stats[f'{k}_parallel'].append(v) + # print(self.env_stats) + + else: + raise ValueError( + f"Unsupported parallelization method: {method}." + ) + + # Copy the result from shared memory to a regular numpy array + result = dis_matrix.copy() + + except Exception as e: + # Make sure we log any errors that occur during parallel execution + if self.verbose: + print(f"Error during parallel execution: {e}.") + raise + + # At this point, the Manager will automatically clean up shared resources + + return result + + + @staticmethod + def _process_pair_worker( + pair, shm_name, matrix_shape, + compute_ged_func, **kwargs + ): + """Worker function that processes a pair of graphs and updates the shared matrix. 
Must be defined at module level to be picklable."""
+		# # test only:
+		# print(f'[{multiprocessing.current_process().name}] Processing pair: {pair}.')
+
+		i, j = pair
+
+		try:
+			# Access the shared memory
+			existing_shm = shared_memory.SharedMemory(name=shm_name)
+			shared_matrix = np.ndarray(
+				matrix_shape, dtype=np.float64, buffer=existing_shm.buf
+			)
+
+			# Compute distance using the function reference
+			distance, stats, init_stats = compute_ged_func(i, j, **kwargs)
+
+			# Update the matrix
+			shared_matrix[i, j] = distance
+			shared_matrix[j, i] = distance
+
+		finally:
+			# Clean up local shared memory reference
+			if 'existing_shm' in locals():
+				existing_shm.close()
+
+		return i, j, distance, stats, init_stats  # Return for progress tracking
+
+
+	# %% Cross distance matrix computation methods:
+
+
+	def _compute_cross_distance_matrix(self, graphs_t: list[nx.Graph], **kwargs):
+		start_time = time.time()
+
+		if self.parallel in ['imap_unordered', 'joblib', 'concurrent', 'multiprocessing']:
+			dis_matrix = self._compute_distance_matrix_parallel_unified(
+				self._graphs, graphs_t, **kwargs
+			)
+
+		elif self.parallel is None:
+			dis_matrix = self._compute_cross_distance_matrix_series(
+				self._graphs, graphs_t, **kwargs
+			)
+		else:
+			raise Exception('Parallel mode is not set correctly.')
+
+		self._run_time += time.time() - start_time
+
+		if self.verbose:
+			print(
+				'Distance matrix of size (%d, %d) built in %s seconds.'
+				% (len(graphs_t), len(self._graphs), self._run_time)
+			)
+
+		return dis_matrix
+
+
+	def _compute_cross_distance_matrix_series(
+			self, graphs_f: list[nx.Graph], graphs_t: list[nx.Graph], **kwargs):
+		"""Compute the GED distance matrix between two sets of graphs (X and Y)
+		without parallelization.
+
+		Parameters
+		----------
+		graphs_f : list of graphs
+			The fitted graphs (X / self._graphs).
+
+		graphs_t : list of graphs
+			The target graphs (Y).
+
+		Returns
+		-------
+		dis_matrix : numpy array, shape = [n_Y, n_X]
+			The computed distance matrix.
+		"""
+		# Add graphs to the environment:
+		graphs_adding_time = self.add_graphs_to_ged_env(
+			graphs_t, self._ged_env, self.verbose, **{'copy_graphs': self.copy_graphs}
+		)
+		self.env_stats['graphs_adding_time'] += graphs_adding_time
+		# We put the initialization of the GED environment here for these reasons:
+		# 1. To process the computation of costs between labels separately for series and parallel mode.
+		# 2. To include the time of this initialization in the total run time.
+		# 3. For cross distance matrix, target graphs (Y) need to be added to the environment.
+		eager_label_cost_computing_time = self.init_ged_env_and_method(
+			self._ged_env, **{'ged_init_options': self.ged_init_options}
+		)
+		if eager_label_cost_computing_time is not None:
+			self.env_stats['eger_label_cost_computing_time'] = eager_label_cost_computing_time
+
+		n_f = len(graphs_f)
+		n_t = len(graphs_t)
+		n_graphs_in_env = self._ged_env.get_num_graphs()
+		if n_graphs_in_env != n_f + n_t:
+			raise ValueError(
+				f'Number of graphs in the GEDEnv ({n_graphs_in_env}) does not match '
+				f'the total number of fitted and target graphs in the GEDModel ({n_f} + {n_t} = {n_f + n_t}).'
+ ) + + # Initialize distance matrix with zeros + dis_matrix = np.zeros((n_t, n_f)) + iterator = product(range(n_f), range(n_t)) + len_itr = n_f * n_t + if self.verbose: + print(f'Computing distances between {n_t} and {n_f} graphs.') + print(f'The total # of pairs is {len_itr}.') + + self.env_stats['ged_computing_time'] = [] + for i_f, j_t in get_iters( + iterator, desc='Computing distance matrix', + file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + ): + gid_f, gid_t = i_f, j_t + dis_matrix[j_t, i_f], stats = self.pairwise_ged_with_gids( + gid_f, gid_t, self._ged_env, graphs_f + graphs_t, **kwargs + ) + self.env_stats['ged_computing_time'].append(stats['ged_computing_time']) + + return dis_matrix + + + def _compute_distance_matrix_parallel_unified( + self, graphs_f, graphs_t: nx.Graph | None = None, **kwargs + ): + """Compute the GED distance matrix between two sets of graphs (X and Y) + with parallelization. + + Parameters + ---------- + graphs_f : list of graphs + The fitted graphs (X). + + graphs_t : list of graphs + The target graphs (Y). If None, the distance is computed between + the fitted graphs (X) and itself. + + + Returns + ------- + dis_matrix : numpy array, shape = [n_Y, n_X] + The computed distance matrix. + + References + ---------- + This method is written with the help of the Claude 3.7 Sonnet AI, accessed on 2025.05.15. + + todo: this can be merged with the _compute_X_dm_parallel method. + """ + # Handle the case where graphs2 is not provided + is_same_set = graphs_t is None + if is_same_set: + graphs_t = graphs_f + + n_f = len(graphs_f) + n_t = len(graphs_t) + + # Get all pairs of indices to compute + if is_same_set: + # Only compute the upper triangular portion for efficiency when comparing within same set + pairs = list(combinations(range(n_f), 2)) + else: + # Compute all pairs when comparing between different sets: + # Notice this has different order (fiited / col first) as the matrix (target / row first): + pairs = list(product(range(n_f), range(n_t))) + + len_itr = len(pairs) + + n_jobs = self.n_jobs + chunksize = self.chunksize + method = self.parallel + # memory_limit = kwargs.get('memory_limit', 'auto') + + if self.verbose: + if is_same_set: + print(f'Graphs in total: {n_f}.') + else: + print(f'Computing distances between {n_t} and {n_f} graphs.') + print(f'The total # of pairs is {len_itr}.') + + # Determine the number of workers: + if n_jobs == -1 or n_jobs is None: + n_jobs = os.cpu_count() - 1 + n_jobs = min(n_jobs, os.cpu_count(), len_itr) + + # Auto-calculate optimal chunk size if not provided + if chunksize is None: + # # this seems to be slightly faster when using `test_ged_model.py` + # # with 100 graphs (0.0012 s vs 0.0016 s per pair). Yet gets slower with + # # larger number of graphs (e.g., 1000) (~ 31 mins vs ~ 40 mins in total). + # if len_itr < 100 * n_jobs: + # chunksize = int(len_itr / n_jobs) + 1 + # else: + # chunksize = 100 + + # Balancing chunk size: larger chunks reduce overhead but limit load balancing + # A good heuristic is sqrt(len_itr / n_jobs) * 4 + chunksize = max(1, int(np.sqrt(len_itr / n_jobs) * 4)) + + if self.verbose >= 2: + print( + f"Running with {n_jobs} parallel processes and chunk size of {chunksize}..." 
+ ) + + # Get a function reference to compute_ged that can be pickled + # Using a Python trick to make the instance method picklable + compute_ged_func = partial( + GEDModel.pairwise_ged_with_gids_parallel, is_same_set=is_same_set, **kwargs + ) + + # Create a shared memory array for results + with numpy_shared_memory((n_t, n_f), dtype=np.float64) as (dis_matrix, shm_name): + # Create a partial function with fixed arguments - MUST NOT use + # inline function here, as it won't be picklable: + worker = partial( + self._process_pair_worker_unified, + shm_name=shm_name, + matrix_shape=(n_t, n_f), + compute_ged_func=compute_ged_func, + is_same_set=is_same_set, + **kwargs + ) + + try: + # Three different parallelization options for different scenarios + if method == 'joblib': + raise NotImplementedError( + 'Joblib parallelization is not implemented yet. ' + 'Please use "multiprocessing".' + ) + + elif method == 'concurrent': + # Option 2: ProcessPoolExecutor - cleaner API, slightly faster for CPU-bound tasks + raise NotImplementedError( + 'concurrent parallelization is not implemented yet. ' + 'Please use "multiprocessing".' + ) + + elif method in ['imap_unordered' or 'multiprocessing']: + # Option 3: multiprocessing.Pool with imap_unordered - more control, classic approach + if self.verbose >= 2: + print(f'Using multiprocessing imap_unordered.') + + init_kwargs = { + 'ed_method': self.ed_method, + 'edit_cost_fun': self.edit_cost_fun, + 'edit_cost_constants': self._edit_cost_constants, + 'edit_cost_config': self.edit_cost_config, + 'ged_init_options': self.ged_init_options, + 'copy_graphs': False, + # Do not copy graphs here, they are already copied in the worker + } + + + def init_worker_cross_metric_matrix(graphs_f, graphs_t): + """Initialize each worker process with a GED environment""" + global g_ged_env # <- This will be created for each worker + global g_graphs_f + global g_graphs_t + global g_env_stats + global g_stats_reported + g_graphs_f = graphs_f # Set the graphs for the worker + g_graphs_t = graphs_t + g_stats_reported = False # Reset the stats reported flag + ( + g_ged_env, env_setting_time, graphs_adding_time, + eager_label_cost_computing_time + ) = GEDModel.create_and_init_ged_env_for_parallel( + g_graphs_f + g_graphs_t, **init_kwargs + ) + g_env_stats = { + 'env_setting_time': env_setting_time, + 'graphs_adding_time': graphs_adding_time + } + if eager_label_cost_computing_time is not None: + g_env_stats[ + 'eger_label_cost_computing_time'] = eager_label_cost_computing_time + + + with multiprocessing.Pool( + processes=n_jobs, initializer=init_worker_cross_metric_matrix, + initargs=(graphs_f, graphs_t,) + ) as pool: + if self.verbose >= 2: + results = list( + tqdm( + pool.imap_unordered(worker, pairs, chunksize=chunksize), + total=len_itr, + desc='Computing distance matrix', + file=sys.stdout + ) + ) + else: + results = list(pool.imap_unordered(worker, pairs, chunksize=chunksize)) + + stats = [stats for _, _, _, stats, _ in results] + init_stats = [ + init_stats for _, _, _, _, init_stats in results if + init_stats is not None + ] + if len(init_stats) != n_jobs: + raise ValueError( + f'Number of init_stats ({len(init_stats)}) does not match ' + f'number of workers ({n_jobs}).' 
+ ) + else: + print(f'Number of init_stats: {len(init_stats)}.') + for s in init_stats: + for k, v in s.items(): + if f'{k}_parallel' not in self.env_stats: + self.env_stats[f'{k}_parallel'] = [] + self.env_stats[f'{k}_parallel'].append(v) + # print(stats) + for s in stats: + for k, v in s.items(): + if f'{k}_parallel' not in self.env_stats: + self.env_stats[f'{k}_parallel'] = [] + self.env_stats[f'{k}_parallel'].append(v) + # print(self.env_stats) + + else: + raise ValueError( + f"Unsupported parallelization method: {method}." + ) + + # Copy the result from shared memory to a regular numpy array + result = dis_matrix.copy() + + except Exception as e: + # Make sure we log any errors that occur during parallel execution + if self.verbose: + print(f"Error during parallel execution: {e}.") + raise + + # At this point, the Manager will automatically clean up shared resources + + return result + + + # %% Parallelization methods: + + @staticmethod + def create_and_init_ged_env_for_parallel(graphs: list[nx.Graph], **kwargs): + """Create and initialize a GED environment for parallel processing.""" + # Create a new GEDEnv instance for each worker + ged_env, env_setting_time = GEDModel.create_and_setup_ged_env(graph=graphs[0], **kwargs) + # print(f'[{multiprocessing.current_process().name}] ') + # print(ged_env) + + # Add all graphs to the environment: + graphs_adding_time = GEDModel.add_graphs_to_ged_env(graphs, ged_env, verbose=0, **kwargs) + # print('fnished adding graphs to the GEDEnv in worker.') + # print(ged_env.get_all_graph_ids()) + eager_label_cost_computing_time = GEDModel.init_ged_env_and_method(ged_env, **kwargs) + + graph_ids = ged_env.get_all_graph_ids() + n = len(graph_ids) + if n != len(graphs): + raise ValueError( + f'Number of graphs in the GEDEnv ({n}) does not match ' + f'number of graphs set from GEDModel to the worker ({len(graphs)}).' + ) + return ged_env, env_setting_time, graphs_adding_time, eager_label_cost_computing_time + + + # @staticmethod + # def _thread_pair_worker(pair, distance_matrix, compute_ged_func, **kwargs): + # """ + # Worker function that processes a pair of graphs and updates the shared matrix. + # Used for processing pre-created Cython objects (GEDEnv) in threads (with shared memory). + # Please make sure that the Cython objects are thread-safe!!! + # """ + # # # test only: + # # print(f'[{threading.current_thread().name}] Processing pair: {pair}.') + # # # Sleep for 1 second to simulate work: + # # time.sleep(1) + # + # i, j = pair + # + # # Compute distance using the function reference + # distance, stats = compute_ged_func(i, j, **kwargs) + # # Update the matrix + # distance_matrix[i, j] = distance + # distance_matrix[j, i] = distance + # + # return i, j, distance, stats # Return for progress tracking + + @staticmethod + def _process_pair_worker_unified( + pair, shm_name, matrix_shape, compute_ged_func, is_same_set=True, **kwargs + ): + """Worker function that processes a pair of graphs and updates the shared matrix. 
Must be defined at module level to be picklable."""
+		i_f, j_t = pair  # Indices of the fitted and target graphs in the original lists in GEDModel
+
+		try:
+			# Access the shared memory
+			existing_shm = shared_memory.SharedMemory(name=shm_name)
+			shared_matrix = np.ndarray(matrix_shape, dtype=np.float64, buffer=existing_shm.buf)
+
+			# Compute distance using the function reference
+			distance, stats, init_stats = compute_ged_func(i_f, j_t, **kwargs)
+
+			# Update the matrix
+			shared_matrix[j_t, i_f] = distance
+
+			# If computing within the same set, update the symmetric position:
+			if is_same_set and i_f != j_t:
+				shared_matrix[i_f, j_t] = distance
+
+		finally:
+			# Clean up local shared memory reference
+			if 'existing_shm' in locals():
+				existing_shm.close()
+
+		return i_f, j_t, distance, stats, init_stats  # Return for progress tracking
+
+
+	@staticmethod
+	def pairwise_ged_with_gids_parallel(
+			graph_id_f: int, graph_id_t: int, is_same_set: bool = True, **kwargs
+	):
+		global g_ged_env  # <- Use the global GEDEnv created in the worker initializer
+		if is_same_set:
+			global g_graphs
+			graphs1, graphs2 = g_graphs, None
+		else:
+			global g_graphs_f, g_graphs_t
+			graphs1, graphs2 = g_graphs_f, g_graphs_t
+
+		dis, stats = GEDModel.pairwise_ged_with_gids(
+			graph_id_f, graph_id_t, g_ged_env, graphs1,
+			is_same_set=is_same_set, graphs2=graphs2, **kwargs
+		)
+
+		global g_stats_reported
+		# print(g_stats_reported)
+		if not g_stats_reported:
+			# Report the stats only once per worker
+			g_stats_reported = True
+			global g_env_stats
+			return dis, stats, g_env_stats
+		else:
+			return dis, stats, None  # Return None for env_stats if already reported
+
+
+	# %% GEDEnv related methods:
+
+	@staticmethod
+	def get_env_type(graph: nx.Graph | None = None):
+		"""
+		Determine the environment type from the given graph.
+		If `env_type` was set on model initialization, this function is not called;
+		otherwise, the given graph's node and edge labels determine the type.
+
+		Only one node and one edge are checked to determine the type.
+		This function expects that all nodes have the same type of labels, as do all
+		edges.
+		"""
+		if graph is None:
+			raise ValueError(
+				'Graph is not provided while `env_type` is not set on initialization. '
+				'Cannot determine the environment type.'
+			)
+		# Use the 'gxl' env type only if all node and edge labels are strings, and at least one
+		# node or edge label is present:
+		one_n_labels = graph.nodes[list(graph.nodes)[0]]
+		for k, v in one_n_labels.items():
+			if not isinstance(v, str):
+				return 'attr'
+		if nx.number_of_edges(graph) != 0:
+			one_e_labels = graph.edges[list(graph.edges)[0]]
+			for k, v in one_e_labels.items():
+				if not isinstance(v, str):
+					return 'attr'
+		if len(one_n_labels) > 0 or (
+				nx.number_of_edges(graph) != 0 and len(one_e_labels) > 0
+		):
+			return 'gxl'
+		return 'attr'
+
+
+	@staticmethod
+	def create_and_setup_ged_env(env_type: str | None = None, graph: nx.Graph | None = None, **kwargs):
+		"""
+		Create and set up the GED environment.
+
+		Notes
+		-----
+		`GEDEnv.init()` and `GEDEnv.init_method()` must be called after all graphs are added
+		to the GEDEnv. They are not called here.
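+
+		Examples
+		--------
+		A minimal sketch (assuming `g` is a `networkx` graph; the keyword arguments
+		mirror the corresponding `GEDModel` parameters):
+
+		>>> env, t_setup = GEDModel.create_and_setup_ged_env(
+		...     'attr', graph=g,
+		...     ed_method='BIPARTITE', edit_cost_fun='CONSTANT',
+		...     edit_cost_constants=[3, 3, 1, 3, 3, 1], edit_cost_config={},
+		... )
+
+		Graphs must then be added via `add_graphs_to_ged_env` and the environment
+		initialized via `init_ged_env_and_method` before running any GED method.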
+ """ + env_setting_time = time.time() + + from gklearn.gedlib import gedlibpy + + if env_type is None: + env_type = GEDModel.get_env_type(graph=graph) + ged_options = { + 'env_type': env_type, + 'edit_cost': kwargs['edit_cost_fun'], + 'method': kwargs['ed_method'], + 'edit_cost_constants': kwargs['edit_cost_constants'], + 'edit_cost_config': kwargs['edit_cost_config'], + } + + ged_env = gedlibpy.GEDEnv(env_type=ged_options.get('env_type', 'attr'), verbose=False) + ged_env.set_edit_cost( + ged_options['edit_cost'], + edit_cost_constant=ged_options['edit_cost_constants'], + **ged_options.get('edit_cost_config') and { + 'edit_cost_config': ged_options['edit_cost_config'] + } or {} + ) + + ged_env.set_method(ged_options['method'], ged_options_to_string(ged_options)) + + env_setting_time = time.time() - env_setting_time + + return ged_env, env_setting_time + + + @staticmethod + def add_graphs_to_ged_env(graphs: list[nx.Graph], ged_env, verbose: int = 1, **kwargs): + # `init()` and `init_method()` must be called after all graphs are added to the GEDEnv. + + iterator = enumerate(graphs) + if verbose >= 2: + iterator = tqdm( + iterator, desc='Adding graphs to the GED environment', + file=sys.stdout, total=len(graphs) + ) + graphs_adding_time = [] + for i, g in iterator: + graph_adding_start_time = time.time() + GEDModel.add_graph_to_ged_env(g.copy() if kwargs['copy_graphs'] else g, ged_env=ged_env) + graphs_adding_time.append(time.time() - graph_adding_start_time) + + return graphs_adding_time + + + @staticmethod + def add_graph_to_ged_env(graph: nx.Graph, ged_env): + ged_env.add_nx_graph(graph, '', ignore_duplicates=True) + + + @staticmethod + def init_ged_env_and_method(ged_env, **kwargs): + # `init()` must be called after all graphs are added to the GEDEnv: + # todo: determine which is faster: lazy or eager. Maybe do this automatically. + # (eager can not show progress bar): + init_options = 'LAZY_WITHOUT_SHUFFLED_COPIES' if kwargs['ged_init_options'] is None else \ + kwargs['ged_init_options'] + if init_options.startswith('EAGER_'): + eager_label_cost_computing_time = time.time() + print(f'{INFO_TAG}Starting eager label cost computing. This may take a while...') + ged_env.init(init_options) + print(f'{INFO_TAG}Eager label cost computing finished.') + eager_label_cost_computing_time = time.time() - eager_label_cost_computing_time + else: + ged_env.init(init_options) + eager_label_cost_computing_time = None + # `init_method()` must be called after `init()`: + ged_env.init_method() + return eager_label_cost_computing_time + + + @staticmethod + def pairwise_ged_with_gids( + graph_id1: int, graph_id2: int, ged_env, graphs: list[nx.Graph], + is_same_set: bool = True, graphs2: list[nx.Graph] | None = None, **kwargs + ): + """ + Compute pairwise GED between two graphs using their IDs in the GEDEnv. + + This method uses the GEDEnv member globally available in the class. + + Parameters + ---------- + graph_id1 : int + ID of the first graph in the GEDEnv. If `is_same_set` is False, it refers to the fitted + (reference) graph. + + graph_id2 : int + ID of the second graph in the GEDEnv. If `is_same_set` is False, it refers to the target + graph. + + Notes + ----- + - Be careful with the order between `graph_id1` and `graph_id2`. When `is_same_set` = False, + `graph_id1` is the fitted (reference) graph and `graph_id2` is the target graph. 
+
+		Todo
+		----
+		- Since the computed GED is not necessarily symmetric, maybe add an option to
+		  compute the average of the two distances (forward and backward), or the
+		  minimum of the two.
+		"""
+		ged_computing_time = time.time()
+
+		repeats = kwargs.get('repeats', 1)
+
+		dis_min = np.inf
+
+		if is_same_set:
+			graph_id2_env = graph_id2
+		else:
+			graph_id2_env = len(graphs) + graph_id2  # Both graph lists were added to the GEDEnv.
+
+		for i in range(0, repeats):
+			ged_env.run_method(graph_id1, graph_id2_env)
+			upper = ged_env.get_upper_bound(graph_id1, graph_id2_env)
+			dis = upper
+			# print(dis)
+			if dis < dis_min:
+				dis_min = dis
+				pi_forward = ged_env.get_forward_map(graph_id1, graph_id2_env)
+				pi_backward = ged_env.get_backward_map(graph_id1, graph_id2_env)
+			# lower = ged_env.get_lower_bound(g, h)
+
+		ged_computing_time = time.time() - ged_computing_time
+
+		# Convert the node maps to original node IDs (node removals / insertions are
+		# mapped to np.inf):
+		if is_same_set:
+			g1, g2 = graphs[graph_id1], graphs[graph_id2]
+		else:
+			g1, g2 = graphs[graph_id1], graphs2[graph_id2]
+		nodes1 = [n for n in g1.nodes()]
+		nodes2 = [n for n in g2.nodes()]
+		nb1 = nx.number_of_nodes(g1)
+		nb2 = nx.number_of_nodes(g2)
+		pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
+		pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
+		# print(pi_forward)
+
+		stats = {
+			'ged_computing_time': ged_computing_time
+		}
+
+		# @TODO: Better to have a if here.
+		# if self.compute_n_eo:
+		# 	n_eo_tmp = get_nb_edit_operations(
+		# 		Gi, Gj, pi_forward, pi_backward,
+		# 		edit_cost=self.edit_cost_fun,
+		# 		node_labels=self.node_labels, edge_labels=self.edge_labels
+		# 	)
+		# else:
+		# 	n_eo_tmp = None
+		# return dis, n_eo_tmp
+		# Return the smallest distance found over the repeats:
+		return dis_min, stats
+
+
+	# %%
+
+	def is_graph(self, graph):
+		if isinstance(graph, nx.Graph):
+			return True
+		if isinstance(graph, nx.DiGraph):
+			return True
+		if isinstance(graph, nx.MultiGraph):
+			return True
+		if isinstance(graph, nx.MultiDiGraph):
+			return True
+		return False
+
+
+	def __repr__(self):
+		return (
+			f"{self.__class__.__name__}("
+			f"optim_method={self.optim_method}, "
+			f"ed_method={self.ed_method}, "
+			f"edit_cost_fun={self.edit_cost_fun}, "
+			f"node_labels={self.node_labels}, "
+			f"edge_labels={self.edge_labels}, "
+			f"optim_options={self.optim_options}, "
+			f"init_edit_cost_constants={self.init_edit_cost_constants}, "
+			f"copy_graphs={self.copy_graphs}, "
+			f"parallel={self.parallel}, "
+			f"n_jobs={self.n_jobs}, "
+			f"verbose={self.verbose}, "
+			+ (f"normalize={self.normalize}, " if hasattr(self, 'normalize') else "")
+			+ f"run_time={self.run_time}"
+			f")"
+		)
+
+
+	@property
+	def graphs(self):
+		return self._graphs
+
+
+	# @property
+	# def parallel(self):
+	# 	return self.parallel
+
+	# @property
+	# def n_jobs(self):
+	# 	return self.n_jobs
+
+	# @property
+	# def verbose(self):
+	# 	return self.verbose
+
+	# @property
+	# def normalize(self):
+	# 	return self.normalize
+
+	@property
+	def run_time(self):
+		return self._run_time
+
+
+	@property
+	def test_run_time(self):
+		return self._test_run_time
+
+
+	@property
+	def dis_matrix(self):
+		return self._dm_train
+
+
+	@dis_matrix.setter
+	def dis_matrix(self, value):
+		self._dm_train = value
+
+
+	@property
+	def metric_matrix(self):
+		return self._dm_train
+
+
+	@metric_matrix.setter
+	def metric_matrix(self, value):
+		self._dm_train = value
+
+
+	@property
+	def edit_cost_constants(self):
+		return self._edit_cost_constants
+
+
+	# @property
+	# def gram_matrix_unnorm(self):
+	# 	return self._gram_matrix_unnorm
+
+	# 
@gram_matrix_unnorm.setter + # def gram_matrix_unnorm(self, value): + # self._gram_matrix_unnorm = value + + @property + def n_pairs(self): + """ + The number of pairs of graphs between which the GEDs are computed. + """ + try: + check_is_fitted(self, '_dm_train') + return len(self._dm_train) * (len(self._dm_train) - 1) / 2 + except NotFittedError: + return None + + +# Context manager for shared memory with automatic cleanup +@contextmanager +def numpy_shared_memory(shape, dtype=np.float64): + """Create a numpy array in shared memory that automatically cleans up.""" + size = int(np.prod(shape)) * np.dtype(dtype).itemsize + shm = shared_memory.SharedMemory(create=True, size=size) + try: + array = np.ndarray(shape, dtype=dtype, buffer=shm.buf) + array.fill(0) # Initialize with zeros + yield array, shm.name + finally: + shm.close() + shm.unlink() diff --git a/gklearn/experiments/ged/ged_model/ged_model_parallel.py b/gklearn/experiments/ged/ged_model/ged_model_parallel.py index 7841ffd380..0c570457fd 100644 --- a/gklearn/experiments/ged/ged_model/ged_model_parallel.py +++ b/gklearn/experiments/ged/ged_model/ged_model_parallel.py @@ -1,5 +1,7 @@ """ -basic +ged_model_parallel.py + +The parallel version of the GEDModel for testing. Local GEDEnv is used. @Author: jajupmochi @Date: May 22 2025 @@ -9,6 +11,7 @@ import os import sys import time +import warnings from concurrent.futures import ProcessPoolExecutor from contextlib import contextmanager from functools import partial @@ -24,11 +27,10 @@ from tqdm import tqdm from gklearn.ged.model.distances import euclid_d -from gklearn.ged.util import pairwise_ged +from gklearn.ged.util.util import ged_options_to_string from gklearn.utils import get_iters -# @TODO: it should be faster if creating a global env variable. class GEDModel(BaseEstimator): # , ABC): """The graph edit distance model class compatible with `scikit-learn`. @@ -59,6 +61,7 @@ def __init__( edit_cost_config: dict = {}, optim_method='init', optim_options={'y_distance': euclid_d, 'mode': 'reg'}, + ged_init_options=None, node_labels=[], edge_labels=[], parallel=None, @@ -107,6 +110,8 @@ def __init__( self.copy_graphs = copy_graphs self.verbose = verbose + self.pairwise_stats = [] # Store pairwise stats for each pair of graphs. + # self._run_time = 0 # self._gram_matrix = None @@ -562,6 +567,11 @@ def _compute_X_distance_matrix(self, **kwargs): def _compute_X_dm_series(self, graphs, **kwargs): n = len(graphs) dis_matrix = np.zeros((n, n)) + if self.pairwise_stats: + warnings.warn( + '`pairwise_stats` is not empty. Cleaning it for the new computation.' + ) + self.pairwise_stats = [] iterator = combinations(range(n), 2) len_itr = int(n * (n - 1) / 2) @@ -573,8 +583,9 @@ def _compute_X_dm_series(self, graphs, **kwargs): file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr ): g1, g2 = graphs[i], graphs[j] - dis_matrix[i, j], _ = self.compute_ged(g1, g2, **kwargs) + dis_matrix[i, j], stats = self.compute_ged(g1, g2, **kwargs) dis_matrix[j, i] = dis_matrix[i, j] + self.pairwise_stats.append(stats) return dis_matrix @@ -640,6 +651,12 @@ def _compute_X_dm_parallel(self, graphs, **kwargs): f"Running with {n_jobs} parallel processes and chunk size of {chunksize}" ) + if self.pairwise_stats: + warnings.warn( + '`pairwise_stats` is not empty. Cleaning it for the new computation.' 
+ ) + self.pairwise_stats = [] + # For networkx graphs, we need to use a Manager to share them between processes with Manager() as manager: # Create a managed shared list for the graphs @@ -654,33 +671,6 @@ def _compute_X_dm_parallel(self, graphs, **kwargs): dis_matrix, shm_name ): - # # Define worker function that updates the shared matrix directly - # def process_pair(pair): - # i, j = pair - # g1, g2 = graphs[i], graphs[j] - # - # try: - # # Access the shared memory - # existing_shm = shared_memory.SharedMemory(name=shm_name) - # shared_matrix = np.ndarray( - # (n, n), dtype=np.float64, buffer=existing_shm.buf - # ) - # - # # Compute distance - use graph indices to avoid serializing graphs - # distance = self.compute_ged(g1, g2, **kwargs) - # - # # Update the matrix with thread/process-safe approach - # # We're only writing to unique cells so no locking needed - # shared_matrix[i, j] = distance - # shared_matrix[j, i] = distance - # - # finally: - # # Clean up local shared memory reference - # if 'existing_shm' in locals(): - # existing_shm.close() - # - # return i, j, distance # Return for progress tracking - # Create a partial function with fixed arguments - must use module-level function worker = partial( self._process_pair_worker, @@ -764,6 +754,8 @@ def _compute_X_dm_parallel(self, graphs, **kwargs): ) ) + self.pairwise_stats = [stats for _, _, _, stats in results] + else: raise ValueError( f"Unsupported parallelization method: {method}." @@ -778,7 +770,7 @@ def _compute_X_dm_parallel(self, graphs, **kwargs): print(f"Error during parallel execution: {e}.") raise - # At this point, the Manager will automatically clean up shared resources + # At this point, the Manager will automatically clean up shared resources return result @@ -804,7 +796,7 @@ def _process_pair_worker( ) # Compute distance using the function reference - distance, _ = compute_ged_func(g1, g2, **kwargs) + distance, stats = compute_ged_func(g1, g2, **kwargs) # Update the matrix shared_matrix[i, j] = distance @@ -815,7 +807,7 @@ def _process_pair_worker( if 'existing_shm' in locals(): existing_shm.close() - return i, j, distance # Return for progress tracking + return i, j, distance, stats # Return for progress tracking def _compute_cross_distance_matrix_series(self, graphs1, graphs2, **kwargs): @@ -1049,7 +1041,7 @@ def _compute_cross_distance_matrix_parallel( print(f"Error during parallel execution: {e}.") raise - # At this point, the Manager will automatically clean up shared resources + # At this point, the Manager will automatically clean up shared resources return result @@ -1075,7 +1067,7 @@ def _process_pair_worker_cross( ) # Compute distance using the function reference - distance, _ = compute_ged_func(g1, g2, **kwargs) + distance, stats = compute_ged_func(g1, g2, **kwargs) # Update the matrix shared_matrix[i, j] = distance @@ -1092,14 +1084,6 @@ def _process_pair_worker_cross( return i, j, distance # Return for progress tracking - def _wrapper_compute_ged(self, itr): - i = itr[0] - j = itr[1] - # @TODO: repeats are not considered here. - dis, _ = self.compute_ged(G_gn[i], G_gn[j]) - return i, j, dis - - def compute_ged(self, Gi, Gj, **kwargs): """ Compute GED between two graphs according to edit_cost. 
@@ -1113,7 +1097,7 @@ def compute_ged(self, Gi, Gj, **kwargs): 'edit_cost_config': self.edit_cost_config, } repeats = kwargs.get('repeats', 1) - dis, pi_forward, pi_backward = pairwise_ged( + dis, pi_forward, pi_backward, stats = pairwise_ged( Gi, Gj, ged_options, repeats=repeats ) # @TODO: Better to have a if here. @@ -1126,7 +1110,7 @@ def compute_ged(self, Gi, Gj, **kwargs): # else: # n_eo_tmp = None # return dis, n_eo_tmp - return dis, None + return dis, stats def get_env_type(self, graph: nx.Graph | None = None): @@ -1164,44 +1148,6 @@ def get_env_type(self, graph: nx.Graph | None = None): return 'attr' - - # def _compute_kernel_list(self, g1, g_list): - # start_time = time.time() - - # if self.parallel == 'imap_unordered': - # kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) - # elif self.parallel is None: - # kernel_list = self._compute_kernel_list_series(g1, g_list) - # else: - # raise Exception('Parallel mode is not set correctly.') - - # self._run_time = time.time() - start_time - # if self.verbose: - # print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' - # % (len(g_list), self._run_time)) - - # return kernel_list - - # def _compute_kernel_list_series(self, g1, g_list): - # pass - - # def _compute_kernel_list_imap_unordered(self, g1, g_list): - # pass - - # def _compute_single_kernel(self, g1, g2): - # start_time = time.time() - - # kernel = self._compute_single_kernel_series(g1, g2) - - # self._run_time = time.time() - start_time - # if self.verbose: - # print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time)) - - # return kernel - - # def _compute_single_kernel_series(self, g1, g2): - # pass - def is_graph(self, graph): if isinstance(graph, nx.Graph): return True @@ -1216,23 +1162,21 @@ def is_graph(self, graph): def __repr__(self): return ( - f"{self.__class__.__name__}(" - f"optim_method={self.optim_method}, " - f"ed_method={self.ed_method}, " - f"edit_cost_fun={self.edit_cost_fun}, " - f"node_labels={self.node_labels}, " - f"edge_labels={self.edge_labels}, " - f"optim_options={self.optim_options}, " - f"init_edit_cost_constants={self.init_edit_cost_constants}, " - f"copy_graphs={self.copy_graphs}, " - f"parallel={self.parallel}, " - f"n_jobs={self.n_jobs}, " - f"verbose={self.verbose}, " - f"normalize={self.normalize}, " if hasattr( - self, 'normalize' - ) else "" - f"run_time={self.run_time}" - f")" + f"{self.__class__.__name__}(" + f"optim_method={self.optim_method}, " + f"ed_method={self.ed_method}, " + f"edit_cost_fun={self.edit_cost_fun}, " + f"node_labels={self.node_labels}, " + f"edge_labels={self.edge_labels}, " + f"optim_options={self.optim_options}, " + f"init_edit_cost_constants={self.init_edit_cost_constants}, " + f"copy_graphs={self.copy_graphs}, " + f"parallel={self.parallel}, " + f"n_jobs={self.n_jobs}, " + f"verbose={self.verbose}, " + + (f"normalize={self.normalize}, " if hasattr(self, 'normalize') else "") + + f"run_time={self.run_time}" + f")" ) @@ -1303,7 +1247,7 @@ def edit_cost_constants(self): @property def n_pairs(self): """ - The number of pairs of graphs between which the GEDs are computed. + The number of graph pairs between which the GEDs are computed. 
""" try: check_is_fitted(self, '_dm_train') @@ -1312,11 +1256,6 @@ def n_pairs(self): return None -def _init_worker_ged_mat(gn_toshare): - global G_gn - G_gn = gn_toshare - - # Context manager for shared memory with automatic cleanup @contextmanager def numpy_shared_memory(shape, dtype=np.float64): @@ -1332,3 +1271,93 @@ def numpy_shared_memory(shape, dtype=np.float64): shm.unlink() +def pairwise_ged( + g1, g2, options={}, sort=True, repeats=1, parallel=False, verbose=True +): + """Compute the graph edit distance between two graphs using the gedlib library + with repeats. + + Notes + ----- + - For methods such as BIPARTITE, the repeats may result same results. + - # of edit operations are not computed in this method. + """ + from gklearn.gedlib import gedlibpy + + env_setting_time, graphs_adding_time, ged_computing_time = 0, 0, 0 + + total_time = time.time() + start_time = total_time + + ged_env = gedlibpy.GEDEnv(env_type=options.get('env_type', 'attr'), verbose=False) + ged_env.set_edit_cost( + options['edit_cost'], + edit_cost_constant=options['edit_cost_constants'], + **options.get('edit_cost_config') and { + 'edit_cost_config': options['edit_cost_config'] + } or {} + ) + + env_setting_time += time.time() - start_time + + start_time = time.time() + + ged_env.add_nx_graph(g1, '') + ged_env.add_nx_graph(g2, '') + + graphs_adding_time += time.time() - start_time + + list_id = ged_env.get_all_graph_ids() + + start_time = time.time() + + ged_env.init( + init_option=( + options[ + 'init_option'] if 'init_option' in options else 'EAGER_WITHOUT_SHUFFLED_COPIES' + ) + ) + ged_env.set_method(options['method'], ged_options_to_string(options)) + ged_env.init_method() + + env_setting_time += time.time() - start_time + + g = list_id[0] + h = list_id[1] + dis_min = np.inf + # print('------------------------------------------') + + start_time = time.time() + + for i in range(0, repeats): + ged_env.run_method(g, h) + upper = ged_env.get_upper_bound(g, h) + dis = upper + # print(dis) + if dis < dis_min: + dis_min = dis + pi_forward = ged_env.get_forward_map(g, h) + pi_backward = ged_env.get_backward_map(g, h) + # lower = ged_env.get_lower_bound(g, h) + + ged_computing_time += time.time() - start_time + + # make the map label correct (label remove map as np.inf) + nodes1 = [n for n in g1.nodes()] + nodes2 = [n for n in g2.nodes()] + nb1 = nx.number_of_nodes(g1) + nb2 = nx.number_of_nodes(g2) + pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] + pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] + # print(pi_forward) + + total_time = time.time() - total_time + + stats = { + 'pairwise_total_time': total_time, + 'env_setting_time': env_setting_time, + 'graphs_adding_time': graphs_adding_time, + 'ged_computing_time': ged_computing_time + } + + return dis, pi_forward, pi_backward, stats diff --git a/gklearn/experiments/ged/ged_model/profile_ged_model.py b/gklearn/experiments/ged/ged_model/profile_ged_model.py new file mode 100644 index 0000000000..4b5cbad7d1 --- /dev/null +++ b/gklearn/experiments/ged/ged_model/profile_ged_model.py @@ -0,0 +1,611 @@ +""" +@File: profile_ged_model_cross_matrix.py + +@Author: jajupmochi +@Date: June 3 2025 +""" +from typing import List + +import networkx as nx +import numpy as np + +ISSUE_TAG = "\033[91m[issue]\033[0m " # Red +INFO_TAG = "\033[94m[info]\033[0m " # Blue +SUCCESS_TAG = "\033[92m[success]\033[0m " # Green + + +def fit_model_ged( + graphs_X: List[nx.Graph], + graphs_Y: List[nx.Graph] = None, + ged_options: dict = None, + 
parallel: bool = None,
+ n_jobs: int = None,
+ chunksize: int = None,
+ copy_graphs: bool = True,
+ read_resu_from_file: int = 1,
+ output_dir: str = None,
+ params_idx: str = None,
+ reorder_graphs: bool = False,
+ verbose: int = 2,
+ use_global_env: bool = True,
+ **kwargs
+):
+ # Reorder graphs if specified:
+ if reorder_graphs:
+ graphs_X = reorder_graphs_by_index(graphs_X, idx_key='id')
+ if graphs_Y is not None:
+ graphs_Y = reorder_graphs_by_index(graphs_Y, idx_key='id')
+
+ # Compute the metric matrix:
+ print(f'{INFO_TAG}Computing metric matrix...')
+ all_graphs = graphs_X + graphs_Y if graphs_Y else graphs_X
+ nl_names = list(
+ all_graphs[0].nodes[list(all_graphs[0].nodes)[0]].keys()
+ ) if graphs_X else []
+ if not all_graphs:
+ el_names = []
+ else:
+ idx_edge = (
+ np.where(np.array([nx.number_of_edges(g) for g in all_graphs]) > 0)[0]
+ )
+ if len(idx_edge) == 0:
+ el_names = []
+ else:
+ el_names = list(
+ all_graphs[idx_edge[0]].edges[
+ list(all_graphs[idx_edge[0]].edges)[0]].keys()
+ )
+
+ if use_global_env:
+ from gklearn.experiments.ged.ged_model.ged_model_global_env import GEDModel
+ print(f'>>> {INFO_TAG}Using global GEDEnv for all pairs of graphs.')
+ else:
+ from gklearn.experiments.ged.ged_model.ged_model_parallel import GEDModel
+ print(f'>>> {INFO_TAG}Using local GEDEnv for each pair of graphs.')
+
+ if parallel is False:
+ parallel = None
+ elif parallel is True:
+ # Keep `True`: the global version then uses 'concurrent' and the local version 'multiprocessing':
+ parallel = True
+
+ model = GEDModel(
+ # env_type=ged_options['env_type'],
+ ed_method=ged_options['method'],
+ edit_cost_fun=ged_options['edit_cost_fun'],
+ init_edit_cost_constants=ged_options['edit_costs'],
+ edit_cost_config=ged_options.get('edit_cost_config', {}),
+ optim_method=ged_options['optim_method'],
+ ged_init_options=ged_options['init_options'],
+ node_labels=nl_names, edge_labels=el_names,
+ parallel=parallel,
+ n_jobs=n_jobs, # fixme: None
+ chunksize=chunksize,
+ copy_graphs=copy_graphs,
+ # make sure it is a full deep copy. and faster!
+ verbose=verbose
+ )
+
+ # Train model.
+ try:
+ if graphs_Y is None:
+ # Compute the distance matrix for the same set of graphs:
+ matrix = model.fit_transform(
+ graphs_X, y=graphs_Y,
+ save_dm_train=True, repeats=ged_options['repeats'],
+ )
+ else:
+ model.fit(graphs_X, repeats=ged_options['repeats'])
+ matrix = model.transform(
+ graphs_Y,
+ save_dm_test=True, repeats=ged_options['repeats'],
+ )
+
+ except OSError as exception:
+ if 'GLIBC_2.23' in exception.args[0]:
+ msg = \
+ 'This error is very likely caused by an outdated version of GLIBC ' \
+ 'on your system. ' \
+ 'The required version of GLIBC is 2.23. This may happen on ' \
+ 'CentOS 7 systems, where the highest available GLIBC version is 2.17. ' \
+ 'You can check your GLIBC version with the bash command `rpm -q glibc`. ' \
+ 'The `graphkit-learn` library comes with GLIBC_2.23, which you can ' \
+ 'install by enabling the `--build-gedlib` option: ' \
+ '`python3 setup.py install --build-gedlib`. This will compile the C++ ' \
+ 'module `gedlib`, which requires a C++ compiler and CMake.' 
+ raise AssertionError(msg) from exception + else: + assert False, exception + except Exception as exception: + assert False, exception + + # Save history: + # For graph kernels it is n * (n - 1) / 2: + if graphs_Y is None: + n_pairs = len(graphs_X) * (len(graphs_X) - 1) / 2 + else: + n_pairs = len(graphs_X) * len(graphs_Y) + # history = {'run_time': AverageMeter()} + # history['run_time'].update(model.run_time / n_pairs, n_pairs) + + # # Save model and history to file: + # if read_resu_from_file >= 1: + # os.makedirs(os.path.dirname(fn_model), exist_ok=True) + # pickle.dump({'model': model, 'history': history}, open(fn_model, 'wb')) + + # Print out the information: + params_msg = f' for parameters {params_idx}' if params_idx else '' + print( + f'{SUCCESS_TAG}Computed metric matrix of size {matrix.shape} in {model.run_time:.3f} ' + f'seconds ({(model.run_time / n_pairs):.9f} s per pair){params_msg}.' + ) + + stats = { + 'n_pairs': n_pairs, + 'matrix_shape': matrix.shape, + 'run_time': model.run_time, + 'run_time_per_pair': model.run_time / n_pairs, + 'env_stats': model.env_stats if use_global_env else model.pairwise_stats, + } + + return model, matrix, stats + + +def show_some_graphs(graphs): + """ + Show some graphs from the list of graphs. + """ + print(f'{INFO_TAG}Showing some graphs:') + for i, g in enumerate(graphs[:2]): + print(f'Graph {i}:') + print('Number of nodes:', g.number_of_nodes()) + print('Number of edges:', g.number_of_edges()) + print('Nodes:', g.nodes(data=True)) + print('Edges:', g.edges(data=True)) + print() + + +def convert_graphs_coords_from_attr_to_string(graphs: List[nx.Graph]): + """ + Convert the coordinates of nodes in graphs from the attribute format `AttrLabel` to the string format `GXLLabel`. + """ + for g in graphs: + for node in g.nodes(data=True): + if 'coords' in node[1]: + # Convert the coordinates to string format and store them in "x" and "y" keys: + coords = node[1]['coords'] + node[1]['x'] = str(coords[0]) + node[1]['y'] = str(coords[1]) + for idx in range(2, len(coords)): + # If there are more than 2 coordinates, store them with extra keys: + node[1][f'coord_{idx}'] = str(coords[idx]) + del node[1]['coords'] + print(f'{INFO_TAG}Converted coordinates from attribute format to string format.') + + +def fit_model_global_version( + seed: int = 42, n_graphs: int = 100, n_emb_dim: int = 2, parallel: bool = False, + ged_init_mode: str = 'eager' +) -> (np.array, float): + """ + Fit the GED model with GEDEnv as a member of the model globally available in the class. + """ + print( + f'\n{INFO_TAG}Fitting GEDModel with GEDEnv as a member of the model...' 
+ ) + + from gklearn.experiments.ged.ged_model.graph_generator import GraphGenerator + generator = GraphGenerator( + num_graphs=n_graphs, + max_num_nodes=20, + min_num_nodes=10, + max_num_edges=50, + min_num_edges=20, + node_feat_type='float', + edge_feat_type=None, + with_discrete_n_features=False, + with_discrete_e_features=False, + with_continuous_n_features=True, + with_continuous_e_features=False, + continuous_n_feature_key='coords', + continuous_n_feature_dim=n_emb_dim, + continuous_e_feature_dim=0, + seed=seed + ) + graphs = generator.generate_graphs() + # Check graph node label format: + one_n_labels = graphs[0].nodes[list(graphs[0].nodes)[0]] + assert 'coords' in one_n_labels and isinstance(one_n_labels['coords'], np.ndarray) and ( + len(one_n_labels['coords']) > 0 and one_n_labels['coords'].dtype in [ + np.float64, np.float32] + ), ( + 'The node labels should contain "coords" key with a numpy array as value.' + ) + print( + f'{INFO_TAG}Generated {len(graphs)} graphs with coordinates in np.array format.' + ) + show_some_graphs(graphs) + + # Set GED options: + ged_options = { + 'env_type': 'attr', # Use the attribute-based environment + 'method': 'BIPARTITE', + 'edit_cost_fun': 'GEOMETRIC', + 'edit_costs': [3, 3, 1, 3, 3, 1], + 'edit_cost_config': { + 'node_coord_metric': 'euclidean', + 'node_embed_metric': 'cosine_distance', + 'edge_weight_metric': 'euclidean', + 'edge_embed_metric': 'cosine_distance', + }, + 'optim_method': 'init', + 'init_options': 'EAGER_WITHOUT_SHUFFLED_COPIES' if ged_init_mode == 'eager' else 'LAZY_WITHOUT_SHUFFLED_COPIES', + 'repeats': 1, + } + + fit_settings = { + 'parallel': parallel, # Use parallel processing if specified + 'n_jobs': 10, # min(12, max(os.cpu_count() - 2, 0)), + 'chunksize': None, # None == automatic determination + 'copy_graphs': True, + 'reorder_graphs': False, + } + + # Fit model and compute GED matrix: + model, matrix, stats = fit_model_ged( + graphs, + graphs_Y=None, + ged_options=ged_options, + read_resu_from_file=0, + output_dir=None, + params_idx=None, + use_global_env=True, # Use local GEDEnv for each pair + verbose=2, + **fit_settings + ) + print("Model:", model) + print("Matrix shape:", matrix.shape) + print("Run time:", stats['run_time']) + + return matrix, stats + + +def fit_model_local_version( + seed: int = 42, n_graphs: int = 100, n_emb_dim: int = 2, parallel: bool = False, + ged_init_mode: str = 'eager' +) -> (np.array, float): + """ + Fit the GED model with GEDEnv locally created for each pair of graphs. + """ + print( + f'\n{INFO_TAG}Fitting GEDModel with GEDEnv created locally for each pair of graphs...' + ) + + from gklearn.experiments.ged.ged_model.graph_generator import GraphGenerator + generator = GraphGenerator( + num_graphs=n_graphs, + max_num_nodes=20, + min_num_nodes=10, + max_num_edges=50, + min_num_edges=20, + node_feat_type='float', + edge_feat_type=None, + with_discrete_n_features=False, + with_discrete_e_features=False, + with_continuous_n_features=True, + with_continuous_e_features=False, + continuous_n_feature_key='coords', + continuous_n_feature_dim=n_emb_dim, + continuous_e_feature_dim=0, + seed=seed + ) + graphs = generator.generate_graphs() + # Check graph node label format: + one_n_labels = graphs[0].nodes[list(graphs[0].nodes)[0]] + assert 'coords' in one_n_labels and isinstance(one_n_labels['coords'], np.ndarray) and ( + len(one_n_labels['coords']) > 0 and one_n_labels['coords'].dtype in [ + np.float64, np.float32] + ), ( + 'The node labels should contain "coords" key with a numpy array as value.' 
+ ) + print( + f'{INFO_TAG}Generated {len(graphs)} graphs with coordinates in np.array format.' + ) + show_some_graphs(graphs) + + # Set GED options: + ged_options = { + 'env_type': 'attr', # Use the attribute-based environment + 'method': 'BIPARTITE', + 'edit_cost_fun': 'GEOMETRIC', + 'edit_costs': [3, 3, 1, 3, 3, 1], + 'edit_cost_config': { + 'node_coord_metric': 'euclidean', + 'node_embed_metric': 'cosine_distance', + 'edge_weight_metric': 'euclidean', + 'edge_embed_metric': 'cosine_distance', + }, + 'optim_method': 'init', + 'init_options': 'EAGER_WITHOUT_SHUFFLED_COPIES' if ged_init_mode == 'eager' else 'LAZY_WITHOUT_SHUFFLED_COPIES', + 'repeats': 1, + } + + fit_settings = { + 'parallel': parallel, # Use parallel processing if specified + 'n_jobs': 10, # min(12, max(os.cpu_count() - 2, 0)), + 'chunksize': None, # None == automatic determination + 'copy_graphs': True, + 'reorder_graphs': False, + } + + # Fit model and compute GED matrix: + model, matrix, stats = fit_model_ged( + graphs, + graphs_Y=None, + ged_options=ged_options, + read_resu_from_file=0, + output_dir=None, + params_idx=None, + use_global_env=False, # Use local GEDEnv for each pair + verbose=2, + **fit_settings + ) + print("Model:", model) + print("Matrix shape:", matrix.shape) + print("Run time:", stats['run_time']) + + return matrix, stats + + +def print_stats_local_version(stats: dict): + # Print the run times: + print( + f'{INFO_TAG}The total run time for the GEDModel: ' + f'{stats["run_time"]:.3f} s / {stats["run_time_per_pair"]:.9f} s per pair.' + ) + + pairwise_stats = stats['env_stats'] + keys = ['pairwise_total_time', 'env_setting_time', 'graphs_adding_time', 'ged_computing_time'] + n_pairs = len(pairwise_stats) + + time_stats = {} + for key in keys: + time_stats[key] = sum([pair[key] for pair in pairwise_stats]) + print( + f'{INFO_TAG}{key.replace("_", " ")}: ' + f'{time_stats[key]:.3f} s / {time_stats[key] / n_pairs:.9f} s per pair. ' + f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).' + ) + + +def print_stats_global_version(stats: dict): + # Print the run times: + print( + f'{INFO_TAG}The total run time for the GEDModel: ' + f'{stats["run_time"]:.3f} s / {stats["run_time_per_pair"]:.9f} s per pair.' + ) + + env_stats = stats['env_stats'] + time_stats = {} + keys = ['env_setting_time', 'eager_label_cost_computing_time'] + for key in keys: + if key not in env_stats: + continue + time_stats[key] = env_stats[key] + print( + f'{INFO_TAG}{key.replace("_", " ")}: ' + f'{time_stats[key]:.3f} s. ' + f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).' + ) + + keys = ['env_setting_time_parallel']# 'eager_label_cost_computing_time'] + for key in keys: + if key not in env_stats: + continue + time_stats[key] = sum(env_stats[key]) + n_ele = len(env_stats[key]) + print( + f'{INFO_TAG}{key.replace("_", " ")}: ' + f'{time_stats[key]:.3f} s / {time_stats[key] / n_ele:.9f} s per worker. ' + f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).' + ) + + keys = ['graphs_adding_time_parallel'] + for key in keys: + if key not in env_stats: + continue + np_time = np.array(env_stats[key]) + time_per_worker = np_time.sum(axis=1) + n_workers = np_time.shape[0] + time_stats[key] = np.sum(time_per_worker) + n_ele = n_workers * np_time.shape[1] + print( + f'{INFO_TAG}{key.replace("_", " ")}: ' + f'{time_stats[key]:.3f} s / {time_stats[key] / n_ele:.9f} s per worker per graph. ' + f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time). 
'
+ f'Time per worker: {time_per_worker}.'
+ )
+
+ keys = ['graphs_adding_time', 'ged_computing_time', 'ged_computing_time_parallel']
+ elements = ['graph', 'pair', 'pair']
+ for key, ele in zip(keys, elements):
+ if key not in env_stats:
+ continue
+ time_stats[key] = sum(env_stats[key])
+ n_ele = len(env_stats[key])
+ print(
+ f'{INFO_TAG}{key.replace("_", " ")}: '
+ f'{time_stats[key]:.3f} s / {time_stats[key] / n_ele:.9f} s per {ele}. '
+ f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).'
+ )
+
+
+def compare_ged_model_with_global_and_local_env(
+ seed: int = 42, n_graphs: int = 100, n_emb_dim: int = 2, parallel: bool = False,
+ ged_init_mode: str = 'eager'
+) -> (np.array, np.array):
+ """
+ Compare the output and the performance of the following two GEDModel versions:
+ - `GEDModel` with a GEDEnv as its global variable, which is created along with the model.
+ - `GEDModel` without a GEDEnv. A GEDEnv is created for each pair of graphs inside the pairwise
+ computation.
+ Both versions use `AttrLabel` as the node and edge label format.
+ """
+ cost_matrix_g, stats_g = fit_model_global_version(
+ seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel,
+ ged_init_mode=ged_init_mode
+ )
+ cost_matrix_l, stats_l = fit_model_local_version(
+ seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel,
+ ged_init_mode=ged_init_mode
+ )
+
+ if not np.allclose(cost_matrix_g, cost_matrix_l, rtol=1e-9):
+ print(
+ f'{ISSUE_TAG}The cost matrices are not equal! '
+ f'Global version: {cost_matrix_g.shape}, '
+ f'Local version: {cost_matrix_l.shape}, '
+ f'Relative tolerance: 1e-9.'
+ )
+ else:
+ print(
+ f'{SUCCESS_TAG}The cost matrices are equal! '
+ f'Global version: {cost_matrix_g.shape}, '
+ f'Local version: {cost_matrix_l.shape}, '
+ f'Relative tolerance: 1e-9.'
+ )
+
+ # Print the first 5 rows and columns of the matrices:
+ print('\nFirst 5 rows and columns of the global version cost matrix:')
+ print(cost_matrix_g[:5, :5])
+ print('\nFirst 5 rows and columns of the local version cost matrix:')
+ print(cost_matrix_l[:5, :5])
+
+ print(f'\n{INFO_TAG}Global version stats:')
+ print_stats_global_version(stats_g)
+ print(f'\n{INFO_TAG}Local version stats:')
+ print_stats_local_version(stats_l)
+
+ return cost_matrix_g, cost_matrix_l
+
+
+if __name__ == '__main__':
+ # Test the class
+ # feat_type = 'str'
+ seed = 42
+ n_graphs = 1000
+ n_emb_dim = 200
+ parallel = True
+ ged_init_mode = 'lazy' # 'eager' or 'lazy'
+ compare_ged_model_with_global_and_local_env(
+ seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel,
+ ged_init_mode=ged_init_mode
+ )
+
+# 1. Profiling results:
+#
+# The following experiment pairs return the same cost matrix:
+# - global vs. local version, no parallelization, eager initialization.
+# - global vs. local version, no parallelization, lazy initialization.
+# - global with Multiprocessing vs. local with Multiprocessing, lazy initialization.
+#
+#
+# 2. Analysis:
+#
+# # Comparison of the two versions:
+#
+# General Settings:
+# - n_graphs: 1000
+# - node numbers: 10-20
+# - edge numbers: 20-50
+# - n_emb_dim: 200
+# - Coordinates as one label of np.array in AttrLabel,
+# which is optimized by the Eigen C++ library for vectorized operations.
+#
+# ## Without parallelization:
+#
+# ### local version (GEDEnv created for each pair of graphs):
+#
+# [info] The total run time for the GEDModel: 1040.524 s / 0.002083132 s per pair.
+# [info] pairwise total time: 1020.674 s / 0.002043391 s per pair. 
(98.09% of total run time).
+# [info] env setting time: 213.584 s / 0.000427595 s per pair. (20.53% of total run time).
+# [info] graphs adding time: 692.614 s / 0.001386615 s per pair. (66.56% of total run time).
+# [info] ged computing time: 105.811 s / 0.000211835 s per pair. (10.17% of total run time).
+#
+# The actual ged computation only takes 10.17% of the total run time, which is only 49.5% of the
+# env setting time and 15.3% of the graphs adding time.
+#
+# Notice that in this version `init_option` is set to `LAZY_WITHOUT_SHUFFLED_COPIES`, so the costs
+# between labels are actually computed when the `GEDEnv.init()` method is called. This time is
+# included in the `env setting time`. `ged computing time` only contains the time to fetch these
+# costs and compute the ged.
+#
+# There is a huge gap that can be optimized!
+#
+# ### global version (GEDEnv created once for the model):
+#
+# #### Using `LAZY_WITHOUT_SHUFFLED_COPIES` init option (computing costs when actually needed):
+#
+# [info] The total run time for the GEDModel: 199.630 s / 0.000399659 s per pair.
+# [info] env setting time: 0.034 s. (0.02% of total run time).
+# [info] graphs adding time: 0.900 s / 0.000900076 s per graph. (0.45% of total run time).
+# [info] ged computing time: 191.513 s / 0.000383410 s per pair. (95.93% of total run time).
+#
+# The total run time is significantly reduced to 199.630 s, which is only 19.2% of the local
+# version. It even beats the local version with parallelization (~ 2.3x faster). The ged computing
+# time (191.513 s) is 1.8x that of the local version (105.811 s), which may be due to the
+# lazy initialization.
+#
+# #### Using `EAGER_WITH_SHUFFLED_COPIES` init option (computing costs before the ged computation):
+#
+# [info] The total run time for the GEDModel: 258.232 s / 0.000516981 s per pair.
+# [info] env setting time: 0.038 s. (0.01% of total run time).
+# [info] eager label cost computing time: 151.453 s. (58.65% of total run time).
+# [info] graphs adding time: 1.018 s / 0.001017674 s per graph. (0.39% of total run time).
+# [info] ged computing time: 100.500 s / 0.000201202 s per pair. (38.92% of total run time).
+#
+# Section conclusion:
+# - The lazy initialization of the costs may avoid unnecessary computations between label pairs,
+# e.g., if the node is inserted or deleted. Meanwhile, the eager version can avoid recomputing
+# the costs between the same label pairs. The problem is that this is done inside the C++
+# implementation, so its progress cannot be tracked by tqdm directly.
+# I have not tested how many of these cases exist during the computation.
+# - At least for this specific case, the eager version is slower than the lazy version.
+#
+#
+# ## With parallelization (n_jobs=10):
+#
+# ### local version (GEDEnv created for each pair of graphs):
+#
+# [info] The total run time for the GEDModel: 460.347 s / 0.000921615 s per pair.
+# [info] pairwise total time: 1424.233 s / 0.002851317 s per pair. (309.38% of total run time).
+# [info] env setting time: 285.454 s / 0.000571480 s per pair. (62.01% of total run time).
+# [info] graphs adding time: 962.572 s / 0.001927071 s per pair. (209.10% of total run time).
+# [info] ged computing time: 161.337 s / 0.000322997 s per pair. (35.05% of total run time).
+#
+# Similar to the non-parallelized version, the actual ged computation only takes 35.05% of the total
+# run time, which is only 56.5% of the env setting time and 16.7% of the graphs adding time.
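+#
+# Note on reading the parallel numbers: the component times above are summed over all
+# worker processes (i.e., CPU time), while the "total run time" is the wall-clock time
+# of the whole computation. With n_jobs=10, the summed components can therefore exceed
+# 100% of the total run time, as for the pairwise total time (309.38%) above.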
+#
+# ### global version (GEDEnv created once for the model):
+#
+# #### ✅ Using `LAZY_WITHOUT_SHUFFLED_COPIES` init option (computing costs when actually needed):
+#
+# [info] The total run time for the GEDModel: 18.410 s / 0.000036857 s per pair.
+# [info] env setting time parallel: 1.428 s / 0.142769814 s per worker. (7.76% of total run time).
+# [info] graphs adding time parallel: 6.165 s / 0.000616510 s per worker per graph. (33.49% of total run time).
+# Time per worker: [0.44879055 0.52007318 0.71203756 0.58835745 0.50336409 0.69282722
+# 0.69138193 0.7250886 0.59162402 0.69155192].
+# [info] ged computing time parallel: 139.406 s / 0.000279092 s per pair. (757.23% of total run time).
+#
+# #### Using `EAGER_WITH_SHUFFLED_COPIES` init option (computing costs before the ged computation):
+#
+# Slow (around 240 seconds for the distance computation part), so not included here.
+#
+# ### Conclusion:
+# - ✅ With parallelization, the global version with the `LAZY_WITHOUT_SHUFFLED_COPIES` init option is
+# 460.347 / 18.410 = 25.0x faster than the local version. Meanwhile, it is 199.630 / 18.410 = 10.8x
+# faster than the global version without parallelization!!! 🎉🎉🎉
+# - ❌ The global version with the `EAGER_WITH_SHUFFLED_COPIES` init option is much slower for unknown
+# reasons. It should be further investigated.
+# - In the local version, env setting and graphs adding are performed N * (N - 1) / 2 times,
+# i.e., (N - 1) / 2 times as many as in the global version.
diff --git a/gklearn/experiments/ged/ged_model/profile_ged_model_cross_matrix.py b/gklearn/experiments/ged/ged_model/profile_ged_model_cross_matrix.py new file mode 100644 index 0000000000..353013d001 --- /dev/null +++ b/gklearn/experiments/ged/ged_model/profile_ged_model_cross_matrix.py @@ -0,0 +1,493 @@ +"""
+@File: profile_ged_model_cross_matrix.py
+
+@Author: jajupmochi
+@Date: June 5 2025
+"""
+from typing import List
+
+import networkx as nx
+import numpy as np
+
+ISSUE_TAG = "\033[91m[issue]\033[0m " # Red
+INFO_TAG = "\033[94m[info]\033[0m " # Blue
+SUCCESS_TAG = "\033[92m[success]\033[0m " # Green
+
+
+def fit_model_ged(
+ graphs_X: List[nx.Graph],
+ graphs_Y: List[nx.Graph] = None,
+ ged_options: dict = None,
+ parallel: bool = None,
+ n_jobs: int = None,
+ chunksize: int = None,
+ copy_graphs: bool = True,
+ read_resu_from_file: int = 1,
+ output_dir: str = None,
+ params_idx: str = None,
+ reorder_graphs: bool = False,
+ verbose: int = 2,
+ use_global_env: bool = True,
+ **kwargs
+):
+ # Reorder graphs if specified:
+ if reorder_graphs:
+ graphs_X = reorder_graphs_by_index(graphs_X, idx_key='id')
+ if graphs_Y is not None:
+ graphs_Y = reorder_graphs_by_index(graphs_Y, idx_key='id')
+
+ # Compute the metric matrix:
+ print(f'{INFO_TAG}Computing metric matrix...')
+ all_graphs = graphs_X + graphs_Y if graphs_Y else graphs_X
+ nl_names = list(
+ all_graphs[0].nodes[list(all_graphs[0].nodes)[0]].keys()
+ ) if graphs_X else []
+ if not all_graphs:
+ el_names = []
+ else:
+ idx_edge = (
+ np.where(np.array([nx.number_of_edges(g) for g in all_graphs]) > 0)[0]
+ )
+ if len(idx_edge) == 0:
+ el_names = []
+ else:
+ el_names = list(
+ all_graphs[idx_edge[0]].edges[
+ list(all_graphs[idx_edge[0]].edges)[0]].keys()
+ )
+
+ if use_global_env:
+ from gklearn.experiments.ged.ged_model.ged_model_global_env import GEDModel
+ print(f'>>> {INFO_TAG}Using global GEDEnv for all pairs of graphs.')
+ else:
+ from gklearn.experiments.ged.ged_model.ged_model_parallel import GEDModel
+ print(f'>>> {INFO_TAG}Using local GEDEnv for each pair of 
graphs.')
+
+ if parallel is False:
+ parallel = None
+ elif parallel is True:
+ # Keep `True`: the global version then uses 'concurrent' and the local version 'multiprocessing':
+ parallel = True
+
+ model = GEDModel(
+ # env_type=ged_options['env_type'],
+ ed_method=ged_options['method'],
+ edit_cost_fun=ged_options['edit_cost_fun'],
+ init_edit_cost_constants=ged_options['edit_costs'],
+ edit_cost_config=ged_options.get('edit_cost_config', {}),
+ optim_method=ged_options['optim_method'],
+ ged_init_options=ged_options['init_options'],
+ node_labels=nl_names, edge_labels=el_names,
+ parallel=parallel,
+ n_jobs=n_jobs,
+ chunksize=chunksize,
+ copy_graphs=copy_graphs,
+ # make sure it is a full deep copy. and faster!
+ verbose=verbose
+ )
+
+ # Train model.
+ try:
+ if graphs_Y is None:
+ # Compute the distance matrix for the same set of graphs:
+ matrix = model.fit_transform(
+ graphs_X, y=graphs_Y,
+ save_dm_train=True, repeats=ged_options['repeats'],
+ )
+ else:
+ model.fit(graphs_X, repeats=ged_options['repeats'])
+ matrix = model.transform(
+ graphs_Y,
+ save_dm_test=True, repeats=ged_options['repeats'],
+ )
+
+ except OSError as exception:
+ if 'GLIBC_2.23' in exception.args[0]:
+ msg = \
+ 'This error is very likely caused by an outdated version of GLIBC ' \
+ 'on your system. ' \
+ 'The required version of GLIBC is 2.23. This may happen on ' \
+ 'CentOS 7 systems, where the highest available GLIBC version is 2.17. ' \
+ 'You can check your GLIBC version with the bash command `rpm -q glibc`. ' \
+ 'The `graphkit-learn` library comes with GLIBC_2.23, which you can ' \
+ 'install by enabling the `--build-gedlib` option: ' \
+ '`python3 setup.py install --build-gedlib`. This will compile the C++ ' \
+ 'module `gedlib`, which requires a C++ compiler and CMake.'
+ raise AssertionError(msg) from exception
+ else:
+ assert False, exception
+ except Exception as exception:
+ assert False, exception
+
+ # Save history:
+ # For graph kernels it is n * (n - 1) / 2:
+ if graphs_Y is None:
+ n_pairs = len(graphs_X) * (len(graphs_X) - 1) / 2
+ else:
+ n_pairs = len(graphs_X) * len(graphs_Y)
+ # history = {'run_time': AverageMeter()}
+ # history['run_time'].update(model.run_time / n_pairs, n_pairs)
+
+ # # Save model and history to file:
+ # if read_resu_from_file >= 1:
+ # os.makedirs(os.path.dirname(fn_model), exist_ok=True)
+ # pickle.dump({'model': model, 'history': history}, open(fn_model, 'wb'))
+
+ # Print out the information:
+ params_msg = f' for parameters {params_idx}' if params_idx else ''
+ print(
+ f'{SUCCESS_TAG}Computed metric matrix of size {matrix.shape} in {model.run_time:.3f} '
+ f'seconds ({(model.run_time / n_pairs):.9f} s per pair){params_msg}.'
+ )
+
+ stats = {
+ 'n_pairs': n_pairs,
+ 'matrix_shape': matrix.shape,
+ 'run_time': model.run_time,
+ 'run_time_per_pair': model.run_time / n_pairs,
+ 'env_stats': model.env_stats if use_global_env else model.pairwise_stats,
+ }
+
+ return model, matrix, stats
+
+
+def show_some_graphs(graphs):
+ """
+ Show some graphs from the list of graphs.
+ """
+ print(f'{INFO_TAG}Showing some graphs:')
+ for i, g in enumerate(graphs[:2]):
+ print(f'Graph {i}:')
+ print('Number of nodes:', g.number_of_nodes())
+ print('Number of edges:', g.number_of_edges())
+ print('Nodes:', g.nodes(data=True))
+ print('Edges:', g.edges(data=True))
+ print()
+
+
+def convert_graphs_coords_from_attr_to_string(graphs: List[nx.Graph]):
+ """
+ Convert the coordinates of nodes in graphs from the attribute format `AttrLabel` to the string format `GXLLabel`. 
+ """ + for g in graphs: + for node in g.nodes(data=True): + if 'coords' in node[1]: + # Convert the coordinates to string format and store them in "x" and "y" keys: + coords = node[1]['coords'] + node[1]['x'] = str(coords[0]) + node[1]['y'] = str(coords[1]) + for idx in range(2, len(coords)): + # If there are more than 2 coordinates, store them with extra keys: + node[1][f'coord_{idx}'] = str(coords[idx]) + del node[1]['coords'] + print(f'{INFO_TAG}Converted coordinates from attribute format to string format.') + + +def generate_graphs(n_graphs: int, n_emb_dim: int = 2, seed: int = 42) -> List[nx.Graph]: + """ + Generate a list of random graphs with node labels containing coordinates in string format. + """ + from gklearn.experiments.ged.ged_model.graph_generator import GraphGenerator + generator = GraphGenerator( + num_graphs=n_graphs, + max_num_nodes=20, + min_num_nodes=10, + max_num_edges=50, + min_num_edges=20, + node_feat_type='float', + edge_feat_type=None, + with_discrete_n_features=False, + with_discrete_e_features=False, + with_continuous_n_features=True, + with_continuous_e_features=False, + continuous_n_feature_key='coords', + continuous_n_feature_dim=n_emb_dim, + continuous_e_feature_dim=0, + seed=seed + ) + graphs = generator.generate_graphs() + # Check the graph node label format: + one_n_labels = graphs[0].nodes[list(graphs[0].nodes)[0]] + assert 'coords' in one_n_labels and isinstance(one_n_labels['coords'], np.ndarray) and ( + len(one_n_labels['coords']) > 0 and one_n_labels['coords'].dtype in [ + np.float64, np.float32] + ), ( + 'The node labels should contain "coords" key with a numpy array as value.' + ) + print( + f'{INFO_TAG}Generated {len(graphs)} graphs with coordinates in np.array format.' + ) + show_some_graphs(graphs) + return graphs + + +def fit_model_either_version( + use_global_env: bool, + seed: int = 42, n_graphs: tuple[int, int] = (100, 50), n_emb_dim: int = 2, + parallel: bool = False, ged_init_mode: str = 'eager' +) -> (np.array, float): + """ + Fit the GED model with GEDEnv as a member of the model globally available in the class. + """ + print( + f'\n{INFO_TAG}Fitting GEDModel with GEDEnv as a member of the model...' 
+ ) + graphs1 = generate_graphs(n_graphs[0], n_emb_dim=n_emb_dim, seed=seed) + graphs2 = generate_graphs(n_graphs[1], n_emb_dim=n_emb_dim, seed=seed) + + # Set GED options: + ged_options = { + 'env_type': 'attr', # Use the attribute-based environment + 'method': 'BIPARTITE', + 'edit_cost_fun': 'GEOMETRIC', + 'edit_costs': [3, 3, 1, 3, 3, 1], + 'edit_cost_config': { + 'node_coord_metric': 'euclidean', + 'node_embed_metric': 'cosine_distance', + 'edge_weight_metric': 'euclidean', + 'edge_embed_metric': 'cosine_distance', + }, + 'optim_method': 'init', + 'init_options': 'EAGER_WITHOUT_SHUFFLED_COPIES' if ged_init_mode == 'eager' else 'LAZY_WITHOUT_SHUFFLED_COPIES', + 'repeats': 1, + } + + fit_settings = { + 'parallel': parallel, # Use parallel processing if specified + 'n_jobs': 10, # min(12, max(os.cpu_count() - 2, 0)), + 'chunksize': None, # None == automatic determination + 'copy_graphs': True, + 'reorder_graphs': False, + } + + # Fit model and compute GED matrix: + model, matrix, stats = fit_model_ged( + graphs1, + graphs_Y=graphs2, + ged_options=ged_options, + read_resu_from_file=0, + output_dir=None, + params_idx=None, + use_global_env=use_global_env, # Use which GEDEnv for each pair + verbose=2, + **fit_settings + ) + print("Model:", model) + print("Matrix shape:", matrix.shape) + print("Run time:", stats['run_time']) + + return matrix, stats + + +def print_stats_local_version(stats: dict): + # Print the run times: + print( + f'{INFO_TAG}The total run time for the GEDModel: ' + f'{stats["run_time"]:.3f} s / {stats["run_time_per_pair"]:.9f} s per pair.' + ) + + pairwise_stats = stats['env_stats'] + if not pairwise_stats: + print(f'{INFO_TAG}No pairwise stats available.') + return + + keys = ['pairwise_total_time', 'env_setting_time', 'graphs_adding_time', 'ged_computing_time'] + n_pairs = len(pairwise_stats) + + time_stats = {} + for key in keys: + if key not in pairwise_stats[0]: + continue + time_stats[key] = sum([pair[key] for pair in pairwise_stats]) + print( + f'{INFO_TAG}{key.replace("_", " ")}: ' + f'{time_stats[key]:.3f} s / {time_stats[key] / n_pairs:.9f} s per pair. ' + f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).' + ) + + +def print_stats_global_version(stats: dict): + # Print the run times: + print( + f'{INFO_TAG}The total run time for the GEDModel: ' + f'{stats["run_time"]:.3f} s / {stats["run_time_per_pair"]:.9f} s per pair.' + ) + + env_stats = stats['env_stats'] + time_stats = {} + keys = ['env_setting_time', 'eager_label_cost_computing_time'] + for key in keys: + if key not in env_stats: + continue + time_stats[key] = env_stats[key] + print( + f'{INFO_TAG}{key.replace("_", " ")}: ' + f'{time_stats[key]:.3f} s. ' + f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).' + ) + + keys = ['env_setting_time_parallel'] # 'eager_label_cost_computing_time'] + for key in keys: + if key not in env_stats: + continue + time_stats[key] = sum(env_stats[key]) + n_ele = len(env_stats[key]) + print( + f'{INFO_TAG}{key.replace("_", " ")}: ' + f'{time_stats[key]:.3f} s / {time_stats[key] / n_ele:.9f} s per worker. ' + f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).' 
+ )
+
+ keys = ['graphs_adding_time_parallel']
+ for key in keys:
+ if key not in env_stats:
+ continue
+ np_time = np.array(env_stats[key])
+ time_per_worker = np_time.sum(axis=1)
+ n_workers = np_time.shape[0]
+ time_stats[key] = np.sum(time_per_worker)
+ n_ele = n_workers * np_time.shape[1]
+ print(
+ f'{INFO_TAG}{key.replace("_", " ")}: '
+ f'{time_stats[key]:.3f} s / {time_stats[key] / n_ele:.9f} s per worker per graph. '
+ f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time). '
+ f'Time per worker: {time_per_worker}.'
+ )
+
+ keys = ['graphs_adding_time', 'ged_computing_time', 'ged_computing_time_parallel']
+ elements = ['graph', 'pair', 'pair']
+ for key, ele in zip(keys, elements):
+ if key not in env_stats:
+ continue
+ time_stats[key] = sum(env_stats[key])
+ n_ele = len(env_stats[key])
+ print(
+ f'{INFO_TAG}{key.replace("_", " ")}: '
+ f'{time_stats[key]:.3f} s / {time_stats[key] / n_ele:.9f} s per {ele}. '
+ f'({time_stats[key] / stats["run_time"] * 100:.2f}% of total run time).'
+ )
+
+
+def compare_ged_model_with_global_and_local_env_cross_matrix(
+ seed: int = 42, n_graphs: tuple[int, int] = (100, 50), n_emb_dim: int = 2,
+ parallel: bool = False, ged_init_mode: str = 'eager'
+) -> (np.array, np.array):
+ """
+ Compare the output and the performance of the following two GEDModel versions:
+ - `GEDModel` with a GEDEnv as its global variable, which is created along with the model.
+ - `GEDModel` without a GEDEnv. A GEDEnv is created for each pair of graphs inside the pairwise
+ computation.
+ Both versions use `AttrLabel` as the node and edge label format.
+ """
+ cost_matrix_g, stats_g = fit_model_either_version(
+ use_global_env=True, # Use global GEDEnv for all pairs of graphs
+ seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel,
+ ged_init_mode=ged_init_mode
+ )
+ cost_matrix_l, stats_l = fit_model_either_version(
+ use_global_env=False, # Use local GEDEnv for each pair of graphs
+ seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel,
+ ged_init_mode=ged_init_mode
+ )
+
+ if not np.allclose(cost_matrix_g, cost_matrix_l, rtol=1e-9):
+ print(
+ f'{ISSUE_TAG}The cost matrices are not equal! '
+ f'Global version: {cost_matrix_g.shape}, '
+ f'Local version: {cost_matrix_l.shape}, '
+ f'Relative tolerance: 1e-9.'
+ )
+ else:
+ print(
+ f'{SUCCESS_TAG}The cost matrices are equal! '
+ f'Global version: {cost_matrix_g.shape}, '
+ f'Local version: {cost_matrix_l.shape}, '
+ f'Relative tolerance: 1e-9.'
+ )
+
+ # Print the first 5 rows and columns of the matrices:
+ print('\nFirst 5 rows and columns of the global version cost matrix:')
+ print(cost_matrix_g[:5, :5])
+ print('\nFirst 5 rows and columns of the local version cost matrix:')
+ print(cost_matrix_l[:5, :5])
+
+ print(f'\n{INFO_TAG}Global version stats:')
+ print_stats_global_version(stats_g)
+ print(f'\n{INFO_TAG}Local version stats:')
+ print_stats_local_version(stats_l)
+
+ return cost_matrix_g, cost_matrix_l
+
+
+if __name__ == '__main__':
+ # Test the class
+ # feat_type = 'str'
+ seed = 42
+ n_graphs = (500, 300)
+ n_emb_dim = 200
+ parallel = True
+ ged_init_mode = 'lazy' # 'eager' or 'lazy'
+ compare_ged_model_with_global_and_local_env_cross_matrix(
+ seed=seed, n_graphs=n_graphs, n_emb_dim=n_emb_dim, parallel=parallel,
+ ged_init_mode=ged_init_mode
+ )
+
+# 1. Profiling results:
+#
+# The following experiment pairs return the same cost matrix:
+# - global vs. local version, no parallelization, lazy initialization. 
+# - global with Multiprocessing vs. local with Multiprocessing, lazy initialization.
+#
+#
+# 2. Analysis:
+#
+# # Comparison of the two versions:
+#
+# General Settings:
+# - n_graphs: 500, 300
+# - node numbers: 10-20
+# - edge numbers: 20-50
+# - n_emb_dim: 200
+# - Coordinates as one label of np.array in AttrLabel,
+# which is optimized by the Eigen C++ library for vectorized operations.
+#
+# ## Without parallelization:
+#
+# ### local version (GEDEnv created for each pair of graphs):
+#
+# [info] The total run time for the GEDModel: 165.527 s / 0.001103511 s per pair.
+#
+# ### global version (GEDEnv created once for the model):
+#
+# #### Using `LAZY_WITHOUT_SHUFFLED_COPIES` init option (computing costs when actually needed):
+#
+# [info] The total run time for the GEDModel: 29.026 s / 0.000193509 s per pair.
+# [info] env setting time: 0.025 s. (0.09% of total run time).
+# [info] graphs adding time: 0.352 s / 0.000440472 s per graph. (1.21% of total run time).
+# [info] ged computing time: 27.513 s / 0.000183417 s per pair. (94.78% of total run time).
+#
+#
+# ## With parallelization (n_jobs=10):
+#
+# ### local version (GEDEnv created for each pair of graphs):
+#
+# [info] The total run time for the GEDModel: 77.923 s / 0.000519487 s per pair.
+#
+# ### global version (GEDEnv created once for the model):
+#
+# #### ✅ Using `LAZY_WITHOUT_SHUFFLED_COPIES` init option (computing costs when actually needed):
+#
+# [info] The total run time for the GEDModel: 5.806 s / 0.000038708 s per pair.
+# [info] env setting time parallel: 1.374 s / 0.137399626 s per worker. (23.66% of total run time).
+# [info] graphs adding time parallel: 5.109 s / 0.000638592 s per worker per graph. (87.99% of total run time). Time per worker: [0.36477065 0.39869857 0.43185973 0.57985783 0.57254052 0.50350499
+# 0.61493111 0.48274899 0.60086203 0.55896163].
+# [info] ged computing time parallel: 38.818 s / 0.000258788 s per pair. (668.57% of total run time).
+#
+#
+# ### Conclusion:
+# - With parallelization, the global version with the `LAZY_WITHOUT_SHUFFLED_COPIES` init option is
+# 77.923 / 5.806 = 13.42 times faster than the local version.
+# - Without parallelization, the global version with the `LAZY_WITHOUT_SHUFFLED_COPIES` init option is
+# 165.527 / 29.026 = 5.70 times faster than the local version.
+# - The global version with parallelization is 29.026 / 5.806 = 5.00 times faster than the global
+# version without parallelization. 
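Taken together, the profiling in the two scripts above points to the global-environment `GEDModel` as the preferred default. The sketch below shows how that model is constructed and fitted. It is a minimal sketch, not a definitive API: the constructor arguments mirror those passed by `fit_model_ged()` in the profiling scripts, the `random_graph` helper is hypothetical (added only to make the snippet self-contained), and the experimental API may still change.

```python
# Minimal usage sketch of the global-env GEDModel profiled above; constructor
# arguments follow the profiling scripts and may evolve with the experimental API.
import networkx as nx
import numpy as np

from gklearn.experiments.ged.ged_model.ged_model_global_env import GEDModel


def random_graph(rng, n_nodes=12, n_edges=20, dim=8):
    # Hypothetical helper: a small graph whose nodes carry a numeric 'coords'
    # array, the label format the GEOMETRIC edit costs operate on.
    g = nx.Graph()
    for v in range(n_nodes):
        g.add_node(v, coords=rng.random(dim))
    while g.number_of_edges() < n_edges:
        u, v = rng.integers(n_nodes, size=2)
        if u != v:
            g.add_edge(int(u), int(v))
    return g


rng = np.random.default_rng(42)
graphs = [random_graph(rng) for _ in range(10)]

model = GEDModel(
    ed_method='BIPARTITE',
    edit_cost_fun='GEOMETRIC',
    init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
    edit_cost_config={'node_coord_metric': 'euclidean'},
    optim_method='init',
    ged_init_options='LAZY_WITHOUT_SHUFFLED_COPIES',
    node_labels=['coords'], edge_labels=[],
    parallel=None,  # or True with n_jobs=... to use the parallel path
    verbose=2,
)
# Symmetric (n, n) distance matrix with zeros on the diagonal:
dis_matrix = model.fit_transform(graphs, save_dm_train=True, repeats=1)
print(dis_matrix.shape)  # (10, 10)
```

The design point the numbers above support: the global version pays the environment setup and graph-adding costs once per model rather than once per pair, which is where the reported 5x-25x speedups come from.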
diff --git a/gklearn/ged/median/test_median_graph_estimator.py b/gklearn/ged/median/test_median_graph_estimator.py index a0ebbbb684..ac2756a0b9 100644 --- a/gklearn/ged/median/test_median_graph_estimator.py +++ b/gklearn/ged/median/test_median_graph_estimator.py @@ -9,7 +9,7 @@ def test_median_graph_estimator(): from gklearn.utils import load_dataset from gklearn.ged.median import MedianGraphEstimator, constant_node_costs - from gklearn.gedlib import librariesImport, gedlibpy + from gklearn.gedlib import libraries_import, gedlibpy from gklearn.preimage.utils import get_same_item_indices import multiprocessing @@ -83,7 +83,7 @@ def test_median_graph_estimator(): def test_median_graph_estimator_symb(): from gklearn.utils import load_dataset from gklearn.ged.median import MedianGraphEstimator, constant_node_costs - from gklearn.gedlib import librariesImport, gedlibpy + from gklearn.gedlib import libraries_import, gedlibpy from gklearn.preimage.utils import get_same_item_indices import multiprocessing diff --git a/gklearn/ged/model/__init__.py b/gklearn/ged/model/__init__.py index e69de29bb2..2ce352aeb7 100644 --- a/gklearn/ged/model/__init__.py +++ b/gklearn/ged/model/__init__.py @@ -0,0 +1 @@ +from gklearn.ged.model.ged_model import GEDModel diff --git a/gklearn/ged/model/ged_model.py b/gklearn/ged/model/ged_model.py index ea6793f04c..425cb2522d 100644 --- a/gklearn/ged/model/ged_model.py +++ b/gklearn/ged/model/ged_model.py @@ -1,900 +1,53 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- """ -Created on Thu May 5 09:42:30 2022 +ged_model -@author: ljia -""" -import sys -import multiprocessing -import time -import numpy as np -import networkx as nx -from itertools import combinations -import multiprocessing -from multiprocessing import Pool - -# from abc import ABC, abstractmethod -from sklearn.base import BaseEstimator # , TransformerMixin -from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, -from sklearn.exceptions import NotFittedError - -from gklearn.ged.model.distances import euclid_d -from gklearn.ged.util import pairwise_ged, get_nb_edit_operations -# from gklearn.utils import normalize_gram_matrix -from gklearn.utils import get_iters +A wrapper for the GED model. +@Author: jajupmochi +@Date: Jun 06 2025 +""" -# @TODO: it should be faster if creating a global env variable. -class GEDModel(BaseEstimator): # , ABC): - """The graph edit distance model class compatible with `scikit-learn`. - - Attributes - ---------- - _graphs : list - Stores the input graphs on fit input data. - Default format of the list objects is `NetworkX` graphs. - **We don't guarantee that the input graphs remain unchanged during the - computation.** - Notes - ----- - This class uses the `gedlibpy` module to compute the graph edit distance. +# todo: write test code. - References - ---------- - https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. +def GEDModel(*args, use_global_env: bool = True, **kwargs): """ + ged_model + A wrapper for the GED model. - def __init__( - self, - ed_method='BIPARTITE', - edit_cost_fun='CONSTANT', - init_edit_cost_constants=[3, 3, 1, 3, 3, 1], - optim_method='init', - optim_options={'y_distance': euclid_d, 'mode': 'reg'}, - node_labels=[], - edge_labels=[], - parallel=None, - n_jobs=None, - chunksize=None, - # normalize=True, - copy_graphs=True, # make sure it is a full deep copy. and faster! 
- verbose=2 - ): - """`__init__` for `GEDModel` object.""" - # @todo: the default settings of the parameters are different from those in the self.compute method. - # self._graphs = None - self.ed_method = ed_method - self.edit_cost_fun = edit_cost_fun - self.init_edit_cost_constants = init_edit_cost_constants - self.optim_method = optim_method - self.optim_options = optim_options - self.node_labels = node_labels - self.edge_labels = edge_labels - self.parallel = parallel - self.n_jobs = ( - (multiprocessing.cpu_count() - 1) if n_jobs is None else n_jobs) - self.chunksize = chunksize - # self.normalize = normalize - self.copy_graphs = copy_graphs - self.verbose = verbose - - - # self._run_time = 0 - # self._gram_matrix = None - # self._gram_matrix_unnorm = None - - ########################################################################## - # The following is the 1st paradigm to compute GED distance matrix, which is - # compatible with `scikit-learn`. - ########################################################################## - - def fit(self, X, y=None, **kwargs): - """Fit a graph dataset for a transformer. - - Parameters - ---------- - X : iterable - DESCRIPTION. - - y : None, optional - There is no need of a target in a transformer, yet the `scikit-learn` - pipeline API requires this parameter. - - Returns - ------- - object - Returns self. - - """ - # self._is_tranformed = False - - # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; - self.clear_attributes() - - # Validate parameters for the transformer. - self.validate_parameters() - - # Validate the input. - self._graphs = self.validate_input(X) - if y is not None: - self._targets = y - # self._targets = self.validate_input(y) - - # Compute edit cost constants. - self.compute_edit_costs(**kwargs) - - # self._X = X - # self._kernel = self._get_kernel_instance() - - # Return the transformer. - return self - - - def transform( - self, X=None, - return_dm_train=False, - save_dm_test=False, - return_dm_test=False, - **kwargs - ): - """Compute the graph kernel matrix between given and fitted data. - - Parameters - ---------- - X : TYPE - DESCRIPTION. - - Raises - ------ - ValueError - DESCRIPTION. - - Returns - ------- - None. - - """ - # If `return_dm_train`, return the fitted GED distance matrix of training data. - if return_dm_train: - check_is_fitted(self, '_dm_train') - self._is_transformed = True - return self._dm_train # @TODO: copy or not? - - if return_dm_test: - check_is_fitted(self, '_dm_test') - return self._dm_test # @TODO: copy or not? - - # Check if method "fit" had been called. - check_is_fitted(self, '_graphs') - - # Validate the input. - Y = self.validate_input(X) - - # Transform: compute the graph kernel matrix. - dis_matrix = self.compute_distance_matrix(Y, **kwargs) - self._Y = Y - - # Self transform must appear before the diagonal call on normalization. - self._is_transformed = True # @TODO: When to set this to True? When return dm test? - # if self.normalize: - # X_diag, Y_diag = self.diagonals() - # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. - # try: - # kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) - # except: - # raise - # finally: - # np.seterr(**old_settings) - - if save_dm_test: - self._dm_test = dis_matrix - # If the model is retransformed and the `save_dm_test` flag is not set, - # then remove the previously computed dm_test to prevent conflicts. 
- else: - if hasattr(self, '_dm_test'): - delattr(self, '_dm_test') - - return dis_matrix - - - def fit_transform( - self, - X, - y=None, - save_dm_train=False, - save_mm_train: bool = False, - **kwargs): - """Fit and transform: compute GED distance matrix on the same data. - - Parameters - ---------- - X : list of graphs - Input graphs. - - Returns - ------- - dis_matrix : numpy array, shape = [len(X), len(X)] - The distance matrix of X. - - """ - self.fit(X, y, **kwargs) - - # Transform: compute Gram matrix. - dis_matrix = self.compute_distance_matrix(**kwargs) - - # # Normalize. - # if self.normalize: - # self._X_diag = np.diagonal(gram_matrix).copy() - # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. - # try: - # gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) - # except: - # raise - # finally: - # np.seterr(**old_settings) - - if save_mm_train or save_dm_train: - self._dm_train = dis_matrix - # If the model is refitted and the `save_dm_train` flag is not set, then - # remove the previously computed dm_train to prevent conflicts. - else: - if hasattr(self, '_dm_train'): - delattr(self, '_dm_train') - - return dis_matrix - - - def get_params(self): - pass - - - def set_params(self): - pass - - - def clear_attributes(self): # @todo: update - # if hasattr(self, '_X_diag'): - # delattr(self, '_X_diag') - if hasattr(self, '_graphs'): - delattr(self, '_graphs') - if hasattr(self, '_Y'): - delattr(self, '_Y') - if hasattr(self, '_run_time'): - delattr(self, '_run_time') - if hasattr(self, '_test_run_time'): - delattr(self, '_test_run_time') - - - def validate_parameters(self): - """Validate all parameters for the transformer. - - Returns - ------- - None. - - """ - if self.parallel == False: - self.parallel = None - elif self.parallel == True: - self.parallel = 'imap_unordered' - if self.parallel is not None and self.parallel != 'imap_unordered': - raise ValueError('Parallel mode is not set correctly.') - - if self.parallel == 'imap_unordered' and self.n_jobs is None: - self.n_jobs = multiprocessing.cpu_count() - - - def validate_input(self, X): - """Validate the given input and raise errors if it is invalid. - - Parameters - ---------- - X : list - The input to check. Should be a list of graph. - - Raises - ------ - ValueError - Raise if the input is not correct. - - Returns - ------- - X : list - The input. A list of graph. - - """ - if X is None: - raise ValueError('Please add graphs before computing.') - elif not isinstance(X, list): - raise ValueError('Cannot detect graphs. The input must be a list.') - elif len(X) == 0: - raise ValueError( - 'The graph list given is empty. No computation will be performed.' - ) - - return X - - - def compute_distance_matrix(self, Y=None, **kwargs): - """Compute the distance matrix between a given target graphs (Y) and - the fitted graphs (X / self._graphs) or the distance matrix for the fitted - graphs (X / self._graphs). - - Parameters - ---------- - Y : list of graphs, optional - The target graphs. The default is None. If None distance is computed - between X and itself. - - Returns - ------- - dis_matrix : numpy array, shape = [n_targets, n_inputs] - The computed distance matrix. - - """ - if Y is None: - # Compute metric matrix for self._graphs (X). - dis_matrix = self._compute_X_distance_matrix(**kwargs) - # self._gram_matrix_unnorm = np.copy(self._gram_matrix) - - else: - # Compute metric matrix between Y and self._graphs (X). 
- Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y)
- graphs_copy = (
- [g.copy() for g in self._graphs]
- if self.copy_graphs else self._graphs
- )
-
- start_time = time.time()
-
- if self.parallel == 'imap_unordered':
- dis_matrix = self._compute_distance_matrix_imap_unordered(Y)
-
- elif self.parallel is None:
- dis_matrix = self._compute_distance_matrix_series(
- Y_copy, graphs_copy, **kwargs
- )
-
- self._test_run_time = time.time() - start_time
-
- if self.verbose:
- print(
- 'Distance matrix of size (%d, %d) built in %s seconds.'
- % (len(Y), len(self._graphs), self._test_run_time)
- )
-
- return dis_matrix
-
-
- def _compute_distance_matrix_series(self, X, Y, **kwargs):
- """Compute the GED distance matrix between two sets of graphs (X and Y)
- without parallelization.
-
- Parameters
- ----------
- X, Y : list of graphs
- The input graphs.
-
- Returns
- -------
- dis_matrix : numpy array, shape = [n_X, n_Y]
- The computed distance matrix.
-
- """
- dis_matrix = np.zeros((len(X), len(Y)))
-
- for i_x, g_x in enumerate(X):
- for i_y, g_y in enumerate(Y):
- dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y, **kwargs)
-
- return dis_matrix
-
-
- def _compute_kernel_matrix_imap_unordered(self, Y):
- """Compute the kernel matrix between a given target graphs (Y) and
- the fitted graphs (X / self._graphs) using imap unordered parallelization.
-
- Parameters
- ----------
- Y : list of graphs, optional
- The target graphs.
-
- Returns
- -------
- kernel_matrix : numpy array, shape = [n_targets, n_inputs]
- The computed kernel matrix.
-
- """
- raise Exception('Parallelization for kernel matrix is not implemented.')
-
-
- def diagonals(self):
- """Compute the kernel matrix diagonals of the fit/transformed data.
+ Currently, there are two implementations of the GED model:
- Returns
- -------
- X_diag : numpy array
- The diagonal of the kernel matrix between the fitted data.
- This consists of each element calculated with itself.
+ - The GEDModel class using a GEDEnv as a global environment inside the class.
+ Compared to the previous local version, this implementation can be up to 25x faster with
+ parallelization and up to 5x faster without it, but it is not very memory efficient.
+ Check comments in `profile_ged_model.py` and `profile_ged_model_cross_matrix.py` in
+ `gklearn/experiments/ged/ged_model/` for the performance comparison.
- Y_diag : numpy array
- The diagonal of the kernel matrix, of the transform.
- This consists of each element calculated with itself.
+ - The GEDModel class creating a GEDEnv locally inside the pairwise distance computation for
+ each pair of graphs. This can be somewhat more memory efficient, but it is much slower.
- """
- # Check if method "fit" had been called.
- check_is_fitted(self, ['_graphs'])
+ We have not yet automated the choice between the two implementations,
+ so it is left to the user. By default, the global environment is used.
- # Check if the diagonals of X exist.
- try:
- check_is_fitted(self, ['_X_diag'])
- except NotFittedError:
- # Compute diagonals of X.
- self._X_diag = np.empty(shape=(len(self._graphs),))
- graphs = ([g.copy() for g in
- self._graphs] if self.copy_graphs else self._graphs)
- for i, x in enumerate(graphs):
- self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?
-
- try:
- # If transform has happened, return both diagonals.
- check_is_fitted(self, ['_Y']) - self._Y_diag = np.empty(shape=(len(self._Y),)) - Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) - for (i, y) in enumerate(Y): - self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? - - return self._X_diag, self._Y_diag - except NotFittedError: - # Else just return both X_diag - return self._X_diag - - - # @abstractmethod - def pairwise_distance(self, x, y): - """Compute pairwise kernel between two graphs. - - Parameters - ---------- - x, y : NetworkX Graph. - Graphs bewteen which the kernel is computed. - - Returns - ------- - kernel: float - The computed kernel. - -# Notes -# ----- -# This method is abstract and must be implemented by a subclass. - - """ - raise NotImplementedError( - 'Pairwise kernel computation is not implemented!' - ) - - - def compute_edit_costs(self, Y=None, Y_targets=None, **kwargs): - """Compute edit cost constants. When optimizing method is `fiited`, - apply Jia2021's metric learning method by using a given target graphs (Y) - the fitted graphs (X / self._graphs). - - Parameters - ---------- - Y : TYPE, optional - DESCRIPTION. The default is None. - - Returns - ------- - None. - - """ - # Get or compute. - if self.optim_method == 'random': - self._edit_cost_constants = np.random.rand(6) - - elif self.optim_method == 'init': - self._edit_cost_constants = self.init_edit_cost_constants - - elif self.optim_method == 'expert': - self._edit_cost_constants = [3, 3, 1, 3, 3, 1] - - elif self.optim_method == 'fitted': # Jia2021 method - # Get proper inputs. - if Y is None: - check_is_fitted(self, ['_graphs']) - check_is_fitted(self, ['_targets']) - graphs = ([g.copy() for g in - self._graphs] if self.copy_graphs else self._graphs) - targets = self._targets - else: - graphs = ([g.copy() for g in Y] if self.copy_graphs else Y) - targets = Y_targets - - # Get optimization options. - node_labels = self.node_labels - edge_labels = self.edge_labels - unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0) - repeats = kwargs.get('repeats', 1) - from gklearn.ged.model.optim_costs import compute_optimal_costs - self._edit_cost_constants = compute_optimal_costs( - graphs, targets, - node_labels=node_labels, edge_labels=edge_labels, - unlabeled=unlabeled, - init_costs=self.init_edit_cost_constants, - ed_method=self.ed_method, - edit_cost_fun=self.edit_cost_fun, - repeats=repeats, - rescue_optim_failure=False, - verbose=(self.verbose >= 2), - **self.optim_options - ) - - - ########################################################################## - # The following is the 2nd paradigm to compute kernel matrix. It is - # simplified and not compatible with `scikit-learn`. - ########################################################################## - - # def compute(self, *graphs, **kwargs): - # self.parallel = kwargs.get('parallel', 'imap_unordered') - # self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) - # self.normalize = kwargs.get('normalize', True) - # self.verbose = kwargs.get('verbose', 2) - # self.copy_graphs = kwargs.get('copy_graphs', True) - # self.save_unnormed = kwargs.get('save_unnormed', True) - # self.validate_parameters() - - # # If the inputs is a list of graphs. - # if len(graphs) == 1: - # if not isinstance(graphs[0], list): - # raise Exception('Cannot detect graphs.') - # elif len(graphs[0]) == 0: - # raise Exception('The graph list given is empty. 
No computation was performed.') - # else: - # if self.copy_graphs: - # self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. - # else: - # self._graphs = graphs - # self._gram_matrix = self._compute_gram_matrix() - - # if self.save_unnormed: - # self._gram_matrix_unnorm = np.copy(self._gram_matrix) - # if self.normalize: - # self._gram_matrix = normalize_gram_matrix(self._gram_matrix) - # return self._gram_matrix, self._run_time - - # elif len(graphs) == 2: - # # If the inputs are two graphs. - # if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): - # if self.copy_graphs: - # G0, G1 = graphs[0].copy(), graphs[1].copy() - # else: - # G0, G1 = graphs[0], graphs[1] - # kernel = self._compute_single_kernel(G0, G1) - # return kernel, self._run_time - - # # If the inputs are a graph and a list of graphs. - # elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): - # if self.copy_graphs: - # g1 = graphs[0].copy() - # g_list = [g.copy() for g in graphs[1]] - # kernel_list = self._compute_kernel_list(g1, g_list) - # else: - # kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) - # return kernel_list, self._run_time - - # elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): - # if self.copy_graphs: - # g1 = graphs[1].copy() - # g_list = [g.copy() for g in graphs[0]] - # kernel_list = self._compute_kernel_list(g1, g_list) - # else: - # kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) - # return kernel_list, self._run_time - - # else: - # raise Exception('Cannot detect graphs.') - - # elif len(graphs) == 0 and self._graphs is None: - # raise Exception('Please add graphs before computing.') - - # else: - # raise Exception('Cannot detect graphs.') - - # def normalize_gm(self, gram_matrix): - # import warnings - # warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning) - - # diag = gram_matrix.diagonal().copy() - # for i in range(len(gram_matrix)): - # for j in range(i, len(gram_matrix)): - # gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) - # gram_matrix[j][i] = gram_matrix[i][j] - # return gram_matrix - - # def compute_distance_matrix(self): - # if self._gram_matrix is None: - # raise Exception('Please compute the Gram matrix before computing distance matrix.') - # dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix))) - # for i in range(len(self._gram_matrix)): - # for j in range(i, len(self._gram_matrix)): - # dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j] - # if dis < 0: - # if dis > -1e-10: - # dis = 0 - # else: - # raise ValueError('The distance is negative.') - # dis_mat[i, j] = np.sqrt(dis) - # dis_mat[j, i] = dis_mat[i, j] - # dis_max = np.max(np.max(dis_mat)) - # dis_min = np.min(np.min(dis_mat[dis_mat != 0])) - # dis_mean = np.mean(np.mean(dis_mat)) - # return dis_mat, dis_max, dis_min, dis_mean - - def _compute_X_distance_matrix(self, **kwargs): - graphs = ([g.copy() for g in - self._graphs] if self.copy_graphs else self._graphs) - - start_time = time.time() - - if self.parallel == 'imap_unordered': - dis_matrix = self._compute_X_dm_imap_unordered(graphs, **kwargs) - elif self.parallel is None: - dis_matrix = self._compute_X_dm_series(graphs, **kwargs) - else: - raise Exception('Parallel mode is not set correctly.') - - self._run_time = time.time() - start_time - - if self.verbose: - print( - 'Distance matrix of size %d built in %s seconds.' 
- % (len(self._graphs), self._run_time) - ) - - return dis_matrix - - - def _compute_X_dm_series(self, graphs, **kwargs): - n = len(graphs) - dis_matrix = np.zeros((n, n)) - - iterator = combinations(range(n), 2) - len_itr = int(n * (n - 1) / 2) - if self.verbose: - print('Graphs in total: %d.' % len(graphs)) - print('The total # of pairs is %d.' % len_itr) - for i, j in get_iters( - iterator, desc='Computing distance matrix', - file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr - ): - g1, g2 = graphs[i], graphs[j] - dis_matrix[i, j], _ = self.compute_ged(g1, g2, **kwargs) - dis_matrix[j, i] = dis_matrix[i, j] - return dis_matrix - - - def _compute_X_dm_imap_unordered(self, graphs, **kwargs): - """Compute GED distance matrix in parallel using imap_unordered. - """ - # This is very slow, maybe because of the Cython is involved. - from gklearn.utils.parallel import parallel_ged_mat - n = len(graphs) - dis_matrix = np.zeros((n, n)) - if self.verbose: - print('Graphs in total: %d.' % len(graphs)) - print('The total # of pairs is %d.' % int(n * (n + 1) / 2)) - - do_fun = self._wrapper_compute_ged - parallel_ged_mat( - do_fun, dis_matrix, graphs, init_worker=_init_worker_ged_mat, - glbv=(graphs,), n_jobs=self.n_jobs, verbose=self.verbose - ) - - - def _wrapper_compute_ged(self, itr): - i = itr[0] - j = itr[1] - # @TODO: repeats are not considered here. - dis, _ = self.compute_ged(G_gn[i], G_gn[j]) - return i, j, dis - - - # # imap_unordered returns an iterator of the results in the order - # # in which the function calls are started. - # # Note that imap_unordered may end up consuming all of the - # # available memory if the iterable is too large. - # n = len(graphs) - # dis_matrix = np.zeros((n, n)) - # iterator = combinations(range(n), 2) - # len_itr = int(n * (n + 1) / 2) - # pool = Pool(processes=self.n_jobs) - # for i, j in get_iters( - # iterator, desc='Computing distance matrix', - # file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr - # ): - # g1, g2 = graphs[i], graphs[j] - # dis_matrix[i, j], _ = pool.apply_async( - # self.compute_ged, (g1, g2) - # ).get() - # dis_matrix[j, i] = dis_matrix[i, j] - # pool.close() - # return dis_matrix - - def compute_ged(self, Gi, Gj, **kwargs): - """ - Compute GED between two graph according to edit_cost. - """ - ged_options = { - 'edit_cost': self.edit_cost_fun, - 'method': self.ed_method, - 'edit_cost_constants': self._edit_cost_constants - } - repeats = kwargs.get('repeats', 1) - dis, pi_forward, pi_backward = pairwise_ged( - Gi, Gj, ged_options, repeats=repeats - ) - # @TODO: Better to have a if here. - # if self.compute_n_eo: - # n_eo_tmp = get_nb_edit_operations( - # Gi, Gj, pi_forward, pi_backward, - # edit_cost=self.edit_cost_fun, - # node_labels=self.node_labels, edge_labels=self.edge_labels - # ) - # else: - # n_eo_tmp = None - # return dis, n_eo_tmp - return dis, None - - - # def _compute_kernel_list(self, g1, g_list): - # start_time = time.time() - - # if self.parallel == 'imap_unordered': - # kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) - # elif self.parallel is None: - # kernel_list = self._compute_kernel_list_series(g1, g_list) - # else: - # raise Exception('Parallel mode is not set correctly.') - - # self._run_time = time.time() - start_time - # if self.verbose: - # print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' 
- # % (len(g_list), self._run_time)) - - # return kernel_list - - # def _compute_kernel_list_series(self, g1, g_list): - # pass - - # def _compute_kernel_list_imap_unordered(self, g1, g_list): - # pass - - # def _compute_single_kernel(self, g1, g2): - # start_time = time.time() - - # kernel = self._compute_single_kernel_series(g1, g2) - - # self._run_time = time.time() - start_time - # if self.verbose: - # print('Graph kernel bewteen two graphs built in %s seconds.' % (self._run_time)) - - # return kernel - - # def _compute_single_kernel_series(self, g1, g2): - # pass - - def is_graph(self, graph): - if isinstance(graph, nx.Graph): - return True - if isinstance(graph, nx.DiGraph): - return True - if isinstance(graph, nx.MultiGraph): - return True - if isinstance(graph, nx.MultiDiGraph): - return True - return False - - - def __repr__(self): - return ( - f"{self.__class__.__name__}(" - f"optim_method={self.optim_method}, " - f"ed_method={self.ed_method}, " - f"edit_cost_fun={self.edit_cost_fun}, " - f"node_labels={self.node_labels}, " - f"edge_labels={self.edge_labels}, " - f"optim_options={self.optim_options}, " - f"init_edit_cost_constants={self.init_edit_cost_constants}, " - f"copy_graphs={self.copy_graphs}, " - f"parallel={self.parallel}, " - f"n_jobs={self.n_jobs}, " - f"verbose={self.verbose}, " - f"normalize={self.normalize}, " - f"run_time={self.run_time}" - f")" - ) - - - @property - def graphs(self): - return self._graphs - - - # @property - # def parallel(self): - # return self.parallel - - # @property - # def n_jobs(self): - # return self.n_jobs - - # @property - # def verbose(self): - # return self.verbose - - # @property - # def normalize(self): - # return self.normalize - - @property - def run_time(self): - return self._run_time - - - @property - def test_run_time(self): - return self._test_run_time - - - @property - def dis_matrix(self): - return self._dm_train - - @dis_matrix.setter - def dis_matrix(self, value): - self._dm_train = value - - - @property - def metric_matrix(self): - return self._dm_train - - - @metric_matrix.setter - def metric_matrix(self, value): - self._dm_train = value - - - @property - def edit_cost_constants(self): - return self._edit_cost_constants - - -# @property -# def gram_matrix_unnorm(self): -# return self._gram_matrix_unnorm - -# @gram_matrix_unnorm.setter -# def gram_matrix_unnorm(self, value): -# self._gram_matrix_unnorm = value + Parameters + ---------- + args : tuple + Positional arguments to pass to the GED model. - @property - def n_pairs(self): - """ - The number of pairs of graphs between which the GEDs are computed. - """ - try: - check_is_fitted(self, '_dm_train') - return len(self._dm_train) * (len(self._dm_train) - 1) / 2 - except NotFittedError: - return None + use_global_env : bool, optional + If True, use the global environment to import the GED model. Default is True. + kwargs : dict + Keyword arguments to pass to the GED model. -def _init_worker_ged_mat(gn_toshare): - global G_gn - G_gn = gn_toshare + Returns + ------- + GEDModel + A GED model instance. 
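+
+ Examples
+ --------
+ A minimal usage sketch. The factory's own name is not shown in this hunk,
+ so `create_ged_model` below is only a placeholder for it, and `graphs`
+ stands for any list of NetworkX graphs:
+
+ >>> model = create_ged_model(ed_method='BIPARTITE', use_global_env=True)
+ >>> dis_matrix = model.fit_transform(graphs)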
+ """ + if use_global_env: + from gklearn.ged.model.ged_model_global_env import GEDModel + else: + from gklearn.ged.model.ged_model_local_env import GEDModel + return GEDModel(*args, **kwargs) diff --git a/gklearn/ged/model/ged_model_bk.py b/gklearn/ged/model/ged_model_bk.py new file mode 100644 index 0000000000..ea6793f04c --- /dev/null +++ b/gklearn/ged/model/ged_model_bk.py @@ -0,0 +1,900 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu May 5 09:42:30 2022 + +@author: ljia +""" +import sys +import multiprocessing +import time +import numpy as np +import networkx as nx +from itertools import combinations +import multiprocessing +from multiprocessing import Pool + +# from abc import ABC, abstractmethod +from sklearn.base import BaseEstimator # , TransformerMixin +from sklearn.utils.validation import check_is_fitted # check_X_y, check_array, +from sklearn.exceptions import NotFittedError + +from gklearn.ged.model.distances import euclid_d +from gklearn.ged.util import pairwise_ged, get_nb_edit_operations +# from gklearn.utils import normalize_gram_matrix +from gklearn.utils import get_iters + + +# @TODO: it should be faster if creating a global env variable. +class GEDModel(BaseEstimator): # , ABC): + """The graph edit distance model class compatible with `scikit-learn`. + + Attributes + ---------- + _graphs : list + Stores the input graphs on fit input data. + Default format of the list objects is `NetworkX` graphs. + **We don't guarantee that the input graphs remain unchanged during the + computation.** + + Notes + ----- + This class uses the `gedlibpy` module to compute the graph edit distance. + + References + ---------- + https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. + """ + + + def __init__( + self, + ed_method='BIPARTITE', + edit_cost_fun='CONSTANT', + init_edit_cost_constants=[3, 3, 1, 3, 3, 1], + optim_method='init', + optim_options={'y_distance': euclid_d, 'mode': 'reg'}, + node_labels=[], + edge_labels=[], + parallel=None, + n_jobs=None, + chunksize=None, + # normalize=True, + copy_graphs=True, # make sure it is a full deep copy. and faster! + verbose=2 + ): + """`__init__` for `GEDModel` object.""" + # @todo: the default settings of the parameters are different from those in the self.compute method. + # self._graphs = None + self.ed_method = ed_method + self.edit_cost_fun = edit_cost_fun + self.init_edit_cost_constants = init_edit_cost_constants + self.optim_method = optim_method + self.optim_options = optim_options + self.node_labels = node_labels + self.edge_labels = edge_labels + self.parallel = parallel + self.n_jobs = ( + (multiprocessing.cpu_count() - 1) if n_jobs is None else n_jobs) + self.chunksize = chunksize + # self.normalize = normalize + self.copy_graphs = copy_graphs + self.verbose = verbose + + + # self._run_time = 0 + # self._gram_matrix = None + # self._gram_matrix_unnorm = None + + ########################################################################## + # The following is the 1st paradigm to compute GED distance matrix, which is + # compatible with `scikit-learn`. + ########################################################################## + + def fit(self, X, y=None, **kwargs): + """Fit a graph dataset for a transformer. + + Parameters + ---------- + X : iterable + DESCRIPTION. + + y : None, optional + There is no need of a target in a transformer, yet the `scikit-learn` + pipeline API requires this parameter. + + Returns + ------- + object + Returns self. 
+ + """ + # self._is_tranformed = False + + # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; + self.clear_attributes() + + # Validate parameters for the transformer. + self.validate_parameters() + + # Validate the input. + self._graphs = self.validate_input(X) + if y is not None: + self._targets = y + # self._targets = self.validate_input(y) + + # Compute edit cost constants. + self.compute_edit_costs(**kwargs) + + # self._X = X + # self._kernel = self._get_kernel_instance() + + # Return the transformer. + return self + + + def transform( + self, X=None, + return_dm_train=False, + save_dm_test=False, + return_dm_test=False, + **kwargs + ): + """Compute the graph kernel matrix between given and fitted data. + + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Raises + ------ + ValueError + DESCRIPTION. + + Returns + ------- + None. + + """ + # If `return_dm_train`, return the fitted GED distance matrix of training data. + if return_dm_train: + check_is_fitted(self, '_dm_train') + self._is_transformed = True + return self._dm_train # @TODO: copy or not? + + if return_dm_test: + check_is_fitted(self, '_dm_test') + return self._dm_test # @TODO: copy or not? + + # Check if method "fit" had been called. + check_is_fitted(self, '_graphs') + + # Validate the input. + Y = self.validate_input(X) + + # Transform: compute the graph kernel matrix. + dis_matrix = self.compute_distance_matrix(Y, **kwargs) + self._Y = Y + + # Self transform must appear before the diagonal call on normalization. + self._is_transformed = True # @TODO: When to set this to True? When return dm test? + # if self.normalize: + # X_diag, Y_diag = self.diagonals() + # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + # try: + # kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) + # except: + # raise + # finally: + # np.seterr(**old_settings) + + if save_dm_test: + self._dm_test = dis_matrix + # If the model is retransformed and the `save_dm_test` flag is not set, + # then remove the previously computed dm_test to prevent conflicts. + else: + if hasattr(self, '_dm_test'): + delattr(self, '_dm_test') + + return dis_matrix + + + def fit_transform( + self, + X, + y=None, + save_dm_train=False, + save_mm_train: bool = False, + **kwargs): + """Fit and transform: compute GED distance matrix on the same data. + + Parameters + ---------- + X : list of graphs + Input graphs. + + Returns + ------- + dis_matrix : numpy array, shape = [len(X), len(X)] + The distance matrix of X. + + """ + self.fit(X, y, **kwargs) + + # Transform: compute Gram matrix. + dis_matrix = self.compute_distance_matrix(**kwargs) + + # # Normalize. + # if self.normalize: + # self._X_diag = np.diagonal(gram_matrix).copy() + # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + # try: + # gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) + # except: + # raise + # finally: + # np.seterr(**old_settings) + + if save_mm_train or save_dm_train: + self._dm_train = dis_matrix + # If the model is refitted and the `save_dm_train` flag is not set, then + # remove the previously computed dm_train to prevent conflicts. 
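+ # (Presumably, `save_mm_train` is the metric-matrix alias of `save_dm_train`;
+ # either flag saves the computed matrix as `self._dm_train`.)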
+ else: + if hasattr(self, '_dm_train'): + delattr(self, '_dm_train') + + return dis_matrix + + + def get_params(self): + pass + + + def set_params(self): + pass + + + def clear_attributes(self): # @todo: update + # if hasattr(self, '_X_diag'): + # delattr(self, '_X_diag') + if hasattr(self, '_graphs'): + delattr(self, '_graphs') + if hasattr(self, '_Y'): + delattr(self, '_Y') + if hasattr(self, '_run_time'): + delattr(self, '_run_time') + if hasattr(self, '_test_run_time'): + delattr(self, '_test_run_time') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + if self.parallel == False: + self.parallel = None + elif self.parallel == True: + self.parallel = 'imap_unordered' + if self.parallel is not None and self.parallel != 'imap_unordered': + raise ValueError('Parallel mode is not set correctly.') + + if self.parallel == 'imap_unordered' and self.n_jobs is None: + self.n_jobs = multiprocessing.cpu_count() + + + def validate_input(self, X): + """Validate the given input and raise errors if it is invalid. + + Parameters + ---------- + X : list + The input to check. Should be a list of graph. + + Raises + ------ + ValueError + Raise if the input is not correct. + + Returns + ------- + X : list + The input. A list of graph. + + """ + if X is None: + raise ValueError('Please add graphs before computing.') + elif not isinstance(X, list): + raise ValueError('Cannot detect graphs. The input must be a list.') + elif len(X) == 0: + raise ValueError( + 'The graph list given is empty. No computation will be performed.' + ) + + return X + + + def compute_distance_matrix(self, Y=None, **kwargs): + """Compute the distance matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) or the distance matrix for the fitted + graphs (X / self._graphs). + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. The default is None. If None distance is computed + between X and itself. + + Returns + ------- + dis_matrix : numpy array, shape = [n_targets, n_inputs] + The computed distance matrix. + + """ + if Y is None: + # Compute metric matrix for self._graphs (X). + dis_matrix = self._compute_X_distance_matrix(**kwargs) + # self._gram_matrix_unnorm = np.copy(self._gram_matrix) + + else: + # Compute metric matrix between Y and self._graphs (X). + Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y) + graphs_copy = ( + [g.copy() for g in self._graphs] + if self.copy_graphs else self._graphs + ) + + start_time = time.time() + + if self.parallel == 'imap_unordered': + dis_matrix = self._compute_distance_matrix_imap_unordered(Y) + + elif self.parallel is None: + dis_matrix = self._compute_distance_matrix_series( + Y_copy, graphs_copy, **kwargs + ) + + self._test_run_time = time.time() - start_time + + if self.verbose: + print( + 'Distance matrix of size (%d, %d) built in %s seconds.' + % (len(Y), len(self._graphs), self._test_run_time) + ) + + return dis_matrix + + + def _compute_distance_matrix_series(self, X, Y, **kwargs): + """Compute the GED distance matrix between two sets of graphs (X and Y) + without parallelization. + + Parameters + ---------- + X, Y : list of graphs + The input graphs. + + Returns + ------- + dis_matrix : numpy array, shape = [n_X, n_Y] + The computed distance matrix. 
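+
+ Notes
+ -----
+ All pairs are computed one by one via `self.compute_ged`, i.e.,
+ len(X) * len(Y) GED computations; no symmetry is exploited here, since X
+ and Y are in general different sets.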
+ + """ + dis_matrix = np.zeros((len(X), len(Y))) + + for i_x, g_x in enumerate(X): + for i_y, g_y in enumerate(Y): + dis_matrix[i_x, i_y], _ = self.compute_ged(g_x, g_y, **kwargs) + + return dis_matrix + + + def _compute_kernel_matrix_imap_unordered(self, Y): + """Compute the kernel matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) using imap unordered parallelization. + + Parameters + ---------- + Y : list of graphs, optional + The target graphs. + + Returns + ------- + kernel_matrix : numpy array, shape = [n_targets, n_inputs] + The computed kernel matrix. + + """ + raise Exception('Parallelization for kernel matrix is not implemented.') + + + def diagonals(self): + """Compute the kernel matrix diagonals of the fit/transformed data. + + Returns + ------- + X_diag : numpy array + The diagonal of the kernel matrix between the fitted data. + This consists of each element calculated with itself. + + Y_diag : numpy array + The diagonal of the kernel matrix, of the transform. + This consists of each element calculated with itself. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, ['_graphs']) + + # Check if the diagonals of X exist. + try: + check_is_fitted(self, ['_X_diag']) + except NotFittedError: + # Compute diagonals of X. + self._X_diag = np.empty(shape=(len(self._graphs),)) + graphs = ([g.copy() for g in + self._graphs] if self.copy_graphs else self._graphs) + for i, x in enumerate(graphs): + self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? + + try: + # If transform has happened, return both diagonals. + check_is_fitted(self, ['_Y']) + self._Y_diag = np.empty(shape=(len(self._Y),)) + Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) + for (i, y) in enumerate(Y): + self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? + + return self._X_diag, self._Y_diag + except NotFittedError: + # Else just return both X_diag + return self._X_diag + + + # @abstractmethod + def pairwise_distance(self, x, y): + """Compute pairwise kernel between two graphs. + + Parameters + ---------- + x, y : NetworkX Graph. + Graphs bewteen which the kernel is computed. + + Returns + ------- + kernel: float + The computed kernel. + +# Notes +# ----- +# This method is abstract and must be implemented by a subclass. + + """ + raise NotImplementedError( + 'Pairwise kernel computation is not implemented!' + ) + + + def compute_edit_costs(self, Y=None, Y_targets=None, **kwargs): + """Compute edit cost constants. When optimizing method is `fiited`, + apply Jia2021's metric learning method by using a given target graphs (Y) + the fitted graphs (X / self._graphs). + + Parameters + ---------- + Y : TYPE, optional + DESCRIPTION. The default is None. + + Returns + ------- + None. + + """ + # Get or compute. + if self.optim_method == 'random': + self._edit_cost_constants = np.random.rand(6) + + elif self.optim_method == 'init': + self._edit_cost_constants = self.init_edit_cost_constants + + elif self.optim_method == 'expert': + self._edit_cost_constants = [3, 3, 1, 3, 3, 1] + + elif self.optim_method == 'fitted': # Jia2021 method + # Get proper inputs. + if Y is None: + check_is_fitted(self, ['_graphs']) + check_is_fitted(self, ['_targets']) + graphs = ([g.copy() for g in + self._graphs] if self.copy_graphs else self._graphs) + targets = self._targets + else: + graphs = ([g.copy() for g in Y] if self.copy_graphs else Y) + targets = Y_targets + + # Get optimization options. 
+ node_labels = self.node_labels + edge_labels = self.edge_labels + unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0) + repeats = kwargs.get('repeats', 1) + from gklearn.ged.model.optim_costs import compute_optimal_costs + self._edit_cost_constants = compute_optimal_costs( + graphs, targets, + node_labels=node_labels, edge_labels=edge_labels, + unlabeled=unlabeled, + init_costs=self.init_edit_cost_constants, + ed_method=self.ed_method, + edit_cost_fun=self.edit_cost_fun, + repeats=repeats, + rescue_optim_failure=False, + verbose=(self.verbose >= 2), + **self.optim_options + ) + + + ########################################################################## + # The following is the 2nd paradigm to compute kernel matrix. It is + # simplified and not compatible with `scikit-learn`. + ########################################################################## + + # def compute(self, *graphs, **kwargs): + # self.parallel = kwargs.get('parallel', 'imap_unordered') + # self.n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) + # self.normalize = kwargs.get('normalize', True) + # self.verbose = kwargs.get('verbose', 2) + # self.copy_graphs = kwargs.get('copy_graphs', True) + # self.save_unnormed = kwargs.get('save_unnormed', True) + # self.validate_parameters() + + # # If the inputs is a list of graphs. + # if len(graphs) == 1: + # if not isinstance(graphs[0], list): + # raise Exception('Cannot detect graphs.') + # elif len(graphs[0]) == 0: + # raise Exception('The graph list given is empty. No computation was performed.') + # else: + # if self.copy_graphs: + # self._graphs = [g.copy() for g in graphs[0]] # @todo: might be very slow. + # else: + # self._graphs = graphs + # self._gram_matrix = self._compute_gram_matrix() + + # if self.save_unnormed: + # self._gram_matrix_unnorm = np.copy(self._gram_matrix) + # if self.normalize: + # self._gram_matrix = normalize_gram_matrix(self._gram_matrix) + # return self._gram_matrix, self._run_time + + # elif len(graphs) == 2: + # # If the inputs are two graphs. + # if self.is_graph(graphs[0]) and self.is_graph(graphs[1]): + # if self.copy_graphs: + # G0, G1 = graphs[0].copy(), graphs[1].copy() + # else: + # G0, G1 = graphs[0], graphs[1] + # kernel = self._compute_single_kernel(G0, G1) + # return kernel, self._run_time + + # # If the inputs are a graph and a list of graphs. 
+ # elif self.is_graph(graphs[0]) and isinstance(graphs[1], list): + # if self.copy_graphs: + # g1 = graphs[0].copy() + # g_list = [g.copy() for g in graphs[1]] + # kernel_list = self._compute_kernel_list(g1, g_list) + # else: + # kernel_list = self._compute_kernel_list(graphs[0], graphs[1]) + # return kernel_list, self._run_time + + # elif isinstance(graphs[0], list) and self.is_graph(graphs[1]): + # if self.copy_graphs: + # g1 = graphs[1].copy() + # g_list = [g.copy() for g in graphs[0]] + # kernel_list = self._compute_kernel_list(g1, g_list) + # else: + # kernel_list = self._compute_kernel_list(graphs[1], graphs[0]) + # return kernel_list, self._run_time + + # else: + # raise Exception('Cannot detect graphs.') + + # elif len(graphs) == 0 and self._graphs is None: + # raise Exception('Please add graphs before computing.') + + # else: + # raise Exception('Cannot detect graphs.') + + # def normalize_gm(self, gram_matrix): + # import warnings + # warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning) + + # diag = gram_matrix.diagonal().copy() + # for i in range(len(gram_matrix)): + # for j in range(i, len(gram_matrix)): + # gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j]) + # gram_matrix[j][i] = gram_matrix[i][j] + # return gram_matrix + + # def compute_distance_matrix(self): + # if self._gram_matrix is None: + # raise Exception('Please compute the Gram matrix before computing distance matrix.') + # dis_mat = np.empty((len(self._gram_matrix), len(self._gram_matrix))) + # for i in range(len(self._gram_matrix)): + # for j in range(i, len(self._gram_matrix)): + # dis = self._gram_matrix[i, i] + self._gram_matrix[j, j] - 2 * self._gram_matrix[i, j] + # if dis < 0: + # if dis > -1e-10: + # dis = 0 + # else: + # raise ValueError('The distance is negative.') + # dis_mat[i, j] = np.sqrt(dis) + # dis_mat[j, i] = dis_mat[i, j] + # dis_max = np.max(np.max(dis_mat)) + # dis_min = np.min(np.min(dis_mat[dis_mat != 0])) + # dis_mean = np.mean(np.mean(dis_mat)) + # return dis_mat, dis_max, dis_min, dis_mean + + def _compute_X_distance_matrix(self, **kwargs): + graphs = ([g.copy() for g in + self._graphs] if self.copy_graphs else self._graphs) + + start_time = time.time() + + if self.parallel == 'imap_unordered': + dis_matrix = self._compute_X_dm_imap_unordered(graphs, **kwargs) + elif self.parallel is None: + dis_matrix = self._compute_X_dm_series(graphs, **kwargs) + else: + raise Exception('Parallel mode is not set correctly.') + + self._run_time = time.time() - start_time + + if self.verbose: + print( + 'Distance matrix of size %d built in %s seconds.' + % (len(self._graphs), self._run_time) + ) + + return dis_matrix + + + def _compute_X_dm_series(self, graphs, **kwargs): + n = len(graphs) + dis_matrix = np.zeros((n, n)) + + iterator = combinations(range(n), 2) + len_itr = int(n * (n - 1) / 2) + if self.verbose: + print('Graphs in total: %d.' % len(graphs)) + print('The total # of pairs is %d.' % len_itr) + for i, j in get_iters( + iterator, desc='Computing distance matrix', + file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + ): + g1, g2 = graphs[i], graphs[j] + dis_matrix[i, j], _ = self.compute_ged(g1, g2, **kwargs) + dis_matrix[j, i] = dis_matrix[i, j] + return dis_matrix + + + def _compute_X_dm_imap_unordered(self, graphs, **kwargs): + """Compute GED distance matrix in parallel using imap_unordered. + """ + # This is very slow, maybe because of the Cython is involved. 
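+ # (A plausible cause, judging from the notes in the newer parallel
+ # implementation later in this diff: GEDEnv Cython objects cannot be
+ # pickled, so every worker must rebuild its GED environment, and that
+ # setup cost dominates the run time.)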
+ from gklearn.utils.parallel import parallel_ged_mat + n = len(graphs) + dis_matrix = np.zeros((n, n)) + if self.verbose: + print('Graphs in total: %d.' % len(graphs)) + print('The total # of pairs is %d.' % int(n * (n + 1) / 2)) + + do_fun = self._wrapper_compute_ged + parallel_ged_mat( + do_fun, dis_matrix, graphs, init_worker=_init_worker_ged_mat, + glbv=(graphs,), n_jobs=self.n_jobs, verbose=self.verbose + ) + + + def _wrapper_compute_ged(self, itr): + i = itr[0] + j = itr[1] + # @TODO: repeats are not considered here. + dis, _ = self.compute_ged(G_gn[i], G_gn[j]) + return i, j, dis + + + # # imap_unordered returns an iterator of the results in the order + # # in which the function calls are started. + # # Note that imap_unordered may end up consuming all of the + # # available memory if the iterable is too large. + # n = len(graphs) + # dis_matrix = np.zeros((n, n)) + # iterator = combinations(range(n), 2) + # len_itr = int(n * (n + 1) / 2) + # pool = Pool(processes=self.n_jobs) + # for i, j in get_iters( + # iterator, desc='Computing distance matrix', + # file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + # ): + # g1, g2 = graphs[i], graphs[j] + # dis_matrix[i, j], _ = pool.apply_async( + # self.compute_ged, (g1, g2) + # ).get() + # dis_matrix[j, i] = dis_matrix[i, j] + # pool.close() + # return dis_matrix + + def compute_ged(self, Gi, Gj, **kwargs): + """ + Compute GED between two graph according to edit_cost. + """ + ged_options = { + 'edit_cost': self.edit_cost_fun, + 'method': self.ed_method, + 'edit_cost_constants': self._edit_cost_constants + } + repeats = kwargs.get('repeats', 1) + dis, pi_forward, pi_backward = pairwise_ged( + Gi, Gj, ged_options, repeats=repeats + ) + # @TODO: Better to have a if here. + # if self.compute_n_eo: + # n_eo_tmp = get_nb_edit_operations( + # Gi, Gj, pi_forward, pi_backward, + # edit_cost=self.edit_cost_fun, + # node_labels=self.node_labels, edge_labels=self.edge_labels + # ) + # else: + # n_eo_tmp = None + # return dis, n_eo_tmp + return dis, None + + + # def _compute_kernel_list(self, g1, g_list): + # start_time = time.time() + + # if self.parallel == 'imap_unordered': + # kernel_list = self._compute_kernel_list_imap_unordered(g1, g_list) + # elif self.parallel is None: + # kernel_list = self._compute_kernel_list_series(g1, g_list) + # else: + # raise Exception('Parallel mode is not set correctly.') + + # self._run_time = time.time() - start_time + # if self.verbose: + # print('Graph kernel bewteen a graph and a list of %d graphs built in %s seconds.' + # % (len(g_list), self._run_time)) + + # return kernel_list + + # def _compute_kernel_list_series(self, g1, g_list): + # pass + + # def _compute_kernel_list_imap_unordered(self, g1, g_list): + # pass + + # def _compute_single_kernel(self, g1, g2): + # start_time = time.time() + + # kernel = self._compute_single_kernel_series(g1, g2) + + # self._run_time = time.time() - start_time + # if self.verbose: + # print('Graph kernel bewteen two graphs built in %s seconds.' 
+ # % (self._run_time))
+
+ # return kernel
+
+ # def _compute_single_kernel_series(self, g1, g2):
+ # pass
+
+ def is_graph(self, graph):
+ if isinstance(graph, nx.Graph):
+ return True
+ if isinstance(graph, nx.DiGraph):
+ return True
+ if isinstance(graph, nx.MultiGraph):
+ return True
+ if isinstance(graph, nx.MultiDiGraph):
+ return True
+ return False
+
+
+ def __repr__(self):
+ return (
+ f"{self.__class__.__name__}("
+ f"optim_method={self.optim_method}, "
+ f"ed_method={self.ed_method}, "
+ f"edit_cost_fun={self.edit_cost_fun}, "
+ f"node_labels={self.node_labels}, "
+ f"edge_labels={self.edge_labels}, "
+ f"optim_options={self.optim_options}, "
+ f"init_edit_cost_constants={self.init_edit_cost_constants}, "
+ f"copy_graphs={self.copy_graphs}, "
+ f"parallel={self.parallel}, "
+ f"n_jobs={self.n_jobs}, "
+ f"verbose={self.verbose}, "
+ f"run_time={self.run_time}"
+ f")"
+ )
+
+
+ @property
+ def graphs(self):
+ return self._graphs
+
+
+ # @property
+ # def parallel(self):
+ # return self.parallel
+
+ # @property
+ # def n_jobs(self):
+ # return self.n_jobs
+
+ # @property
+ # def verbose(self):
+ # return self.verbose
+
+ # @property
+ # def normalize(self):
+ # return self.normalize
+
+ @property
+ def run_time(self):
+ return self._run_time
+
+
+ @property
+ def test_run_time(self):
+ return self._test_run_time
+
+
+ @property
+ def dis_matrix(self):
+ return self._dm_train
+
+ @dis_matrix.setter
+ def dis_matrix(self, value):
+ self._dm_train = value
+
+
+ @property
+ def metric_matrix(self):
+ return self._dm_train
+
+
+ @metric_matrix.setter
+ def metric_matrix(self, value):
+ self._dm_train = value
+
+
+ @property
+ def edit_cost_constants(self):
+ return self._edit_cost_constants
+
+
+# @property
+# def gram_matrix_unnorm(self):
+# return self._gram_matrix_unnorm
+
+# @gram_matrix_unnorm.setter
+# def gram_matrix_unnorm(self, value):
+# self._gram_matrix_unnorm = value
+
+ @property
+ def n_pairs(self):
+ """
+ The number of pairs of graphs between which the GEDs are computed.
+ """
+ try:
+ check_is_fitted(self, '_dm_train')
+ return len(self._dm_train) * (len(self._dm_train) - 1) // 2
+ except NotFittedError:
+ return None
+
+
+def _init_worker_ged_mat(gn_toshare):
+ global G_gn
+ G_gn = gn_toshare
diff --git a/gklearn/ged/model/ged_model_global_env.py b/gklearn/ged/model/ged_model_global_env.py
new file mode 100644
index 0000000000..360d45ddcb
--- /dev/null
+++ b/gklearn/ged/model/ged_model_global_env.py
@@ -0,0 +1,1483 @@
+"""
+ged_model_global_env
+
+The GEDModel class using a GEDEnv as a global environment inside the class.
+Compared to the previous local version, this implementation can be up to 25x faster with
+parallelization and up to 5x faster without it, but it is not very memory efficient.
+Check comments in `profile_ged_model.py` and `profile_ged_model_cross_matrix.py` in
+`gklearn/experiments/ged/ged_model/` for the performance comparison.
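+
+Example (a minimal sketch; `graphs` stands for any list of NetworkX graphs):
+
+    from gklearn.ged.model.ged_model_global_env import GEDModel
+
+    model = GEDModel(ed_method='BIPARTITE', edit_cost_fun='CONSTANT')
+    dis_matrix = model.fit_transform(graphs)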
+ +@Author: jajupmochi +@Date: Jun 06 2025 +""" +import multiprocessing +import os +import sys +import time +from contextlib import contextmanager +from functools import partial +from itertools import combinations, product +from multiprocessing import shared_memory + +import networkx as nx +import numpy as np +from gklearn.ged.model.distances import euclid_d +from gklearn.ged.util.util import ged_options_to_string +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError +from sklearn.utils.validation import check_is_fitted +from tqdm import tqdm + + +class GEDModel(BaseEstimator): # , ABC): + """The graph edit distance model class compatible with `scikit-learn`. + + Attributes + ---------- + _graphs : list + Stores the input graphs on fit input data. + Default format of the list objects is `NetworkX` graphs. + **We don't guarantee that the input graphs remain unchanged during the computation.** + + Notes + ----- + This class uses the `gedlibpy` module to compute the graph edit distance. + + References + ---------- + https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel. + """ + + + def __init__( + self, + env_type: str | None = None, + ed_method='BIPARTITE', + edit_cost_fun='CONSTANT', + init_edit_cost_constants=[3, 3, 1, 3, 3, 1], + edit_cost_config: dict = {}, + optim_method='init', + optim_options={'y_distance': euclid_d, 'mode': 'reg'}, + ged_init_options: dict | None = None, + node_labels=[], + edge_labels=[], + parallel=None, + n_jobs=None, + chunksize=None, + # normalize=True, + copy_graphs=True, # make sure it is a full deep copy. and faster! + verbose=2 + ): + """`__init__` for `GEDModel` object. + + Parameters + ---------- + env_type : str, optional + The type of the GED environment. Default is None. If None, try to determine + the type automatically based on the given graph node / edge labels. + + Available types are: + + - 'attr': Attribute-based environment (with complex node and edge labels). + Each node or edge can have multiple key-value label pairs, and each value can + be of the following types: int, float, str, list/np.ndarray of int or float. + This is the default type if no node or edge labels are provided. + + - 'gxl' or 'str': GXLLabel environment (with string labels). Each node or + edge can have multiple key-value label pairs, but all values must be strings. + The type will be set to GXL only if at least one node or edge label is + provided. + """ + # @todo: the default settings of the parameters are different from those in the self.compute method. + # self._graphs = None + self.env_type = env_type + self.ed_method = ed_method + self.edit_cost_fun = edit_cost_fun + self.init_edit_cost_constants = init_edit_cost_constants + self.edit_cost_config = edit_cost_config + self.optim_method = optim_method + self.optim_options = optim_options + self.ged_init_options = ged_init_options + self.node_labels = node_labels + self.edge_labels = edge_labels + self.parallel = parallel + self.n_jobs = ((multiprocessing.cpu_count() - 1) if n_jobs is None else n_jobs) + self.chunksize = chunksize + # self.normalize = normalize + self.copy_graphs = copy_graphs + self.verbose = verbose + + self._ged_env = None # The GED environment to use for the model. + self._graphs = None # The input graphs to the model. + self._is_transformed = False # Whether the model has been transformed. + self._run_time = 0 # The run time of the last computation. + self._Y = None # The target graphs for the model. 
+ self._dm_train = None # The distance matrix of the training data.
+ self._dm_test = None # The distance matrix of the test data.
+ self._edit_cost_constants = None # The edit cost constants for the model.
+ self._X_diag = None # The diagonal of the metric matrix for the training data (0's in this case).
+ self._Y_diag = None # The diagonal of the metric matrix for the test data (0's in this case).
+ self._targets = None # The targets for the model, if any.
+
+
+ ##########################################################################
+ # The following is the paradigm to compute GED distance matrix, which is
+ # compatible with `scikit-learn`.
+ ##########################################################################
+
+
+ def fit(self, X, y=None, **kwargs):
+ """Fit a graph dataset for a transformer.
+
+ Parameters
+ ----------
+ X : iterable
+ Input graphs (a list of NetworkX graphs).
+
+ y : None, optional
+ A target is not needed for a transformer, yet the `scikit-learn`
+ pipeline API requires this parameter.
+
+ kwargs : dict, optional
+ Additional parameters for the transformer, e.g., `repeats`, passed on
+ to the edit cost computation.
+
+ Returns
+ -------
+ object
+ Returns self.
+
+ """
+ # self._is_transformed = False
+
+ # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used;
+ self.clear_attributes()
+
+ # Validate parameters for the transformer.
+ self.validate_parameters()
+
+ # Validate the input.
+ self._graphs = self.validate_input(X)
+ if y is not None:
+ self._targets = y
+ # self._targets = self.validate_input(y)
+
+ # Compute edit cost constants.
+ self.compute_edit_costs(**kwargs)
+
+ # Create the GED environment if not set:
+ # Only do this if no parallelization will be used. Otherwise, a separate GEDEnv will be
+ # created in each worker during transform.
+ # todo: we plan to refactor this in the future for better performance.
+ if self.parallel is None:
+ # `self._edit_cost_constants` is needed from `self.compute_edit_costs` to initialize the
+ # GED environment.
+ self._ged_env = self.create_and_setup_ged_env(
+ self.env_type, graph=X[0],
+ **{
+ 'ed_method': self.ed_method,
+ 'edit_cost_fun': self.edit_cost_fun,
+ 'edit_cost_constants': self._edit_cost_constants,
+ 'edit_cost_config': self.edit_cost_config,
+ }
+ )
+ # Add graphs to the environment:
+ self.add_graphs_to_ged_env(
+ self._graphs, self._ged_env, self.verbose, **{'copy_graphs': self.copy_graphs}
+ )
+
+ # Return the transformer.
+ return self
+
+
+ def transform(
+ self,
+ X=None,
+ return_dm_train=False,
+ save_dm_test=False,
+ return_dm_test=False,
+ **kwargs
+ ):
+ """Compute the GED distance matrix between given and fitted data.
+
+ Parameters
+ ----------
+ X : list of graphs
+ The target graphs.
+
+ Raises
+ ------
+ ValueError
+ If the input is invalid.
+
+ Returns
+ -------
+ dis_matrix : numpy array, shape = [n_targets, n_inputs]
+ The computed distance matrix.
+
+ """
+ # If `return_dm_train`, return the fitted GED distance matrix of training data.
+ if return_dm_train:
+ check_is_fitted(self, '_dm_train')
+ self._is_transformed = True
+ return self._dm_train # @TODO: copy or not?
+
+ if return_dm_test:
+ check_is_fitted(self, '_dm_test')
+ return self._dm_test # @TODO: copy or not?
+
+ # Check if method "fit" had been called.
+ check_is_fitted(self, '_graphs')
+
+ # Validate the input.
+ Y = self.validate_input(X)
+
+ # Transform: compute the GED distance matrix.
+ dis_matrix = self.compute_distance_matrix(Y, **kwargs)
+ self._Y = Y
+
+ # Self transform must appear before the diagonal call on normalization.
+ self._is_transformed = True # @TODO: When to set this to True? When return dm test?
+ # if self.normalize: + # X_diag, Y_diag = self.diagonals() + # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + # try: + # kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) + # except: + # raise + # finally: + # np.seterr(**old_settings) + + if save_dm_test: + self._dm_test = dis_matrix + # If the model is retransformed and the `save_dm_test` flag is not set, + # then remove the previously computed dm_test to prevent conflicts. + else: + if hasattr(self, '_dm_test'): + delattr(self, '_dm_test') + + return dis_matrix + + + def fit_transform( + self, + X, + y=None, + save_dm_train=False, + save_mm_train: bool = False, + **kwargs + ): + """Fit and transform: compute GED distance matrix on the same data. + + Parameters + ---------- + X : list of graphs + Input graphs. + + Returns + ------- + dis_matrix : numpy array, shape = [len(X), len(X)] + The distance matrix of X. + + """ + self.fit(X, y, **kwargs) + + # Transform: compute Gram matrix. + dis_matrix = self.compute_distance_matrix(**kwargs) + + # # Normalize. + # if self.normalize: + # self._X_diag = np.diagonal(gram_matrix).copy() + # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + # try: + # gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) + # except: + # raise + # finally: + # np.seterr(**old_settings) + + if save_mm_train or save_dm_train: + self._dm_train = dis_matrix + # If the model is refitted and the `save_dm_train` flag is not set, then + # remove the previously computed dm_train to prevent conflicts. + else: + if hasattr(self, '_dm_train'): + delattr(self, '_dm_train') + + return dis_matrix + + + def get_params(self): + pass + + + def set_params(self): + pass + + + def clear_attributes(self): # @todo: update + # if hasattr(self, '_X_diag'): + # delattr(self, '_X_diag') + if hasattr(self, '_graphs'): + delattr(self, '_graphs') + if hasattr(self, '_Y'): + delattr(self, '_Y') + if hasattr(self, '_run_time'): + self._run_time = 0 + if hasattr(self, '_test_run_time'): + delattr(self, '_test_run_time') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + if self.parallel == False: + self.parallel = None + elif self.parallel == True: + self.parallel = 'imap_unordered' + if self.parallel is not None and self.parallel not in [ + 'imap_unordered', 'multiprocessing', 'joblib', 'concurrent' + ]: + raise ValueError('Parallel mode is not set correctly.') + + if self.parallel == 'imap_unordered' and self.n_jobs is None: + self.n_jobs = multiprocessing.cpu_count() + + + def validate_input(self, X): + """Validate the given input and raise errors if it is invalid. + + Parameters + ---------- + X : list + The input to check. Should be a list of graph. + + Raises + ------ + ValueError + Raise if the input is not correct. + + Returns + ------- + X : list + The input. A list of graph. + + """ + if X is None: + raise ValueError('Please add graphs before computing.') + elif not isinstance(X, list): + raise ValueError('Cannot detect graphs. The input must be a list.') + elif len(X) == 0: + raise ValueError( + 'The graph list given is empty. No computation will be performed.' + ) + + return X + + + def compute_distance_matrix(self, Y=None, **kwargs): + """Compute the distance matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) or the distance matrix for the fitted + graphs (X / self._graphs). 
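+ When `Y` is None, the self distance matrix of the fitted graphs is
+ computed (see `_compute_self_distance_matrix`); otherwise, a cross matrix
+ between `Y` and the fitted graphs is computed (see
+ `_compute_cross_distance_matrix`).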
+
+ Parameters
+ ----------
+ Y : list of graphs, optional
+ The target graphs. The default is None. If None, distances are computed
+ between X and itself.
+
+ Returns
+ -------
+ dis_matrix : numpy array, shape = [n_targets, n_inputs]
+ The computed distance matrix.
+
+ """
+ if Y is None:
+ # Compute metric matrix for self._graphs (X).
+ dis_matrix = self._compute_self_distance_matrix(**kwargs)
+
+ else:
+ # Graphs copying will be done when loading the graphs into the GEDEnv.
+ dis_matrix = self._compute_cross_distance_matrix(Y, **kwargs)
+
+ return dis_matrix
+
+
+ def diagonals(self):
+ """Compute the kernel matrix diagonals of the fit/transformed data.
+
+ Returns
+ -------
+ X_diag : numpy array
+ The diagonal of the kernel matrix between the fitted data.
+ This consists of each element calculated with itself.
+
+ Y_diag : numpy array
+ The diagonal of the kernel matrix of the transformed data.
+ This consists of each element calculated with itself.
+
+ """
+ # Check if method "fit" had been called.
+ check_is_fitted(self, ['_graphs'])
+
+ # Check if the diagonals of X exist.
+ try:
+ check_is_fitted(self, ['_X_diag'])
+ except NotFittedError:
+ # Compute diagonals of X.
+ self._X_diag = np.empty(shape=(len(self._graphs),))
+ graphs = ([g.copy() for g in
+ self._graphs] if self.copy_graphs else self._graphs)
+ for i, x in enumerate(graphs):
+ self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel?
+
+ try:
+ # If transform has happened, return both diagonals.
+ check_is_fitted(self, ['_Y'])
+ self._Y_diag = np.empty(shape=(len(self._Y),))
+ Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y)
+ for (i, y) in enumerate(Y):
+ self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel?
+
+ return self._X_diag, self._Y_diag
+ except NotFittedError:
+ # Else just return X_diag.
+ return self._X_diag
+
+
+ # @abstractmethod
+ def pairwise_distance(self, x, y):
+ """Compute the pairwise distance between two graphs.
+
+ Parameters
+ ----------
+ x, y : NetworkX Graph.
+ Graphs between which the distance is computed.
+
+ Returns
+ -------
+ distance: float
+ The computed distance.
+
+# Notes
+# -----
+# This method is abstract and must be implemented by a subclass.
+
+ """
+ raise NotImplementedError(
+ 'Pairwise distance computation is not implemented!'
+ )
+
+
+ def compute_edit_costs(self, Y=None, Y_targets=None, **kwargs):
+ # todo: this function is not optimized to use global environment.
+ """Compute edit cost constants. When the optimization method is 'fitted',
+ apply Jia2021's metric learning method using given target graphs (Y) or
+ the fitted graphs (X / self._graphs).
+
+ Parameters
+ ----------
+ Y : list of graphs, optional
+ Graphs used to fit the edit costs. The default is None, in which case
+ the fitted graphs and targets are used.
+
+ Returns
+ -------
+ None.
+
+ """
+ # Get or compute.
+ if self.optim_method == 'random':
+ self._edit_cost_constants = np.random.rand(6)
+
+ elif self.optim_method == 'init':
+ self._edit_cost_constants = self.init_edit_cost_constants
+
+ elif self.optim_method == 'expert':
+ self._edit_cost_constants = [3, 3, 1, 3, 3, 1]
+
+ elif self.optim_method == 'fitted': # Jia2021 method
+ # Get proper inputs.
+ if Y is None:
+ check_is_fitted(self, ['_graphs'])
+ check_is_fitted(self, ['_targets'])
+ graphs = ([g.copy() for g in
+ self._graphs] if self.copy_graphs else self._graphs)
+ targets = self._targets
+ else:
+ graphs = ([g.copy() for g in Y] if self.copy_graphs else Y)
+ targets = Y_targets
+
+ # Get optimization options.
+ node_labels = self.node_labels + edge_labels = self.edge_labels + unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0) + repeats = kwargs.get('repeats', 1) + from gklearn.ged.model.optim_costs import compute_optimal_costs + self._edit_cost_constants = compute_optimal_costs( + graphs, targets, + node_labels=node_labels, edge_labels=edge_labels, + unlabeled=unlabeled, + init_costs=self.init_edit_cost_constants, + ed_method=self.ed_method, + edit_cost_fun=self.edit_cost_fun, + repeats=repeats, + rescue_optim_failure=False, + verbose=(self.verbose >= 2), + **self.optim_options + ) + + + # %% Self distance matrix computation methods: + + + def _compute_self_distance_matrix(self, **kwargs): + # Graphs were loaded into GEDEnv beforehand. No need to copy again. + + start_time = time.time() + + # if self.parallel == 'imap_unordered': + # dis_matrix = self._compute_X_dm_imap_unordered(graphs, **kwargs) + if self.parallel in ['imap_unordered', 'joblib', 'concurrent', 'multiprocessing']: + dis_matrix = self._compute_self_distance_matrix_parallel(**kwargs) + elif self.parallel is None: + dis_matrix = self._compute_self_distance_matrix_series(**kwargs) + else: + raise Exception('Parallel mode is not set correctly.') + + self._run_time += time.time() - start_time + + if self.verbose: + print( + 'Distance matrix of size %d built in %s seconds.' + % (len(self._graphs), self._run_time) + ) + + return dis_matrix + + + def _compute_self_distance_matrix_series(self, **kwargs): + # We put the initialization of the GED environment here for these reasons: + # 1. To process the computation of costs between labels separately for series and parallel mode. + # 2. To include the time of this initialization in the total run time. + # 3. For cross distance matrix, target graphs (Y) need to be added to the environment. + self.init_ged_env_and_method(self._ged_env, **{'ged_init_options': self.ged_init_options}) + + graph_ids = self._ged_env.get_all_graph_ids() + n = len(graph_ids) + if n != len(self._graphs): + raise ValueError( + f'Number of graphs in the GEDEnv ({n}) does not match ' + f'number of input graphs in the GEDModel ({len(self._graphs)}).' + ) + + dis_matrix = np.zeros((n, n)) + iterator = combinations(range(n), 2) + len_itr = int(n * (n - 1) / 2) + if self.verbose: + print('Graphs in total: %d.' % n) + print('The total # of pairs is %d.' % len_itr) + # for i, j in get_iters( + # iterator, desc='Computing distance matrix', + # file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + # ): + if self.verbose >= 2: + iterator = tqdm( + iterator, desc='Computing distance matrix', + file=sys.stdout, total=len_itr + ) + for i, j in iterator: + gid1, gid2 = graph_ids[i], graph_ids[j] + dis_matrix[i, j], _ = GEDModel.pairwise_ged_with_gids( + gid1, gid2, self._ged_env, self._graphs, **kwargs + ) + dis_matrix[j, i] = dis_matrix[i, j] + return dis_matrix + + + # todo: this is not refactored yet. + def _compute_self_distance_matrix_parallel(self, **kwargs): + """ + Highly optimized parallelized version of distance matrix computation between graphs. + + Parameters: + ----------- + graphs : list + List of graph objects to compute pairwise distances + n_jobs : int, default=-1 + Number of parallel jobs. -1 means using all available cores. + chunk_size : int, default=None + Number of tasks per chunk. If None, will be auto-calculated. + memory_limit : str or int, default='auto' + Memory limit per worker in MB or 'auto' to determine automatically. 
+ method : str, default='joblib' + Parallelization backend: 'joblib', 'concurrent', or 'multiprocessing' + + Returns: + -------- + np.ndarray + Distance matrix of shape (n, n) + """ + n = len(self._graphs) + + # Get all pairs of indices + pairs = list(combinations(range(n), 2)) + len_itr = len(pairs) + + n_jobs = self.n_jobs + chunksize = self.chunksize + method = self.parallel + memory_limit = kwargs.get('memory_limit', 'auto') + + if self.verbose: + print('Graphs in total: %d.' % n) + print('The total # of pairs is %d.' % len_itr) + + # Determine the number of processes: + if n_jobs == -1: + n_jobs = os.cpu_count() - 1 + n_jobs = min(n_jobs, os.cpu_count(), len_itr) + + # Auto-calculate optimal chunk size if not provided + if chunksize is None: + # # this seems to be slightly faster when using `test_ged_model.py` + # # with 100 graphs (0.0012 s vs 0.0016 s per pair). Yet gets slower with + # # larger number of graphs (e.g., 1000) (~ 31 mins vs ~ 40 mins in total). + # if len_itr < 100 * n_jobs: + # chunksize = int(len_itr / n_jobs) + 1 + # else: + # chunksize = 100 + + # Balancing chunk size: larger chunks reduce overhead but limit load balancing + # A good heuristic is sqrt(len_itr / n_jobs) * 4 + chunksize = max(1, int(np.sqrt(len_itr / n_jobs) * 4)) + + if self.verbose >= 2: + print( + f"Running with {n_jobs} parallel processes and chunk size of {chunksize}" + ) + + # # For networkx graphs, we need to use a Manager to share them between processes: + # with Manager() as manager: + # # Create a managed shared list for the graphs + # # todo: + # # 1. This operation will serialize the graphs, which will make a deep copy of each graph, + # # so it is not efficient. + # # + # # 2. When using multiprocessing.Manager to share graphs, a separate manager process is launched + # # to hold the shared objects. Accessing these shared graphs from other processes involves + # # serialization (pickling), inter-process communication (IPC), and deserialization (unpickling), + # # which can be very costly for large NetworkX graphs. + # # + # # In contrast, if we use per-process global variables initialized via init_worker(), + # # each process gets a local copy of the graph data, which avoids the IPC overhead, + # # but requires duplicating memory (one full copy per worker). + # # + # # To compare the overheads: + # # - Using a Manager: + # # -- Every access to a graph (e.g., shared_graphs[i]) involves: pickle → IPC → unpickle. + # # -- Graphs are not truly shared in memory; they are proxied through the manager process. + # # Since we then create GEDEnv graphs, so no more pickling is needed. + # # + # # - Using global variables in worker init: + # # -- Graphs are copied once to each worker during process start (via memory fork or pickle). + # # -- After that, all access is purely local (no IPC, no further serialization). + # # -- This is faster at runtime but uses more memory. + # # + # # 3. Since manager uses a proxy object, it may cause issues when trying to modify + # # the graphs. 
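+        # A minimal comment-level sketch of the worker-global pattern discussed
+        # above (names hypothetical; the actual initializer used below is
+        # `init_worker_self_metric_matrix`): each worker builds its own
+        # environment once, so per-task access needs no IPC:
+        #
+        #     def _init_worker(graphs):
+        #         global g_env
+        #         g_env = build_env(graphs)  # hypothetical environment builder
+        #
+        #     with multiprocessing.Pool(
+        #             n_jobs, initializer=_init_worker, initargs=(graphs,)
+        #     ) as pool:
+        #         for res in pool.imap_unordered(task, pairs):
+        #             ...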
+        # 			shared_graphs = manager.list(self._graphs)
+
+        # Get a function reference to compute_ged that can be pickled
+        # Using a Python trick to make the instance method picklable
+        compute_ged_func = partial(GEDModel.pairwise_ged_with_gids_parallel, **kwargs)
+
+        # Create a shared memory array for results
+        with numpy_shared_memory((n, n), dtype=np.float64) as (dis_matrix, shm_name):
+
+            # Create a partial function with fixed arguments - must use module-level function
+            worker = partial(
+                self._process_pair_worker,
+                shm_name=shm_name,
+                matrix_shape=(n, n),
+                compute_ged_func=compute_ged_func,
+                **kwargs
+            )
+
+            try:
+                # Three different parallelization options for different scenarios
+                if method == 'joblib':
+                    raise NotImplementedError(
+                        'Joblib parallelization is not implemented yet. '
+                        'Please use "multiprocessing".'
+                    )
+
+                elif method == 'concurrent':
+                    # Option 2: ProcessPoolExecutor - cleaner API, slightly faster for CPU-bound tasks.
+                    # Threads instead of processes would be needed to share pre-created
+                    # Cython objects:
+                    raise NotImplementedError(
+                        'concurrent parallelization is not implemented yet. '
+                        'Please use "multiprocessing".'
+                    )
+
+                elif method in ['imap_unordered', 'multiprocessing']:
+                    # Option 3: multiprocessing.Pool with imap_unordered - more control, classic approach.
+                    # Does not work with pre-created GEDEnv Cython objects:
+                    # TypeError: no default __reduce__ due to non-trivial __cinit__.
+                    # So create a GEDEnv for each worker during its initialization.
+                    # todo: maybe it is better to parallelize directly in C++,
+                    #  e.g., with pybind11 and OpenMP.
+
+                    if self.verbose >= 2:
+                        print('Using multiprocessing imap_unordered.')
+
+                    init_kwargs = {
+                        'ed_method': self.ed_method,
+                        'edit_cost_fun': self.edit_cost_fun,
+                        'edit_cost_constants': self._edit_cost_constants,
+                        'edit_cost_config': self.edit_cost_config,
+                        'ged_init_options': self.ged_init_options,
+                        # Do not copy graphs here; they are already copied in the worker:
+                        'copy_graphs': False,
+                    }
+
+
+                    # todo: we could control which part of the graphs each worker
+                    #  processes, but it is not worth the effort for now.
+                    def init_worker_self_metric_matrix(graphs):
+                        """Initialize each worker process with a GED environment."""
+                        global g_ged_env  # <- This will be created for each worker
+                        global g_graphs
+                        g_graphs = graphs  # Set the graphs for the worker
+                        g_ged_env = GEDModel.create_and_init_ged_env_for_parallel(g_graphs, **init_kwargs)
+
+
+                    with multiprocessing.Pool(
+                            processes=n_jobs, initializer=init_worker_self_metric_matrix,
+                            initargs=(self._graphs,)
+                    ) as pool:
+                        if self.verbose >= 2:
+                            results = list(
+                                tqdm(
+                                    pool.imap_unordered(worker, pairs, chunksize=chunksize),
+                                    total=len_itr,
+                                    desc='Computing distance matrix',
+                                    file=sys.stdout
+                                )
+                            )
+                        else:
+                            results = list(
+                                pool.imap_unordered(worker, pairs, chunksize=chunksize)
+                            )
+
+                else:
+                    raise ValueError(
+                        f"Unsupported parallelization method: {method}."
+                    )
+
+                # Copy the result from shared memory to a regular numpy array
+                result = dis_matrix.copy()
+
+            except Exception as e:
+                # Make sure we log any errors that occur during parallel execution
+                if self.verbose:
+                    print(f"Error during parallel execution: {e}.")
+                raise
+
+        # The shared-memory context manager cleans up its buffer here.
+
+        return result
+
+
+    @staticmethod
+    def _process_pair_worker(pair, shm_name, matrix_shape, compute_ged_func, **kwargs):
+        """Worker function that processes a pair of graphs and updates the shared matrix.
+ Must be defined at module level to be picklable.""" + # # test only: + # print(f'[{multiprocessing.current_process().name}] Processing pair: {pair}.') + + i, j = pair + + try: + # Access the shared memory + existing_shm = shared_memory.SharedMemory(name=shm_name) + shared_matrix = np.ndarray( + matrix_shape, dtype=np.float64, buffer=existing_shm.buf + ) + + # Compute distance using the function reference + distance, _ = compute_ged_func(i, j, **kwargs) + + # Update the matrix + shared_matrix[i, j] = distance + shared_matrix[j, i] = distance + + finally: + # Clean up local shared memory reference + if 'existing_shm' in locals(): + existing_shm.close() + + return i, j, distance # Return for progress tracking + + + # %% Cross distance matrix computation methods: + + + def _compute_cross_distance_matrix(self, graphs_t: nx.Graph, **kwargs): + start_time = time.time() + + if self.parallel in ['imap_unordered', 'joblib', 'concurrent', 'multiprocessing']: + dis_matrix = self._compute_distance_matrix_parallel_unified( + self._graphs, graphs_t, **kwargs + ) + + elif self.parallel is None: + dis_matrix = self._compute_cross_distance_matrix_series( + self._graphs, graphs_t, **kwargs + ) + else: + raise Exception('Parallel mode is not set correctly.') + + self._run_time += time.time() - start_time + + if self.verbose: + print( + 'Distance matrix of size (%d, %d) built in %s seconds.' + % (len(graphs_t), len(self._graphs), self._run_time) + ) + + return dis_matrix + + + def _compute_cross_distance_matrix_series( + self, graphs_f: list[nx.Graph], graphs_t: list[nx.Graph], **kwargs + ): + """Compute the GED distance matrix between two sets of graphs (X and Y) + without parallelization. + + Parameters + ---------- + graphs_f : list of graphs + The fitted graphs (X / self._graphs). + + graphs_t : list of graphs + The target graphs (Y). + + Returns + ------- + dis_matrix : numpy array, shape = [n_Y, n_X] + The computed distance matrix. + """ + # Add graphs to the environment: + self.add_graphs_to_ged_env( + graphs_t, self._ged_env, self.verbose, **{'copy_graphs': self.copy_graphs} + ) + # We put the initialization of the GED environment here for these reasons: + # 1. To process the computation of costs between labels separately for series and parallel mode. + # 2. To include the time of this initialization in the total run time. + # 3. For cross distance matrix, target graphs (Y) need to be added to the environment. + self.init_ged_env_and_method(self._ged_env, **{'ged_init_options': self.ged_init_options}) + + n_f = len(graphs_f) + n_t = len(graphs_t) + n_graphs_in_env = self._ged_env.get_num_graphs() + if n_graphs_in_env != n_f + n_t: + raise ValueError( + f'Number of graphs in the GEDEnv ({n_graphs_in_env}) does not match ' + f'the total number of fitted and target graphs in the GEDModel ({n_f} + {n_t} = {n_f + n_t}).' 
+ ) + + # Initialize distance matrix with zeros + dis_matrix = np.zeros((n_t, n_f)) + iterator = product(range(n_f), range(n_t)) + len_itr = n_f * n_t + if self.verbose: + print(f'Computing distances between {n_t} and {n_f} graphs.') + print(f'The total # of pairs is {len_itr}.') + + # Use tqdm for progress bar if verbose: + if self.verbose >= 2: + iterator = tqdm( + iterator, desc='Computing distance matrix', file=sys.stdout, total=len_itr + ) + # for i, j in get_iters( + # iterator, desc='Computing distance matrix', + # file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + # ): + for i_f, j_t in iterator: + gid_f, gid_t = i_f, j_t + dis_matrix[j_t, i_f], _ = self.pairwise_ged_with_gids( + gid_f, gid_t, self._ged_env, graphs_f + graphs_t, **kwargs + ) + + return dis_matrix + + + def _compute_distance_matrix_parallel_unified( + self, graphs_f, graphs_t: nx.Graph | None = None, **kwargs + ): + """Compute the GED distance matrix between two sets of graphs (X and Y) + with parallelization. + + Parameters + ---------- + graphs_f : list of graphs + The fitted graphs (X). + + graphs_t : list of graphs + The target graphs (Y). If None, the distance is computed between + the fitted graphs (X) and itself. + + + Returns + ------- + dis_matrix : numpy array, shape = [n_Y, n_X] + The computed distance matrix. + + References + ---------- + This method is written with the help of the Claude 3.7 Sonnet AI, accessed on 2025.05.15. + + todo: this can be merged with the _compute_X_dm_parallel method. + """ + # Handle the case where graphs2 is not provided + is_same_set = graphs_t is None + if is_same_set: + graphs_t = graphs_f + + n_f = len(graphs_f) + n_t = len(graphs_t) + + # Get all pairs of indices to compute + if is_same_set: + # Only compute the upper triangular portion for efficiency when comparing within same set + pairs = list(combinations(range(n_f), 2)) + else: + # Compute all pairs when comparing between different sets: + # Notice this has different order (fiited / col first) as the matrix (target / row first): + pairs = list(product(range(n_f), range(n_t))) + + len_itr = len(pairs) + + n_jobs = self.n_jobs + chunksize = self.chunksize + method = self.parallel + # memory_limit = kwargs.get('memory_limit', 'auto') + + if self.verbose: + if is_same_set: + print(f'Graphs in total: {n_f}.') + else: + print(f'Computing distances between {n_t} and {n_f} graphs.') + print(f'The total # of pairs is {len_itr}.') + + # Determine the number of workers: + if n_jobs == -1 or n_jobs is None: + n_jobs = os.cpu_count() - 1 + n_jobs = min(n_jobs, os.cpu_count(), len_itr) + + # Auto-calculate optimal chunk size if not provided + if chunksize is None: + # # this seems to be slightly faster when using `test_ged_model.py` + # # with 100 graphs (0.0012 s vs 0.0016 s per pair). Yet gets slower with + # # larger number of graphs (e.g., 1000) (~ 31 mins vs ~ 40 mins in total). + # if len_itr < 100 * n_jobs: + # chunksize = int(len_itr / n_jobs) + 1 + # else: + # chunksize = 100 + + # Balancing chunk size: larger chunks reduce overhead but limit load balancing + # A good heuristic is sqrt(len_itr / n_jobs) * 4 + chunksize = max(1, int(np.sqrt(len_itr / n_jobs) * 4)) + + if self.verbose >= 2: + print( + f"Running with {n_jobs} parallel processes and chunk size of {chunksize}..." 
+            )
+
+        # Get a function reference to compute_ged that can be pickled
+        # Using a Python trick to make the instance method picklable
+        compute_ged_func = partial(
+            GEDModel.pairwise_ged_with_gids_parallel, is_same_set=is_same_set, **kwargs
+        )
+
+        # Create a shared memory array for results
+        with numpy_shared_memory((n_t, n_f), dtype=np.float64) as (dis_matrix, shm_name):
+            # Create a partial function with fixed arguments - MUST NOT use an
+            # inline function here, as it won't be picklable:
+            worker = partial(
+                self._process_pair_worker_unified,
+                shm_name=shm_name,
+                matrix_shape=(n_t, n_f),
+                compute_ged_func=compute_ged_func,
+                is_same_set=is_same_set,
+                **kwargs
+            )
+
+            try:
+                # Three different parallelization options for different scenarios
+                if method == 'joblib':
+                    raise NotImplementedError(
+                        'Joblib parallelization is not implemented yet. '
+                        'Please use "multiprocessing".'
+                    )
+
+                elif method == 'concurrent':
+                    # Option 2: ProcessPoolExecutor - cleaner API, slightly faster for CPU-bound tasks
+                    raise NotImplementedError(
+                        'concurrent parallelization is not implemented yet. '
+                        'Please use "multiprocessing".'
+                    )
+
+                elif method in ['imap_unordered', 'multiprocessing']:
+                    # Option 3: multiprocessing.Pool with imap_unordered - more control, classic approach
+                    if self.verbose >= 2:
+                        print('Using multiprocessing imap_unordered.')
+
+                    init_kwargs = {
+                        'ed_method': self.ed_method,
+                        'edit_cost_fun': self.edit_cost_fun,
+                        'edit_cost_constants': self._edit_cost_constants,
+                        'edit_cost_config': self.edit_cost_config,
+                        'ged_init_options': self.ged_init_options,
+                        # Do not copy graphs here; they are already copied in the worker:
+                        'copy_graphs': False,
+                    }
+
+
+                    def init_worker_cross_metric_matrix(graphs_f, graphs_t):
+                        """Initialize each worker process with a GED environment."""
+                        global g_ged_env  # <- This will be created for each worker
+                        global g_graphs_f
+                        global g_graphs_t
+                        g_graphs_f = graphs_f  # Set the graphs for the worker
+                        g_graphs_t = graphs_t
+                        g_ged_env = GEDModel.create_and_init_ged_env_for_parallel(
+                            g_graphs_f + g_graphs_t, **init_kwargs
+                        )
+
+
+                    with multiprocessing.Pool(
+                            processes=n_jobs, initializer=init_worker_cross_metric_matrix,
+                            initargs=(graphs_f, graphs_t,)
+                    ) as pool:
+                        if self.verbose >= 2:
+                            results = list(
+                                tqdm(
+                                    pool.imap_unordered(worker, pairs, chunksize=chunksize),
+                                    total=len_itr,
+                                    desc='Computing distance matrix',
+                                    file=sys.stdout
+                                )
+                            )
+                        else:
+                            results = list(pool.imap_unordered(worker, pairs, chunksize=chunksize))
+
+                else:
+                    raise ValueError(
+                        f"Unsupported parallelization method: {method}."
+ ) + + # Copy the result from shared memory to a regular numpy array + result = dis_matrix.copy() + + except Exception as e: + # Make sure we log any errors that occur during parallel execution + if self.verbose: + print(f"Error during parallel execution: {e}.") + raise + + # At this point, the Manager will automatically clean up shared resources + + return result + + + # %% Parallelization methods: + + + @staticmethod + def create_and_init_ged_env_for_parallel(graphs: list[nx.Graph], **kwargs): + """Create and initialize a GED environment for parallel processing.""" + # Create a new GEDEnv instance for each worker + ged_env = GEDModel.create_and_setup_ged_env(graph=graphs[0], **kwargs) + # print(f'[{multiprocessing.current_process().name}] ') + # print(ged_env) + + # Add all graphs to the environment: + GEDModel.add_graphs_to_ged_env(graphs, ged_env, verbose=0, **kwargs) + # print('fnished adding graphs to the GEDEnv in worker.') + # print(ged_env.get_all_graph_ids()) + GEDModel.init_ged_env_and_method(ged_env, **kwargs) + + graph_ids = ged_env.get_all_graph_ids() + n = len(graph_ids) + if n != len(graphs): + raise ValueError( + f'Number of graphs in the GEDEnv ({n}) does not match ' + f'number of graphs set from GEDModel to the worker ({len(graphs)}).' + ) + return ged_env + + + @staticmethod + def _process_pair_worker_unified( + pair, shm_name, matrix_shape, compute_ged_func, is_same_set=True, **kwargs + ): + """Worker function that processes a pair of graphs and updates the shared matrix. + Must be defined at module level to be picklable.""" + i_f, j_t = pair # Indices of the fitted and target graphs in the original lists in GEDModel + + try: + # Access the shared memory + existing_shm = shared_memory.SharedMemory(name=shm_name) + shared_matrix = np.ndarray(matrix_shape, dtype=np.float64, buffer=existing_shm.buf) + + # Compute distance using the function reference + distance, _ = compute_ged_func(i_f, j_t, **kwargs) + + # Update the matrix + shared_matrix[j_t, i_f] = distance + + # If computing within the same set, update symmetric position: + if is_same_set and i_f != j_t: + shared_matrix[i_f, j_t] = distance + + finally: + # Clean up local shared memory reference + if 'existing_shm' in locals(): + existing_shm.close() + + return i_f, j_t, distance # Return for progress tracking + + + @staticmethod + def pairwise_ged_with_gids_parallel( + graph_id_f: int, graph_id_t: int, is_same_set: bool = True, **kwargs + ): + global g_ged_env # <- Use the global GEDEnv created in the worker initializer + if is_same_set: + global g_graphs + graphs1, graphs2 = g_graphs, None + else: + global g_graphs_f, g_graphs_t + graphs1, graphs2 = g_graphs_f, g_graphs_t + + dis, _ = GEDModel.pairwise_ged_with_gids( + graph_id_f, graph_id_t, g_ged_env, graphs1, + is_same_set=is_same_set, graphs2=graphs2, **kwargs + ) + + return dis, None + + + # %% GEDEnv related methods: + + + @staticmethod + def get_env_type(graph: nx.Graph | None = None): + """ + Check the environment type of the graph. + If `env_type` is set on initialization, return it. + Otherwise, check the given graph's node and edge labels to determine the type. + + Only one node and one edge are checked to determine the type. + This function expects that all nodes have the same type of labels, so as all + edges. + """ + if graph is None: + raise ValueError( + 'Graph is not provided while `env_type` not set on initialization. ' + 'Cannot determine environment type.' 
+            )
+        # Use the 'gxl' env type only if all node and edge labels are strings, and
+        # at least one node or edge label is present:
+        one_n_labels = graph.nodes[list(graph.nodes)[0]]
+        for k, v in one_n_labels.items():
+            if not isinstance(v, str):
+                return 'attr'
+        if nx.number_of_edges(graph) != 0:
+            one_e_labels = graph.edges[list(graph.edges)[0]]
+            for k, v in one_e_labels.items():
+                if not isinstance(v, str):
+                    return 'attr'
+        if len(one_n_labels) > 0 or (
+                nx.number_of_edges(graph) != 0 and len(one_e_labels) > 0
+        ):
+            return 'gxl'
+        return 'attr'
+
+
+    @staticmethod
+    def create_and_setup_ged_env(env_type: str | None = None, graph: nx.Graph = None, **kwargs):
+        """
+        Create and set up the GED environment.
+
+        Notes
+        -----
+        `GEDEnv.init()` and `GEDEnv.init_method()` must be called after all graphs are
+        added to the GEDEnv. They are not called here.
+        """
+        from gklearn.gedlib import gedlibpy
+
+        if env_type is None:
+            env_type = GEDModel.get_env_type(graph=graph)
+        ged_options = {
+            'env_type': env_type,
+            'edit_cost': kwargs['edit_cost_fun'],
+            'method': kwargs['ed_method'],
+            'edit_cost_constants': kwargs['edit_cost_constants'],
+            'edit_cost_config': kwargs['edit_cost_config'],
+        }
+
+        ged_env = gedlibpy.GEDEnv(env_type=ged_options.get('env_type', 'attr'), verbose=False)
+        # Only pass `edit_cost_config` on when it is non-empty:
+        extra_config = (
+            {'edit_cost_config': ged_options['edit_cost_config']}
+            if ged_options.get('edit_cost_config') else {}
+        )
+        ged_env.set_edit_cost(
+            ged_options['edit_cost'],
+            edit_cost_constant=ged_options['edit_cost_constants'],
+            **extra_config
+        )
+
+        ged_env.set_method(ged_options['method'], ged_options_to_string(ged_options))
+
+        return ged_env
+
+
+    @staticmethod
+    def add_graphs_to_ged_env(graphs: list[nx.Graph], ged_env, verbose: int = 1, **kwargs):
+        # `init()` and `init_method()` must be called after all graphs are added to the GEDEnv.
+
+        iterator = enumerate(graphs)
+        if verbose >= 2:
+            iterator = tqdm(
+                iterator, desc='Adding graphs to the GED environment',
+                file=sys.stdout, total=len(graphs)
+            )
+        for i, g in iterator:
+            GEDModel.add_graph_to_ged_env(g.copy() if kwargs['copy_graphs'] else g, ged_env=ged_env)
+
+
+    @staticmethod
+    def add_graph_to_ged_env(graph: nx.Graph, ged_env):
+        ged_env.add_nx_graph(graph, '', ignore_duplicates=True)
+
+
+    @staticmethod
+    def init_ged_env_and_method(ged_env, **kwargs):
+        # `init()` must be called after all graphs are added to the GEDEnv:
+        # todo: determine which is faster: lazy or eager. Maybe do this automatically.
+        #  (eager cannot show a progress bar):
+        init_options = 'LAZY_WITHOUT_SHUFFLED_COPIES' if kwargs['ged_init_options'] is None else \
+            kwargs['ged_init_options']
+        if init_options.startswith('EAGER_'):
+            print('Starting eager label cost computation. This may take a while...')
+            ged_env.init(init_options)
+            print('Eager label cost computation finished.')
+        else:
+            ged_env.init(init_options)
+        # `init_method()` must be called after `init()`:
+        ged_env.init_method()
+
+
+    @staticmethod
+    def pairwise_ged_with_gids(
+            graph_id1: int, graph_id2: int, ged_env, graphs: list[nx.Graph],
+            is_same_set: bool = True, graphs2: list[nx.Graph] | None = None, **kwargs
+    ):
+        """
+        Compute the pairwise GED between two graphs using their IDs in the GEDEnv.
+
+        The given `ged_env` must already be initialized and contain both graphs.
+
+        Parameters
+        ----------
+        graph_id1 : int
+            ID of the first graph in the GEDEnv. If `is_same_set` is False, it refers to the fitted
+            (reference) graph.
+
+        graph_id2 : int
+            ID of the second graph in the GEDEnv. If `is_same_set` is False, it refers to the target
+            graph.
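+
+        ged_env : gedlibpy.GEDEnv
+            An initialized GED environment that already contains both graphs.
+
+        graphs : list of NetworkX graphs
+            The graphs indexed by `graph_id1` (and by `graph_id2` when
+            `is_same_set` is True).
+
+        is_same_set : bool, optional
+            Whether both IDs refer to the same graph list. The default is True.
+
+        graphs2 : list of NetworkX graphs, optional
+            The target graphs indexed by `graph_id2` when `is_same_set` is False.
+
+        Returns
+        -------
+        dis : float
+            The smallest upper bound on the GED found over `repeats` runs.
+
+        n_eo : None
+            Placeholder for the number of edit operations (not computed here).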
+ + Notes + ----- + - Be careful with the order between `graph_id1` and `graph_id2`. When `is_same_set` = False, + `graph_id1` is the fitted (reference) graph and `graph_id2` is the target graph. + + Todo + ---- + - Since GED is not normally symmetric, maybe add an option to compute the average of the two + - distances (forward and backward) or the minimum of the two distances. + """ + repeats = kwargs.get('repeats', 1) + + dis_min = np.inf + + if is_same_set: + graph_id2_env = graph_id2 + else: + graph_id2_env = len(graphs) + graph_id2 # Both graph lists were added to the GEDEnv. + + for i in range(0, repeats): + ged_env.run_method(graph_id1, graph_id2_env) + upper = ged_env.get_upper_bound(graph_id1, graph_id2_env) + dis = upper + # print(dis) + if dis < dis_min: + dis_min = dis + pi_forward = ged_env.get_forward_map(graph_id1, graph_id2_env) + pi_backward = ged_env.get_backward_map(graph_id1, graph_id2_env) + # lower = ged_env.get_lower_bound(g, h) + + # make the map label correct (label remove mappings as np.inf): + if is_same_set: + g1, g2 = graphs[graph_id1], graphs[graph_id2] + else: + g1, g2 = graphs[graph_id1], graphs2[graph_id2] + nodes1 = [n for n in g1.nodes()] + nodes2 = [n for n in g2.nodes()] + nb1 = nx.number_of_nodes(g1) + nb2 = nx.number_of_nodes(g2) + pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward] + pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward] + # print(pi_forward) + + # @TODO: Better to have a if here. + # if self.compute_n_eo: + # n_eo_tmp = get_nb_edit_operations( + # Gi, Gj, pi_forward, pi_backward, + # edit_cost=self.edit_cost_fun, + # node_labels=self.node_labels, edge_labels=self.edge_labels + # ) + # else: + # n_eo_tmp = None + # return dis, n_eo_tmp + return dis, None + + + # %% + + def is_graph(self, graph): + if isinstance(graph, nx.Graph): + return True + if isinstance(graph, nx.DiGraph): + return True + if isinstance(graph, nx.MultiGraph): + return True + if isinstance(graph, nx.MultiDiGraph): + return True + return False + + + def __repr__(self): + return ( + f"{self.__class__.__name__}(" + f"optim_method={self.optim_method}, " + f"ed_method={self.ed_method}, " + f"edit_cost_fun={self.edit_cost_fun}, " + f"node_labels={self.node_labels}, " + f"edge_labels={self.edge_labels}, " + f"optim_options={self.optim_options}, " + f"init_edit_cost_constants={self.init_edit_cost_constants}, " + f"copy_graphs={self.copy_graphs}, " + f"parallel={self.parallel}, " + f"n_jobs={self.n_jobs}, " + f"verbose={self.verbose}, " + + (f"normalize={self.normalize}, " if hasattr(self, 'normalize') else "") + + f"run_time={self.run_time}" + f")" + ) + + + @property + def graphs(self): + return self._graphs + + + # @property + # def parallel(self): + # return self.parallel + + # @property + # def n_jobs(self): + # return self.n_jobs + + # @property + # def verbose(self): + # return self.verbose + + # @property + # def normalize(self): + # return self.normalize + + @property + def run_time(self): + return self._run_time + + + @property + def test_run_time(self): + return self._test_run_time + + + @property + def dis_matrix(self): + return self._dm_train + + + @dis_matrix.setter + def dis_matrix(self, value): + self._dm_train = value + + + @property + def metric_matrix(self): + return self._dm_train + + + @metric_matrix.setter + def metric_matrix(self, value): + self._dm_train = value + + + @property + def edit_cost_constants(self): + return self._edit_cost_constants + + + # @property + # def gram_matrix_unnorm(self): + # return 
self._gram_matrix_unnorm
+
+    # @gram_matrix_unnorm.setter
+    # def gram_matrix_unnorm(self, value):
+    # 	self._gram_matrix_unnorm = value
+
+    @property
+    def n_pairs(self):
+        """
+        The number of pairs of graphs between which the GEDs are computed.
+        """
+        try:
+            check_is_fitted(self, '_dm_train')
+            return len(self._dm_train) * (len(self._dm_train) - 1) / 2
+        except NotFittedError:
+            return None
+
+
+# Context manager for shared memory with automatic cleanup
+@contextmanager
+def numpy_shared_memory(shape, dtype=np.float64):
+    """Create a numpy array in shared memory that automatically cleans up."""
+    size = int(np.prod(shape)) * np.dtype(dtype).itemsize
+    shm = shared_memory.SharedMemory(create=True, size=size)
+    try:
+        array = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
+        array.fill(0)  # Initialize with zeros
+        yield array, shm.name
+    finally:
+        shm.close()
+        shm.unlink()
diff --git a/gklearn/ged/model/ged_model_local_env.py b/gklearn/ged/model/ged_model_local_env.py
new file mode 100644
index 0000000000..f7841de5d1
--- /dev/null
+++ b/gklearn/ged/model/ged_model_local_env.py
@@ -0,0 +1,1314 @@
+"""
+ged_model_local_env
+
+The GEDModel class that creates a GEDEnv locally inside the pairwise distance computation
+for each pair of graphs. This can be slightly more time efficient in some cases, but is
+generally very slow.
+
+Check the comments in `profile_ged_model.py` and `profile_ged_model_cross_matrix.py` in
+`gklearn/experiments/ged/ged_model/` for the performance comparison.
+
+@Author: jajupmochi
+@Date: Jun 06 2025
+"""
+import gc
+import multiprocessing
+import os
+import sys
+import time
+from concurrent.futures import ProcessPoolExecutor
+from contextlib import contextmanager
+from functools import partial
+from itertools import combinations, product
+from multiprocessing import shared_memory, Manager
+
+import joblib
+import networkx as nx
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.exceptions import NotFittedError
+from sklearn.utils.validation import check_is_fitted
+from tqdm import tqdm
+
+from gklearn.ged.model.distances import euclid_d
+from gklearn.ged.util.util import ged_options_to_string
+from gklearn.utils import get_iters
+
+
+class GEDModel(BaseEstimator):  # , ABC):
+    """The graph edit distance model class compatible with `scikit-learn`.
+
+    Attributes
+    ----------
+    _graphs : list
+        Stores the input graphs passed to `fit`.
+        The default format of the list objects is `NetworkX` graphs.
+        **We don't guarantee that the input graphs remain unchanged during the
+        computation.**
+
+    Notes
+    -----
+    This class uses the `gedlibpy` module to compute the graph edit distance.
+
+    References
+    ----------
+    https://ysig.github.io/GraKeL/0.1a8/_modules/grakel/kernels/kernel.html#Kernel.
+    """
+
+
+    def __init__(
+            self,
+            env_type: str | None = None,
+            ed_method='BIPARTITE',
+            edit_cost_fun='CONSTANT',
+            init_edit_cost_constants=[3, 3, 1, 3, 3, 1],
+            edit_cost_config: dict = {},
+            optim_method='init',
+            optim_options={'y_distance': euclid_d, 'mode': 'reg'},
+            ged_init_options=None,
+            node_labels=[],
+            edge_labels=[],
+            parallel=None,
+            n_jobs=None,
+            chunksize=None,
+            # normalize=True,
+            copy_graphs=True,  # make sure it is a full deep copy. and faster!
+            verbose=2
+    ):
+        """`__init__` for the `GEDModel` object.
+
+        Parameters
+        ----------
+        env_type : str, optional
+            The type of the GED environment. The default is None. If None, the type
+            is determined automatically based on the given graph's node / edge labels.
+ + Available types are: + + - 'attr': Attribute-based environment (with complex node and edge labels). + Each node or edge can have multiple key-value label pairs, and each value can + be of the following types: int, float, str, list/np.ndarray of int or float. + This is the default type if no node or edge labels are provided. + + - 'gxl' or 'str': GXLLabel environment (with string labels). Each node or + edge can have multiple key-value label pairs, but all values must be strings. + The type will be set to GXL only if at least one node or edge label is + provided. + """ + # @todo: the default settings of the parameters are different from those in the self.compute method. + # self._graphs = None + self.env_type = env_type + self.ed_method = ed_method + self.edit_cost_fun = edit_cost_fun + self.init_edit_cost_constants = init_edit_cost_constants + self.edit_cost_config = edit_cost_config + self.optim_method = optim_method + self.optim_options = optim_options + self.node_labels = node_labels + self.edge_labels = edge_labels + self.parallel = parallel + self.n_jobs = ( + (multiprocessing.cpu_count() - 1) if n_jobs is None else n_jobs) + self.chunksize = chunksize + # self.normalize = normalize + self.copy_graphs = copy_graphs + self.verbose = verbose + + + ########################################################################## + # The following is the 1st paradigm to compute GED distance matrix, which is + # compatible with `scikit-learn`. + ########################################################################## + + def fit(self, X, y=None, **kwargs): + """Fit a graph dataset for a transformer. + + Parameters + ---------- + X : iterable + DESCRIPTION. + + y : None, optional + There is no need of a target in a transformer, yet the `scikit-learn` + pipeline API requires this parameter. + + Returns + ------- + object + Returns self. + + """ + # self._is_tranformed = False + + # Clear any prior attributes stored on the estimator, # @todo: unless warm_start is used; + self.clear_attributes() + + # Validate parameters for the transformer. + self.validate_parameters() + + # Validate the input. + self._graphs = self.validate_input(X) + if y is not None: + self._targets = y + # self._targets = self.validate_input(y) + + # Compute edit cost constants. + self.compute_edit_costs(**kwargs) + + # self._X = X + # self._kernel = self._get_kernel_instance() + + # Return the transformer. + return self + + + def transform( + self, + X=None, + return_dm_train=False, + save_dm_test=False, + return_dm_test=False, + **kwargs + ): + """Compute the graph kernel matrix between given and fitted data. + + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Raises + ------ + ValueError + DESCRIPTION. + + Returns + ------- + None. + + """ + # If `return_dm_train`, return the fitted GED distance matrix of training data. + if return_dm_train: + check_is_fitted(self, '_dm_train') + self._is_transformed = True + return self._dm_train # @TODO: copy or not? + + if return_dm_test: + check_is_fitted(self, '_dm_test') + return self._dm_test # @TODO: copy or not? + + # Check if method "fit" had been called. + check_is_fitted(self, '_graphs') + + # Validate the input. + Y = self.validate_input(X) + + # Transform: compute the graph kernel matrix. + dis_matrix = self.compute_distance_matrix(Y, **kwargs) + self._Y = Y + + # Self transform must appear before the diagonal call on normalization. + self._is_transformed = True # @TODO: When to set this to True? When return dm test? 
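+        # Typical end-to-end usage, as a sketch (`graphs_train` and `graphs_test`
+        # are hypothetical lists of NetworkX graphs):
+        #
+        #     model = GEDModel(ed_method='BIPARTITE', edit_cost_fun='CONSTANT')
+        #     dm_train = model.fit_transform(graphs_train, save_dm_train=True)
+        #     dm_test = model.transform(graphs_test)  # shape: (len(test), len(train))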
+ # if self.normalize: + # X_diag, Y_diag = self.diagonals() + # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + # try: + # kernel_matrix /= np.sqrt(np.outer(Y_diag, X_diag)) + # except: + # raise + # finally: + # np.seterr(**old_settings) + + if save_dm_test: + self._dm_test = dis_matrix + # If the model is retransformed and the `save_dm_test` flag is not set, + # then remove the previously computed dm_test to prevent conflicts. + else: + if hasattr(self, '_dm_test'): + delattr(self, '_dm_test') + + return dis_matrix + + + def fit_transform( + self, + X, + y=None, + save_dm_train=False, + save_mm_train: bool = False, + **kwargs + ): + """Fit and transform: compute GED distance matrix on the same data. + + Parameters + ---------- + X : list of graphs + Input graphs. + + Returns + ------- + dis_matrix : numpy array, shape = [len(X), len(X)] + The distance matrix of X. + + """ + self.fit(X, y, **kwargs) + + # Transform: compute Gram matrix. + dis_matrix = self.compute_distance_matrix(**kwargs) + + # # Normalize. + # if self.normalize: + # self._X_diag = np.diagonal(gram_matrix).copy() + # old_settings = np.seterr(invalid='raise') # Catch FloatingPointError: invalid value encountered in sqrt. + # try: + # gram_matrix /= np.sqrt(np.outer(self._X_diag, self._X_diag)) + # except: + # raise + # finally: + # np.seterr(**old_settings) + + if save_mm_train or save_dm_train: + self._dm_train = dis_matrix + # If the model is refitted and the `save_dm_train` flag is not set, then + # remove the previously computed dm_train to prevent conflicts. + else: + if hasattr(self, '_dm_train'): + delattr(self, '_dm_train') + + return dis_matrix + + + def get_params(self): + pass + + + def set_params(self): + pass + + + def clear_attributes(self): # @todo: update + # if hasattr(self, '_X_diag'): + # delattr(self, '_X_diag') + if hasattr(self, '_graphs'): + delattr(self, '_graphs') + if hasattr(self, '_Y'): + delattr(self, '_Y') + if hasattr(self, '_run_time'): + delattr(self, '_run_time') + if hasattr(self, '_test_run_time'): + delattr(self, '_test_run_time') + + + def validate_parameters(self): + """Validate all parameters for the transformer. + + Returns + ------- + None. + + """ + if self.parallel == False: + self.parallel = None + elif self.parallel == True: + self.parallel = 'imap_unordered' + if self.parallel is not None and self.parallel not in [ + 'imap_unordered', 'multiprocessing', 'joblib', 'concurrent' + ]: + raise ValueError('Parallel mode is not set correctly.') + + if self.parallel == 'imap_unordered' and self.n_jobs is None: + self.n_jobs = multiprocessing.cpu_count() + + + def validate_input(self, X): + """Validate the given input and raise errors if it is invalid. + + Parameters + ---------- + X : list + The input to check. Should be a list of graph. + + Raises + ------ + ValueError + Raise if the input is not correct. + + Returns + ------- + X : list + The input. A list of graph. + + """ + if X is None: + raise ValueError('Please add graphs before computing.') + elif not isinstance(X, list): + raise ValueError('Cannot detect graphs. The input must be a list.') + elif len(X) == 0: + raise ValueError( + 'The graph list given is empty. No computation will be performed.' + ) + + return X + + + def compute_distance_matrix(self, Y=None, **kwargs): + """Compute the distance matrix between a given target graphs (Y) and + the fitted graphs (X / self._graphs) or the distance matrix for the fitted + graphs (X / self._graphs). 
+ + Parameters + ---------- + Y : list of graphs, optional + The target graphs. The default is None. If None distance is computed + between X and itself. + + Returns + ------- + dis_matrix : numpy array, shape = [n_targets, n_inputs] + The computed distance matrix. + + """ + if Y is None: + # Compute metric matrix for self._graphs (X). + dis_matrix = self._compute_X_distance_matrix(**kwargs) + # self._gram_matrix_unnorm = np.copy(self._gram_matrix) + + else: + # Compute metric matrix between Y and self._graphs (X). + Y_copy = ([g.copy() for g in Y] if self.copy_graphs else Y) + graphs_copy = ( + [g.copy() for g in self._graphs] + if self.copy_graphs else self._graphs + ) + + start_time = time.time() + + if self.parallel in [ + 'imap_unordered', 'joblib', 'concurrent', 'multiprocessing' + ]: + dis_matrix = self._compute_cross_distance_matrix_parallel( + Y_copy, graphs_copy, **kwargs + ) + + elif self.parallel is None: + dis_matrix = self._compute_cross_distance_matrix_series( + Y_copy, graphs_copy, **kwargs + ) + else: + raise Exception('Parallel mode is not set correctly.') + + self._run_time = time.time() - start_time + + if self.verbose: + print( + 'Distance matrix of size (%d, %d) built in %s seconds.' + % (len(Y), len(self._graphs), self._run_time) + ) + + return dis_matrix + + + def diagonals(self): + """Compute the kernel matrix diagonals of the fit/transformed data. + + Returns + ------- + X_diag : numpy array + The diagonal of the kernel matrix between the fitted data. + This consists of each element calculated with itself. + + Y_diag : numpy array + The diagonal of the kernel matrix, of the transform. + This consists of each element calculated with itself. + + """ + # Check if method "fit" had been called. + check_is_fitted(self, ['_graphs']) + + # Check if the diagonals of X exist. + try: + check_is_fitted(self, ['_X_diag']) + except NotFittedError: + # Compute diagonals of X. + self._X_diag = np.empty(shape=(len(self._graphs),)) + graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + for i, x in enumerate(graphs): + self._X_diag[i] = self.pairwise_kernel(x, x) # @todo: parallel? + + try: + # If transform has happened, return both diagonals. + check_is_fitted(self, ['_Y']) + self._Y_diag = np.empty(shape=(len(self._Y),)) + Y = ([g.copy() for g in self._Y] if self.copy_graphs else self._Y) + for (i, y) in enumerate(Y): + self._Y_diag[i] = self.pairwise_kernel(y, y) # @todo: parallel? + + return self._X_diag, self._Y_diag + except NotFittedError: + # Else just return both X_diag + return self._X_diag + + + # @abstractmethod + def pairwise_distance(self, x, y): + """Compute pairwise kernel between two graphs. + + Parameters + ---------- + x, y : NetworkX Graph. + Graphs bewteen which the kernel is computed. + + Returns + ------- + kernel: float + The computed kernel. + +# Notes +# ----- +# This method is abstract and must be implemented by a subclass. + + """ + raise NotImplementedError( + 'Pairwise kernel computation is not implemented!' + ) + + + def compute_edit_costs(self, Y=None, Y_targets=None, **kwargs): + """Compute edit cost constants. When optimizing method is `fiited`, + apply Jia2021's metric learning method by using a given target graphs (Y) + the fitted graphs (X / self._graphs). + + Parameters + ---------- + Y : TYPE, optional + DESCRIPTION. The default is None. + + Returns + ------- + None. + + """ + # Get or compute. 
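+        # A sketch of how the 'fitted' branch below is typically reached
+        # (hypothetical data; `graphs` is a list of NetworkX graphs and `y` their
+        # regression targets):
+        #
+        #     model = GEDModel(optim_method='fitted',
+        #                      optim_options={'y_distance': euclid_d, 'mode': 'reg'})
+        #     model.fit(graphs, y)          # calls compute_edit_costs() internally
+        #     model.edit_cost_constants     # -> the learned cost constants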
+ if self.optim_method == 'random': + self._edit_cost_constants = np.random.rand(6) + + elif self.optim_method == 'init': + self._edit_cost_constants = self.init_edit_cost_constants + + elif self.optim_method == 'expert': + self._edit_cost_constants = [3, 3, 1, 3, 3, 1] + + elif self.optim_method == 'fitted': # Jia2021 method + # Get proper inputs. + if Y is None: + check_is_fitted(self, ['_graphs']) + check_is_fitted(self, ['_targets']) + graphs = ([g.copy() for g in self._graphs] if self.copy_graphs else self._graphs) + targets = self._targets + else: + graphs = ([g.copy() for g in Y] if self.copy_graphs else Y) + targets = Y_targets + + # Get optimization options. + node_labels = self.node_labels + edge_labels = self.edge_labels + unlabeled = (len(node_labels) == 0 and len(edge_labels) == 0) + repeats = kwargs.get('repeats', 1) + from gklearn.ged.model.optim_costs import compute_optimal_costs + self._edit_cost_constants = compute_optimal_costs( + graphs, targets, + node_labels=node_labels, edge_labels=edge_labels, + unlabeled=unlabeled, + init_costs=self.init_edit_cost_constants, + ed_method=self.ed_method, + edit_cost_fun=self.edit_cost_fun, + repeats=repeats, + rescue_optim_failure=False, + verbose=(self.verbose >= 2), + **self.optim_options + ) + + + def _compute_X_distance_matrix(self, **kwargs): + graphs = ( + [ + g.copy() for g in self._graphs + ] if self.copy_graphs else self._graphs + ) + + start_time = time.time() + + # if self.parallel == 'imap_unordered': + # dis_matrix = self._compute_X_dm_imap_unordered(graphs, **kwargs) + if self.parallel in [ + 'imap_unordered', 'joblib', 'concurrent', 'multiprocessing' + ]: + dis_matrix = self._compute_X_dm_parallel(graphs, **kwargs) + elif self.parallel is None: + dis_matrix = self._compute_X_dm_series(graphs, **kwargs) + else: + raise Exception('Parallel mode is not set correctly.') + + self._run_time = time.time() - start_time + + if self.verbose: + print( + 'Distance matrix of size %d built in %s seconds.' + % (len(self._graphs), self._run_time) + ) + + return dis_matrix + + + def _compute_X_dm_series(self, graphs, **kwargs): + n = len(graphs) + dis_matrix = np.zeros((n, n)) + + iterator = combinations(range(n), 2) + len_itr = int(n * (n - 1) / 2) + if self.verbose: + print('Graphs in total: %d.' % len(graphs)) + print('The total # of pairs is %d.' % len_itr) + for i, j in get_iters( + iterator, desc='Computing distance matrix', + file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + ): + g1, g2 = graphs[i], graphs[j] + dis_matrix[i, j] = self.compute_ged(g1, g2, **kwargs) + dis_matrix[j, i] = dis_matrix[i, j] + return dis_matrix + + + # todo: this is not refactored yet. + def _compute_X_dm_parallel(self, graphs, **kwargs): + """ + Highly optimized parallelized version of distance matrix computation between graphs. + + Parameters: + ----------- + graphs : list + List of graph objects to compute pairwise distances + n_jobs : int, default=-1 + Number of parallel jobs. -1 means using all available cores. + chunk_size : int, default=None + Number of tasks per chunk. If None, will be auto-calculated. + memory_limit : str or int, default='auto' + Memory limit per worker in MB or 'auto' to determine automatically. 
+ method : str, default='joblib' + Parallelization backend: 'joblib', 'concurrent', or 'multiprocessing' + + Returns: + -------- + np.ndarray + Distance matrix of shape (n, n) + """ + n = len(graphs) + + # Get all pairs of indices + pairs = list(combinations(range(n), 2)) + len_itr = len(pairs) + + n_jobs = self.n_jobs + chunksize = self.chunksize + method = self.parallel + memory_limit = kwargs.get('memory_limit', 'auto') + + if self.verbose: + print('Graphs in total: %d.' % len(graphs)) + print('The total # of pairs is %d.' % len_itr) + + # Determine number of processes + if n_jobs == -1: + n_jobs = os.cpu_count() - 1 + n_jobs = min(n_jobs, os.cpu_count(), len_itr) + + # Auto-calculate optimal chunk size if not provided + if chunksize is None: + # # this seems to be slightly faster when using `test_ged_model.py` + # # with 100 graphs (0.0012 s vs 0.0016 s per pair). Yet gets slower with + # # larger number of graphs (e.g., 1000) (~ 31 mins vs ~ 40 mins in total). + # if len_itr < 100 * n_jobs: + # chunksize = int(len_itr / n_jobs) + 1 + # else: + # chunksize = 100 + + # Balancing chunk size: larger chunks reduce overhead but limit load balancing + # A good heuristic is sqrt(len_itr / n_jobs) * 4 + chunksize = max(1, int(np.sqrt(len_itr / n_jobs) * 4)) + + if self.verbose >= 2: + print( + f"Running with {n_jobs} parallel processes and chunk size of {chunksize}" + ) + + # For networkx graphs, we need to use a Manager to share them between processes + with Manager() as manager: + # Create a managed shared list for the graphs + shared_graphs = manager.list(graphs) + + # Get a function reference to compute_ged that can be pickled + # Using a Python trick to make the instance method picklable + compute_ged_func = self.compute_ged + + # Create a shared memory array for results + with numpy_shared_memory((n, n), dtype=np.float64) as ( + dis_matrix, shm_name + ): + + # Create a partial function with fixed arguments - must use module-level function + worker = partial( + self._process_pair_worker, + graphs_manager=shared_graphs, + shm_name=shm_name, + matrix_shape=(n, n), + compute_ged_func=compute_ged_func, + **kwargs + ) + + try: + # Force garbage collection before starting parallel processing + gc.collect() + + # Three different parallelization options for different scenarios + if method == 'joblib': + if memory_limit == 'auto': + # Set max_nbytes according to the size of the shared memory: + # Get the size of the shared memory in bytes: + shm = shared_memory.SharedMemory(name=shm_name) + memory_limit = shm.size + shm.close() + if self.verbose >= 2: + print( + f"Setting memory limit to {memory_limit} bytes per process." 
+                                )
+
+                        # Option 1: joblib - great for large datasets, memory control, possible caching
+                        with joblib.parallel_backend(
+                                'loky', n_jobs=n_jobs, inner_max_num_threads=1,
+                                mmap_mode='r', temp_folder='/tmp'
+                        ):
+                            results = joblib.Parallel(
+                                verbose=self.verbose >= 2,
+                                prefer="processes",
+                                batch_size=chunksize,
+                                pre_dispatch='2*n_jobs',
+                                max_nbytes=memory_limit
+                            )(
+                                joblib.delayed(worker)(pair) for pair in pairs
+                            )
+
+                    elif method == 'concurrent':
+                        # Option 2: ProcessPoolExecutor - cleaner API, slightly faster for CPU-bound tasks
+                        with ProcessPoolExecutor(
+                                max_workers=n_jobs
+                        ) as executor:
+                            futures = [executor.submit(worker, pair) for pair in pairs]
+
+                            # Track progress if verbose
+                            if self.verbose >= 2:
+                                results = []
+                                for f in tqdm(
+                                        futures, total=len(futures),
+                                        desc='Computing distance matrix'
+                                ):
+                                    results.append(f.result())
+                            else:
+                                results = [f.result() for f in futures]
+
+                    elif method in ['imap_unordered', 'multiprocessing']:
+                        # Option 3: multiprocessing.Pool with imap_unordered - more control, classic approach
+                        with multiprocessing.Pool(processes=n_jobs) as pool:
+                            if self.verbose >= 2:
+                                results = list(
+                                    tqdm(
+                                        pool.imap_unordered(
+                                            worker, pairs,
+                                            chunksize=chunksize
+                                        ),
+                                        total=len_itr,
+                                        desc='Computing distance matrix',
+                                        file=sys.stdout
+                                    )
+                                )
+                            else:
+                                results = list(
+                                    pool.imap_unordered(
+                                        worker, pairs, chunksize=chunksize
+                                    )
+                                )
+
+                    else:
+                        raise ValueError(
+                            f"Unsupported parallelization method: {method}."
+                        )
+
+                    # Copy the result from shared memory to a regular numpy array
+                    result = dis_matrix.copy()
+
+                except Exception as e:
+                    # Make sure we log any errors that occur during parallel execution
+                    if self.verbose:
+                        print(f"Error during parallel execution: {e}.")
+                    raise
+
+        # At this point, the Manager will automatically clean up shared resources
+
+        return result
+
+
+    @staticmethod
+    def _process_pair_worker(
+            pair, graphs_manager, shm_name, matrix_shape,
+            compute_ged_func, **kwargs
+    ):
+        """Worker function that processes a pair of graphs and updates the shared matrix.
+        Must be picklable, so it is defined as a staticmethod with no closure state."""
+        i, j = pair
+
+        # Access the shared graphs from the manager
+        g1 = graphs_manager[i]
+        g2 = graphs_manager[j]
+
+        try:
+            # Access the shared memory
+            existing_shm = shared_memory.SharedMemory(name=shm_name)
+            shared_matrix = np.ndarray(
+                matrix_shape, dtype=np.float64, buffer=existing_shm.buf
+            )
+
+            # Compute distance using the function reference
+            distance = compute_ged_func(g1, g2, **kwargs)
+
+            # Update the matrix
+            shared_matrix[i, j] = distance
+            shared_matrix[j, i] = distance
+
+        finally:
+            # Clean up local shared memory reference
+            if 'existing_shm' in locals():
+                existing_shm.close()
+
+        return i, j, distance  # Return for progress tracking
+
+
+    def _compute_cross_distance_matrix_series(self, graphs1, graphs2, **kwargs):
+        """Compute the GED distance matrix between two sets of graphs (X and Y)
+        without parallelization.
+
+        Parameters
+        ----------
+        graphs1, graphs2 : list of graphs
+            The input graphs.
+
+        Returns
+        -------
+        dis_matrix : numpy array, shape = [len(graphs1), len(graphs2)]
+            The computed distance matrix.
+ + """ + n1 = len(graphs1) + n2 = len(graphs2) + + # Initialize distance matrix with zeros + dis_matrix = np.zeros((n1, n2)) + + # Cross set case: compute all pairs between the two sets + iterator = product(range(n1), range(n2)) + len_itr = n1 * n2 + + if self.verbose: + print(f'Computing distances between {n1} and {n2} graphs.') + print(f'The total # of pairs is {len_itr}.') + + for i, j in get_iters( + iterator, desc='Computing distance matrix', + file=sys.stdout, verbose=(self.verbose >= 2), length=len_itr + ): + g1, g2 = graphs1[i], graphs2[j] + dis_matrix[i, j] = self.compute_ged(g1, g2, **kwargs) + + return dis_matrix + + + def _compute_cross_distance_matrix_parallel( + self, graphs1, graphs2, **kwargs + ): + """Compute the GED distance matrix between two sets of graphs (X and Y) + with parallelization. + + Parameters + ---------- + X, Y : list of graphs + The input graphs. + + Returns + ------- + dis_matrix : numpy array, shape = [n_X, n_Y] + The computed distance matrix. + + References + ---------- + This method is written with the help of the Claude 3.7 Sonnet AI, + accessed on 2025.05.15. + + + todo: this can be merged with the _compute_X_dm_parallel method. + """ + # Handle the case where graphs2 is not provided + is_same_set = graphs2 is None + if is_same_set: + graphs2 = graphs1 + + n1 = len(graphs1) + n2 = len(graphs2) + + # Get all pairs of indices to compute + if is_same_set: + # Only compute upper triangular portion for efficiency when comparing within same set + pairs = list(combinations(range(n1), 2)) + else: + # Compute all pairs when comparing between different sets + pairs = list(product(range(n1), range(n2))) + + len_itr = len(pairs) + + n_jobs = self.n_jobs + chunksize = self.chunksize + method = self.parallel + memory_limit = kwargs.get('memory_limit', 'auto') + + if self.verbose: + if is_same_set: + print(f'Graphs in total: {n1}.') + else: + print(f'Computing distances between {n1} and {n2} graphs.') + print(f'The total # of pairs is {len_itr}.') + + # Determine number of processes + if n_jobs == -1: + n_jobs = os.cpu_count() - 1 + n_jobs = min(n_jobs, os.cpu_count(), len_itr) + + # Auto-calculate optimal chunk size if not provided + if chunksize is None: + # # this seems to be slightly faster when using `test_ged_model.py` + # # with 100 graphs (0.0012 s vs 0.0016 s per pair). Yet gets slower with + # # larger number of graphs (e.g., 1000) (~ 31 mins vs ~ 40 mins in total). 
+            # if len_itr < 100 * n_jobs:
+            # 	chunksize = int(len_itr / n_jobs) + 1
+            # else:
+            # 	chunksize = 100
+
+            # Balancing chunk size: larger chunks reduce overhead but limit load balancing.
+            # A good heuristic is sqrt(len_itr / n_jobs) * 4:
+            chunksize = max(1, int(np.sqrt(len_itr / n_jobs) * 4))
+
+        if self.verbose >= 2:
+            print(
+                f"Running with {n_jobs} parallel processes and chunk size of {chunksize}"
+            )
+
+        # For networkx graphs, we need to use a Manager to share them between processes
+        with Manager() as manager:
+            # Create managed shared lists for both graph sets
+            shared_graphs1 = manager.list(graphs1)
+            shared_graphs2 = manager.list(graphs2)
+
+            # Get a function reference to compute_ged that can be pickled
+            # Using a Python trick to make the instance method picklable
+            compute_ged_func = self.compute_ged
+
+            # Create a shared memory array for results
+            with numpy_shared_memory((n1, n2), dtype=np.float64) as (
+                    dis_matrix, shm_name
+            ):
+                # Create a partial function with fixed arguments - MUST NOT use an
+                # inline function here, as it won't be picklable:
+                worker = partial(
+                    self._process_pair_worker_cross,
+                    graphs1_manager=shared_graphs1,
+                    graphs2_manager=shared_graphs2,
+                    shm_name=shm_name,
+                    matrix_shape=(n1, n2),
+                    compute_ged_func=compute_ged_func,
+                    is_same_set=is_same_set,
+                    **kwargs
+                )
+
+                try:
+                    # Force garbage collection before starting parallel processing
+                    gc.collect()
+
+                    # Three different parallelization options for different scenarios
+                    if method == 'joblib':
+                        if memory_limit == 'auto':
+                            # Set max_nbytes according to the size of the shared memory:
+                            # Get the size of the shared memory in bytes:
+                            shm = shared_memory.SharedMemory(name=shm_name)
+                            memory_limit = shm.size
+                            shm.close()
+                            if self.verbose >= 2:
+                                print(
+                                    f"Setting memory limit to {memory_limit} bytes per process."
+                                )
+
+                        # Option 1: joblib - great for large datasets, memory control, possible caching
+                        with joblib.parallel_backend(
+                                'loky', n_jobs=n_jobs, inner_max_num_threads=1,
+                                mmap_mode='r', temp_folder='/tmp'
+                        ):
+                            results = joblib.Parallel(
+                                verbose=self.verbose >= 2,
+                                prefer="processes",
+                                batch_size=chunksize,
+                                pre_dispatch='2*n_jobs',
+                                max_nbytes=memory_limit
+                            )(
+                                joblib.delayed(worker)(pair) for pair in pairs
+                            )
+
+                    elif method == 'concurrent':
+                        # Option 2: ProcessPoolExecutor - cleaner API, slightly faster for CPU-bound tasks
+                        with ProcessPoolExecutor(
+                                max_workers=n_jobs
+                        ) as executor:
+                            futures = [executor.submit(worker, pair) for pair
+                                       in pairs]
+
+                            # Track progress if verbose
+                            if self.verbose >= 2:
+                                results = []
+                                for f in tqdm(
+                                        futures, total=len(futures),
+                                        desc='Computing distance matrix'
+                                ):
+                                    results.append(f.result())
+                            else:
+                                results = [f.result() for f in futures]
+
+                    elif method in ['imap_unordered', 'multiprocessing']:
+                        # Option 3: multiprocessing.Pool with imap_unordered - more control, classic approach
+                        with multiprocessing.Pool(processes=n_jobs) as pool:
+                            if self.verbose >= 2:
+                                results = list(
+                                    tqdm(
+                                        pool.imap_unordered(
+                                            worker, pairs,
+                                            chunksize=chunksize
+                                        ),
+                                        total=len_itr,
+                                        desc='Computing distance matrix',
+                                        file=sys.stdout
+                                    )
+                                )
+                            else:
+                                results = list(
+                                    pool.imap_unordered(
+                                        worker, pairs, chunksize=chunksize
+                                    )
+                                )
+
+                    else:
+                        raise ValueError(
+                            f"Unsupported parallelization method: {method}."
+ ) + + # Copy the result from shared memory to a regular numpy array + result = dis_matrix.copy() + + except Exception as e: + # Make sure we log any errors that occur during parallel execution + if self.verbose: + print(f"Error during parallel execution: {e}.") + raise + + # At this point, the Manager will automatically clean up shared resources + + return result + + + @staticmethod + def _process_pair_worker_cross( + pair, graphs1_manager, graphs2_manager, shm_name, matrix_shape, + compute_ged_func, is_same_set=False, **kwargs + ): + """Worker function that processes a pair of graphs and updates the shared matrix. + Must be defined at module level to be picklable.""" + i, j = pair + + # Access the shared graphs from the manager + g1 = graphs1_manager[i] + g2 = graphs2_manager[j] + + try: + # Access the shared memory + existing_shm = shared_memory.SharedMemory(name=shm_name) + shared_matrix = np.ndarray( + matrix_shape, dtype=np.float64, buffer=existing_shm.buf + ) + + # Compute distance using the function reference + distance = compute_ged_func(g1, g2, **kwargs) + + # Update the matrix + shared_matrix[i, j] = distance + + # If computing within the same set, update symmetric position: + if is_same_set and i != j: + shared_matrix[j, i] = distance + + finally: + # Clean up local shared memory reference + if 'existing_shm' in locals(): + existing_shm.close() + + return i, j, distance # Return for progress tracking + + + def compute_ged(self, Gi, Gj, **kwargs): + """ + Compute GED between two graphs according to edit_cost. + """ + env_type = self.get_env_type(graph=Gi) + ged_options = { + 'env_type': env_type, + 'edit_cost': self.edit_cost_fun, + 'method': self.ed_method, + 'edit_cost_constants': self._edit_cost_constants, + 'edit_cost_config': self.edit_cost_config, + } + repeats = kwargs.get('repeats', 1) + dis, pi_forward, pi_backward = pairwise_ged( + Gi, Gj, ged_options, repeats=repeats + ) + # @TODO: Better to have a if here. + # if self.compute_n_eo: + # n_eo_tmp = get_nb_edit_operations( + # Gi, Gj, pi_forward, pi_backward, + # edit_cost=self.edit_cost_fun, + # node_labels=self.node_labels, edge_labels=self.edge_labels + # ) + # else: + # n_eo_tmp = None + # return dis, n_eo_tmp + return dis + + + def get_env_type(self, graph: nx.Graph | None = None): + """ + Check the environment type of the graph. + If `env_type` is set on initialization, return it. + Otherwise, check the given graph's node and edge labels to determine the type. + + Only one node and one edge are checked to determine the type. + This function expects that all nodes have the same type of labels, so as all + edges. + """ + if self.env_type is not None: + return self.env_type + if graph is None: + raise ValueError( + 'Graph is not provided while `env_type` not set on initialization. ' + 'Cannot determine environment type.' 
+ ) + # Use 'gxl' env type only if all nodes and edge labes are strings, and at least one + # node or edge label is present: + one_n_labels = graph.nodes[list(graph.nodes)[0]] + for k, v in one_n_labels.items(): + if not isinstance(v, str): + return 'attr' + if nx.number_of_edges(graph) != 0: + one_e_labels = graph.edges[list(graph.edges)[0]] + for k, v in one_e_labels.items(): + if not isinstance(v, str): + return 'attr' + if len(one_n_labels) > 0 or ( + nx.number_of_edges(graph) != 0 and len(one_e_labels) > 0 + ): + return 'gxl' + return 'attr' + + + def is_graph(self, graph): + if isinstance(graph, nx.Graph): + return True + if isinstance(graph, nx.DiGraph): + return True + if isinstance(graph, nx.MultiGraph): + return True + if isinstance(graph, nx.MultiDiGraph): + return True + return False + + + def __repr__(self): + return ( + f"{self.__class__.__name__}(" + f"optim_method={self.optim_method}, " + f"ed_method={self.ed_method}, " + f"edit_cost_fun={self.edit_cost_fun}, " + f"node_labels={self.node_labels}, " + f"edge_labels={self.edge_labels}, " + f"optim_options={self.optim_options}, " + f"init_edit_cost_constants={self.init_edit_cost_constants}, " + f"copy_graphs={self.copy_graphs}, " + f"parallel={self.parallel}, " + f"n_jobs={self.n_jobs}, " + f"verbose={self.verbose}, " + + (f"normalize={self.normalize}, " if hasattr(self, 'normalize') else "") + + f"run_time={self.run_time}" + f")" + ) + + + @property + def graphs(self): + return self._graphs + + + # @property + # def parallel(self): + # return self.parallel + + # @property + # def n_jobs(self): + # return self.n_jobs + + # @property + # def verbose(self): + # return self.verbose + + # @property + # def normalize(self): + # return self.normalize + + @property + def run_time(self): + return self._run_time + + + @property + def test_run_time(self): + return self._test_run_time + + + @property + def dis_matrix(self): + return self._dm_train + + + @dis_matrix.setter + def dis_matrix(self, value): + self._dm_train = value + + + @property + def metric_matrix(self): + return self._dm_train + + + @metric_matrix.setter + def metric_matrix(self, value): + self._dm_train = value + + + @property + def edit_cost_constants(self): + return self._edit_cost_constants + + + # @property + # def gram_matrix_unnorm(self): + # return self._gram_matrix_unnorm + + # @gram_matrix_unnorm.setter + # def gram_matrix_unnorm(self, value): + # self._gram_matrix_unnorm = value + + @property + def n_pairs(self): + """ + The number of graph pairs between which the GEDs are computed. + """ + try: + check_is_fitted(self, '_dm_train') + return len(self._dm_train) * (len(self._dm_train) - 1) / 2 + except NotFittedError: + return None + + +# Context manager for shared memory with automatic cleanup +@contextmanager +def numpy_shared_memory(shape, dtype=np.float64): + """Create a numpy array in shared memory that automatically cleans up.""" + size = int(np.prod(shape)) * np.dtype(dtype).itemsize + shm = shared_memory.SharedMemory(create=True, size=size) + try: + array = np.ndarray(shape, dtype=dtype, buffer=shm.buf) + array.fill(0) # Initialize with zeros + yield array, shm.name + finally: + shm.close() + shm.unlink() + + +def pairwise_ged( + g1, g2, options={}, sort=True, repeats=1, parallel=False, verbose=True +): + """Compute the graph edit distance between two graphs using the gedlib library + with repeats. + + Notes + ----- + - For methods such as BIPARTITE, the repeats may result same results. + - # of edit operations are not computed in this method. 
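+
+	Examples
+	--------
+	A minimal sketch (hypothetical option values; assumes `g1` and `g2` are
+	`networkx` graphs whose labels are compatible with the chosen edit cost)::
+
+		>>> opts = {
+		... 	'edit_cost': 'CONSTANT', 'edit_cost_constants': [],
+		... 	'method': 'BIPARTITE',
+		... }
+		>>> dis, pi_forward, pi_backward = pairwise_ged(g1, g2, opts, repeats=3)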
+	"""
+	from gklearn.gedlib import gedlibpy
+
+	ged_env = gedlibpy.GEDEnv(env_type=options.get('env_type', 'attr'), verbose=False)
+	# Forward `edit_cost_config` only when it is actually provided:
+	edit_cost_config = options.get('edit_cost_config')
+	ged_env.set_edit_cost(
+		options['edit_cost'],
+		edit_cost_constant=options['edit_cost_constants'],
+		**({'edit_cost_config': edit_cost_config} if edit_cost_config else {})
+	)
+
+	ged_env.add_nx_graph(g1, '')
+	ged_env.add_nx_graph(g2, '')
+
+	list_id = ged_env.get_all_graph_ids()
+
+	ged_env.init(
+		init_option=options.get('init_option', 'EAGER_WITHOUT_SHUFFLED_COPIES')
+	)
+	ged_env.set_method(options['method'], ged_options_to_string(options))
+	ged_env.init_method()
+
+	g = list_id[0]
+	h = list_id[1]
+	dis_min = np.inf
+
+	for _ in range(repeats):
+		ged_env.run_method(g, h)
+		dis = ged_env.get_upper_bound(g, h)
+		# Keep the maps of the best (smallest) upper bound over all repeats:
+		if dis < dis_min:
+			dis_min = dis
+			pi_forward = ged_env.get_forward_map(g, h)
+			pi_backward = ged_env.get_backward_map(g, h)
+		# lower = ged_env.get_lower_bound(g, h)
+
+	# Map indices back to the original node labels (a removed / inserted node
+	# is mapped to np.inf):
+	nodes1 = list(g1.nodes())
+	nodes2 = list(g2.nodes())
+	nb1 = nx.number_of_nodes(g1)
+	nb2 = nx.number_of_nodes(g2)
+	pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
+	pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
+
+	# Return the best distance over all repeats, not the one of the last run:
+	return dis_min, pi_forward, pi_backward
diff --git a/gklearn/ged/util/util.py b/gklearn/ged/util/util.py
index 44c6a32835..c87b24c6d8 100644
--- a/gklearn/ged/util/util.py
+++ b/gklearn/ged/util/util.py
@@ -28,7 +28,7 @@ def compute_ged(g1, g2, options):
 	- # of edit operations are not computed in this method.
 	"""
-	from gklearn.gedlib import librariesImport, gedlibpy
+	from gklearn.gedlib import libraries_import, gedlibpy
 
 	ged_env = gedlibpy.GEDEnv()
 	ged_env.set_edit_cost(
@@ -375,7 +375,7 @@ def _compute_geds_without_permutation(
 	-----
 	- # of edit operations are computed in this method.
 	"""
-	from gklearn.gedlib import librariesImport, gedlibpy
+	from gklearn.gedlib import libraries_import, gedlibpy
 
 	# initialize ged env.
 	ged_env = gedlibpy.GEDEnv()
@@ -626,7 +626,7 @@ def get_nb_edit_operations(
 		g1, g2, forward_map, backward_map,
 		node_attrs=node_attrs, edge_attrs=edge_attrs
 	)
-	elif edit_cost == 'CONSTANT':
+	elif edit_cost in ['CONSTANT', 'CHEM_1', 'CHEM_2']:
 		node_labels = kwargs.get('node_labels', [])
 		edge_labels = kwargs.get('edge_labels', [])
 		return get_nb_edit_operations_symbolic(
@@ -634,11 +634,17 @@ def get_nb_edit_operations(
 			node_labels=node_labels, edge_labels=edge_labels
 		)
 	else:
-		return get_nb_edit_operations_symbolic(
-			g1, g2, forward_map,
-			backward_map
+		# The following edit costs include non-symbolic computations:
+		# 'CMU', 'GREC_1', 'GREC_2', 'FINGERPRINT', 'PROTEIN', 'GEOMETRIC'.
+		raise NotImplementedError(
+			f'`get_nb_edit_operations()` is not implemented for edit cost "{edit_cost}".
'
+		)
+
+	# return get_nb_edit_operations_symbolic(
+	# 	g1, g2, forward_map,
+	# 	backward_map
+	# )
+
 
 def get_nb_edit_operations_symbolic(
 	g1, g2, forward_map, backward_map,
 	node_labels=[], edge_labels=[]
diff --git a/gklearn/gedlib/__init__.py b/gklearn/gedlib/__init__.py
index 1289a2c44b..d3ae80a365 100755
--- a/gklearn/gedlib/__init__.py
+++ b/gklearn/gedlib/__init__.py
@@ -6,5 +6,5 @@
 # info
 __version__ = "0.1"
-__author__ = "Linlin Jia"
-__date__ = "March 2020"
+__author__ = "Linlin Jia"
+__date__ = "March 2020"
diff --git a/gklearn/gedlib/gedlibpy_attr.pyx b/gklearn/gedlib/gedlibpy_attr.pyx
index f2fbf4fe58..5ef9b601c2 100644
--- a/gklearn/gedlib/gedlibpy_attr.pyx
+++ b/gklearn/gedlib/gedlibpy_attr.pyx
@@ -706,15 +706,19 @@ cdef class GEDEnvAttr:
 				config_bool[key.encode('utf-8')] = value
 			else:
 				raise EditCostError(
-					"Edit cost configuration values must be either string or boolean."
+					'Edit cost configuration values must be either string or boolean.'
 				)
 
 		# # debug test only:
 		# print(f'[gedlibpy_attr.pyx] Edit cost config passed to C++ wrapper is {edit_cost_config}.')
-
-		self.c_env.setEditCost(
-			edit_cost_b, edit_cost_constant, config_str, config_bool
-		)
+		try:
+			self.c_env.setEditCost(
+				edit_cost_b, edit_cost_constant, config_str, config_bool
+			)
+		except Exception as e:
+			raise EditCostError(
+				f"Caught C++ exception: {str(e)}."
+			)
 		else:
 			raise EditCostError(
 				"This edit cost function doesn't exist, please see list_of_edit_cost_options for selecting a edit cost function"
diff --git a/gklearn/gedlib/gedlibpy_backup.pyx b/gklearn/gedlib/gedlibpy_backup.pyx
new file mode 100644
index 0000000000..5dc33b7e36
--- /dev/null
+++ b/gklearn/gedlib/gedlibpy_backup.pyx
@@ -0,0 +1,1581 @@
+# distutils: language = c++
+
+"""
+	Python GedLib module
+	======================
+
+	This module allows using a C++ library for edit distance between graphs (GedLib) from Python.
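+
+	It wraps the C++ environment class `PyGEDEnv` and exposes graph loading,
+	edit cost selection, computation method configuration, and pairwise GED
+	computation.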
+ + + Authors + ------------------- + + David Blumenthal + Natacha Lambert + Linlin Jia + + Copyright (C) 2019-2020 by all the authors + + Classes & Functions + ------------------- + +""" + +################################# +##DECLARATION OF C++ INTERFACES## +################################# + + +#Types imports for C++ compatibility +from libcpp.vector cimport vector +from libcpp.string cimport string +from libcpp.map cimport map +from libcpp cimport bool +from libcpp.pair cimport pair +from libcpp.list cimport list + +#Long unsigned int equivalent +cimport numpy as cnp +ctypedef cnp.npy_uint32 UINT32_t +from cpython cimport array + + +cdef extern from "src/GedLibBind.hpp" namespace "pyged": + + cdef vector[string] getEditCostStringOptions() except + + cdef vector[string] getMethodStringOptions() except + + cdef vector[string] getInitStringOptions() except + + cdef size_t getDummyNode() except + + + cdef cppclass PyGEDEnv: + PyGEDEnv() except + + bool isInitialized() except + + void restartEnv() except + + void loadGXLGraph(string pathFolder, string pathXML, bool node_type, bool edge_type) except + + pair[size_t,size_t] getGraphIds() except + + vector[size_t] getAllGraphIds() except + + string getGraphClass(size_t id) except + + string getGraphName(size_t id) except + + size_t addGraph(string name, string classe) except + + void addNode(size_t graphId, string nodeId, map[string, string] nodeLabel) except + + void addEdge(size_t graphId, string tail, string head, map[string, string] edgeLabel, bool ignoreDuplicates) except + + void clearGraph(size_t graphId) except + + size_t getGraphInternalId(size_t graphId) except + + size_t getGraphNumNodes(size_t graphId) except + + size_t getGraphNumEdges(size_t graphId) except + + vector[string] getGraphOriginalNodeIds(size_t graphId) except + + vector[map[string, string]] getGraphNodeLabels(size_t graphId) except + + map[pair[size_t, size_t], map[string, string]] getGraphEdges(size_t graphId) except + + vector[vector[size_t]] getGraphAdjacenceMatrix(size_t graphId) except + + void setEditCost(string editCost, vector[double] editCostConstant) except + + void setPersonalEditCost(vector[double] editCostConstant) except + + void initEnv(string initOption, bool print_to_stdout) except + + void setMethod(string method, string options) except + + void initMethod() except + + double getInitime() except + + void runMethod(size_t g, size_t h) except + + double getUpperBound(size_t g, size_t h) except + + double getLowerBound(size_t g, size_t h) except + + vector[cnp.npy_uint64] getForwardMap(size_t g, size_t h) except + + vector[cnp.npy_uint64] getBackwardMap(size_t g, size_t h) except + + size_t getNodeImage(size_t g, size_t h, size_t nodeId) except + + size_t getNodePreImage(size_t g, size_t h, size_t nodeId) except + + double getInducedCost(size_t g, size_t h) except + + vector[pair[size_t,size_t]] getNodeMap(size_t g, size_t h) except + + vector[vector[int]] getAssignmentMatrix(size_t g, size_t h) except + + vector[vector[cnp.npy_uint64]] getAllMap(size_t g, size_t h) except + + double getRuntime(size_t g, size_t h) except + + bool quasimetricCosts() except + + vector[vector[size_t]] hungarianLSAP(vector[vector[size_t]] matrixCost) except + + vector[vector[double]] hungarianLSAPE(vector[vector[double]] matrixCost) except + + # added by Linlin Jia. 
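+		# These bindings expose label introspection (node / edge label counts
+		# and lookups) and edit cost queries (relabeling, deletion and
+		# insertion costs, median labels) from the C++ environment.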
+ size_t getNumNodeLabels() except + + map[string, string] getNodeLabel(size_t label_id) except + + size_t getNumEdgeLabels() except + + map[string, string] getEdgeLabel(size_t label_id) except + +# size_t getNumNodes(size_t graph_id) except + + double getAvgNumNodes() except + + double getNodeRelCost(map[string, string] & node_label_1, map[string, string] & node_label_2) except + + double getNodeDelCost(map[string, string] & node_label) except + + double getNodeInsCost(map[string, string] & node_label) except + + map[string, string] getMedianNodeLabel(vector[map[string, string]] & node_labels) except + + double getEdgeRelCost(map[string, string] & edge_label_1, map[string, string] & edge_label_2) except + + double getEdgeDelCost(map[string, string] & edge_label) except + + double getEdgeInsCost(map[string, string] & edge_label) except + + map[string, string] getMedianEdgeLabel(vector[map[string, string]] & edge_labels) except + + string getInitType() except + +# double getNodeCost(size_t label1, size_t label2) except + + double computeInducedCost(size_t g_id, size_t h_id, vector[pair[size_t,size_t]]) except + + + +############################# +##CYTHON WRAPPER INTERFACES## +############################# + +# import cython +import numpy as np +import networkx as nx +from gklearn.ged.env import NodeMap + +# import librariesImport +from ctypes import * +import os +lib1 = cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + '/lib/fann.2.2.0/libdoublefann.so') +lib2 = cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + '/lib/libsvm.3.22/libsvm.so') +lib3 = cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + '/lib/nomad.3.8.1/libnomad.so') +lib4 = cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + '/lib/nomad.3.8.1/libsgtelib.so') + + +def get_edit_cost_options() : + """ + Searchs the differents edit cost functions and returns the result. + + :return: The list of edit cost functions + :rtype: list[string] + + .. warning:: This function is useless for an external use. Please use directly list_of_edit_cost_options. + .. note:: Prefer the list_of_edit_cost_options attribute of this module. + """ + + return [option.decode('utf-8') for option in getEditCostStringOptions()] + + +def get_method_options() : + """ + Searchs the differents method for edit distance computation between graphs and returns the result. + + :return: The list of method to compute the edit distance between graphs + :rtype: list[string] + + .. warning:: This function is useless for an external use. Please use directly list_of_method_options. + .. note:: Prefer the list_of_method_options attribute of this module. + """ + return [option.decode('utf-8') for option in getMethodStringOptions()] + + +def get_init_options() : + """ + Searchs the differents initialization parameters for the environment computation for graphs and returns the result. + + :return: The list of options to initialize the computation environment + :rtype: list[string] + + .. warning:: This function is useless for an external use. Please use directly list_of_init_options. + .. note:: Prefer the list_of_init_options attribute of this module. + """ + return [option.decode('utf-8') for option in getInitStringOptions()] + + +def get_dummy_node() : + """ + Returns the ID of a dummy node. + + :return: The ID of the dummy node (18446744073709551614 for my computer, the hugest number possible) + :rtype: size_t + + .. note:: A dummy node is used when a node isn't associated to an other node. 
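+
+		Example (a sketch; the exact value is platform-dependent)::
+
+			>>> dummy = get_dummy_node()
+			>>> # Map entries equal to `dummy` denote node deletion / insertion.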
+ """ + return getDummyNode() + + +# @cython.auto_pickle(True) +cdef class GEDEnv: + """Cython wrapper class for C++ class PyGEDEnv + """ +# cdef PyGEDEnv c_env # Hold a C++ instance which we're wrapping + cdef PyGEDEnv* c_env # hold a pointer to the C++ instance which we're wrapping + + + def __cinit__(self): +# self.c_env = PyGEDEnv() + self.c_env = new PyGEDEnv() + + + def __dealloc__(self): + del self.c_env + + +# def __reduce__(self): +# # return GEDEnv, (self.c_env,) +# return GEDEnv, tuple() + + + def is_initialized(self) : + """ + Checks and returns if the computation environment is initialized or not. + + :return: True if it's initialized, False otherwise + :rtype: bool + + .. note:: This function exists for internals verifications but you can use it for your code. + """ + return self.c_env.isInitialized() + + + def restart_env(self) : + """ + Restarts the environment variable. All data related to it will be delete. + + .. warning:: This function deletes all graphs, computations and more so make sure you don't need anymore your environment. + .. note:: You can now delete and add somes graphs after initialization so you can avoid this function. + """ + self.c_env.restartEnv() + + + def load_GXL_graphs(self, path_folder, path_XML, node_type, edge_type) : + """ + Loads some GXL graphes on the environment which is in a same folder, and present in the XMLfile. + + :param path_folder: The folder's path which contains GXL graphs + :param path_XML: The XML's path which indicates which graphes you want to load + :param node_type: Select if nodes are labeled or unlabeled + :param edge_type: Select if edges are labeled or unlabeled + :type path_folder: string + :type path_XML: string + :type node_type: bool + :type edge_type: bool + + + .. note:: You can call this function multiple times if you want, but not after an init call. + """ + self.c_env.loadGXLGraph(path_folder.encode('utf-8'), path_XML.encode('utf-8'), node_type, edge_type) + + + def graph_ids(self) : + """ + Searchs the first and last IDs of the loaded graphs in the environment. + + :return: The pair of the first and the last graphs Ids + :rtype: tuple(size_t, size_t) + + .. note:: Prefer this function if you have huges structures with lots of graphs. + """ + return self.c_env.getGraphIds() + + + def get_all_graph_ids(self) : + """ + Searchs all the IDs of the loaded graphs in the environment. + + :return: The list of all graphs's Ids + :rtype: list[size_t] + + .. note:: The last ID is equal to (number of graphs - 1). The order correspond to the loading order. + """ + return self.c_env.getAllGraphIds() + + + def get_graph_class(self, id) : + """ + Returns the class of a graph with its ID. + + :param id: The ID of the wanted graph + :type id: size_t + :return: The class of the graph which correpond to the ID + :rtype: string + + .. seealso:: get_graph_class() + .. note:: An empty string can be a class. + """ + return self.c_env.getGraphClass(id) + + + def get_graph_name(self, id) : + """ + Returns the name of a graph with its ID. + + :param id: The ID of the wanted graph + :type id: size_t + :return: The name of the graph which correpond to the ID + :rtype: string + + .. seealso:: get_graph_class() + .. note:: An empty string can be a name. + """ + return self.c_env.getGraphName(id).decode('utf-8') + + + def add_graph(self, name="", classe="") : + """ + Adds a empty graph on the environment, with its name and its class. Nodes and edges will be add in a second time. 
+ + :param name: The name of the new graph, an empty string by default + :param classe: The class of the new graph, an empty string by default + :type name: string + :type classe: string + :return: The ID of the newly graphe + :rtype: size_t + + .. seealso::add_node(), add_edge() , add_symmetrical_edge() + .. note:: You can call this function without parameters. You can also use this function after initialization, call init() after you're finished your modifications. + """ + return self.c_env.addGraph(name.encode('utf-8'), classe.encode('utf-8')) + + + def add_node(self, graph_id, node_id, node_label): + """ + Adds a node on a graph selected by its ID. A ID and a label for the node is required. + + :param graph_id: The ID of the wanted graph + :param node_id: The ID of the new node + :param node_label: The label of the new node + :type graph_id: size_t + :type node_id: string + :type node_label: dict{string : string} + + .. seealso:: add_graph(), add_edge(), add_symmetrical_edge() + .. note:: You can also use this function after initialization, but only on a newly added graph. Call init() after you're finished your modifications. + """ + self.c_env.addNode(graph_id, node_id.encode('utf-8'), encode_your_map(node_label)) + + + def add_edge(self, graph_id, tail, head, edge_label, ignore_duplicates=True) : + """ + Adds an edge on a graph selected by its ID. + + :param graph_id: The ID of the wanted graph + :param tail: The ID of the tail node for the new edge + :param head: The ID of the head node for the new edge + :param edge_label: The label of the new edge + :param ignore_duplicates: If True, duplicate edges are ignored, otherwise it's raise an error if an existing edge is added. True by default + :type graph_id: size_t + :type tail: string + :type head: string + :type edge_label: dict{string : string} + :type ignore_duplicates: bool + + .. seealso:: add_graph(), add_node(), add_symmetrical_edge() + .. note:: You can also use this function after initialization, but only on a newly added graph. Call init() after you're finished your modifications. + """ + self.c_env.addEdge(graph_id, tail.encode('utf-8'), head.encode('utf-8'), encode_your_map(edge_label), ignore_duplicates) + + + def add_symmetrical_edge(self, graph_id, tail, head, edge_label) : + """ + Adds a symmetrical edge on a graph selected by its ID. + + :param graph_id: The ID of the wanted graph + :param tail: The ID of the tail node for the new edge + :param head: The ID of the head node for the new edge + :param edge_label: The label of the new edge + :type graph_id: size_t + :type tail: string + :type head: string + :type edge_label: dict{string : string} + + .. seealso:: add_graph(), add_node(), add_edge() + .. note:: You can also use this function after initialization, but only on a newly added graph. Call init() after you're finished your modifications. + """ + tailB = tail.encode('utf-8') + headB = head.encode('utf-8') + edgeLabelB = encode_your_map(edge_label) + self.c_env.addEdge(graph_id, tailB, headB, edgeLabelB, True) + self.c_env.addEdge(graph_id, headB, tailB, edgeLabelB, True) + + + def clear_graph(self, graph_id) : + """ + Deletes a graph, selected by its ID, to the environment. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + + .. note:: Call init() after you're finished your modifications. + """ + self.c_env.clearGraph(graph_id) + + + def get_graph_internal_id(self, graph_id) : + """ + Searchs and returns the internal Id of a graph, selected by its ID. 
+ + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The internal ID of the selected graph + :rtype: size_t + + .. seealso:: get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_node_labels(), get_graph_edges(), get_graph_adjacence_matrix() + .. note:: These functions allow to collect all the graph's informations. + """ + return self.c_env.getGraphInternalId(graph_id) + + + def get_graph_num_nodes(self, graph_id) : + """ + Searchs and returns the number of nodes on a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The number of nodes on the selected graph + :rtype: size_t + + .. seealso:: get_graph_internal_id(), get_graph_num_edges(), get_original_node_ids(), get_graph_node_labels(), get_graph_edges(), get_graph_adjacence_matrix() + .. note:: These functions allow to collect all the graph's informations. + """ + return self.c_env.getGraphNumNodes(graph_id) + + + def get_graph_num_edges(self, graph_id) : + """ + Searchs and returns the number of edges on a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The number of edges on the selected graph + :rtype: size_t + + .. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_original_node_ids(), get_graph_node_labels(), get_graph_edges(), get_graph_adjacence_matrix() + .. note:: These functions allow to collect all the graph's informations. + """ + return self.c_env.getGraphNumEdges(graph_id) + + + def get_original_node_ids(self, graph_id) : + """ + Searchs and returns all th Ids of nodes on a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The list of IDs's nodes on the selected graph + :rtype: list[string] + + .. seealso::get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_graph_node_labels(), get_graph_edges(), get_graph_adjacence_matrix() + .. note:: These functions allow to collect all the graph's informations. + """ + return [gid.decode('utf-8') for gid in self.c_env.getGraphOriginalNodeIds(graph_id)] + + + def get_graph_node_labels(self, graph_id) : + """ + Searchs and returns all the labels of nodes on a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The list of nodes' labels on the selected graph + :rtype: list[dict{string : string}] + + .. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_edges(), get_graph_adjacence_matrix() + .. note:: These functions allow to collect all the graph's informations. + """ + return [decode_your_map(node_label) for node_label in self.c_env.getGraphNodeLabels(graph_id)] + + + def get_graph_edges(self, graph_id) : + """ + Searchs and returns all the edges on a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The list of edges on the selected graph + :rtype: dict{tuple(size_t, size_t) : dict{string : string}} + + .. seealso::get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_node_labels(), get_graph_adjacence_matrix() + .. note:: These functions allow to collect all the graph's informations. 
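+
+		Example (a sketch; assumes a graph whose edges carry a hypothetical
+		string label was added as graph 0)::
+
+			>>> env.get_graph_edges(0)
+			{(0, 1): {'bond_type': '1'}}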
+ """ + return decode_graph_edges(self.c_env.getGraphEdges(graph_id)) + + + def get_graph_adjacence_matrix(self, graph_id) : + """ + Searchs and returns the adjacence list of a graph, selected by its ID. + + :param graph_id: The ID of the wanted graph + :type graph_id: size_t + :return: The adjacence list of the selected graph + :rtype: list[list[size_t]] + + .. seealso:: get_graph_internal_id(), get_graph_num_nodes(), get_graph_num_edges(), get_original_node_ids(), get_graph_node_labels(), get_graph_edges() + .. note:: These functions allow to collect all the graph's informations. + """ + return self.c_env.getGraphAdjacenceMatrix(graph_id) + + + def set_edit_cost(self, edit_cost, edit_cost_constant = []) : + """ + Sets an edit cost function to the environment, if it exists. + + :param edit_cost: The name of the edit cost function + :type edit_cost: string + :param edi_cost_constant: The parameters you will add to the editCost, empty by default + :type edit_cost_constant: list + + .. seealso:: list_of_edit_cost_options + .. note:: Try to make sure the edit cost function exists with list_of_edit_cost_options, raise an error otherwise. + """ + if edit_cost in list_of_edit_cost_options: + edit_cost_b = edit_cost.encode('utf-8') + self.c_env.setEditCost(edit_cost_b, edit_cost_constant) + else: + raise EditCostError("This edit cost function doesn't exist, please see list_of_edit_cost_options for selecting a edit cost function") + + + def set_personal_edit_cost(self, edit_cost_constant = []) : + """ + Sets an personal edit cost function to the environment. + + :param edit_cost_constant: The parameters you will add to the editCost, empty by default + :type edit_cost_constant: list + + .. seealso:: list_of_edit_cost_options, set_edit_cost() + .. note::You have to modify the C++ function to use it. Please see the documentation to add your Edit Cost function. + """ + self.c_env.setPersonalEditCost(edit_cost_constant) + + + def init(self, init_option='EAGER_WITHOUT_SHUFFLED_COPIES', print_to_stdout=False) : + """ + Initializes the environment with the chosen edit cost function and graphs. + + :param init_option: The name of the init option, "EAGER_WITHOUT_SHUFFLED_COPIES" by default + :type init_option: string + + .. seealso:: list_of_init_options + .. warning:: No modification were allowed after initialization. Try to make sure your choices is correct. You can though clear or add a graph, but recall init() after that. + .. note:: Try to make sure the option exists with list_of_init_options or choose no options, raise an error otherwise. + """ + if init_option in list_of_init_options: + init_option_b = init_option.encode('utf-8') + self.c_env.initEnv(init_option_b, print_to_stdout) + else: + raise InitError("This init option doesn't exist, please see list_of_init_options for selecting an option. You can choose any options.") + + + def set_method(self, method, options="") : + """ + Sets a computation method to the environment, if its exists. + + :param method: The name of the computation method + :param options: The options of the method (like bash options), an empty string by default + :type method: string + :type options: string + + .. seealso:: init_method(), list_of_method_options + .. note:: Try to make sure the edit cost function exists with list_of_method_options, raise an error otherwise. Call init_method() after your set. 
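+
+		Example (a sketch; any entry of `list_of_method_options` can be used)::
+
+			>>> env.set_method('BIPARTITE', '')
+			>>> env.init_method()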
+ """ + if method in list_of_method_options: + method_b = method.encode('utf-8') + self.c_env.setMethod(method_b, options.encode('utf-8')) + else: + raise MethodError("This method doesn't exist, please see list_of_method_options for selecting a method") + + + def init_method(self) : + """ + Inits the environment with the set method. + + .. seealso:: set_method(), list_of_method_options + .. note:: Call this function after set the method. You can't launch computation or change the method after that. + """ + self.c_env.initMethod() + + + def get_init_time(self) : + """ + Returns the initialization time. + + :return: The initialization time + :rtype: double + """ + return self.c_env.getInitime() + + + def run_method(self, g, h) : + """ + Computes the edit distance between two graphs g and h, with the edit cost function and method computation selected. + + :param g: The Id of the first graph to compare + :param h: The Id of the second graph to compare + :type g: size_t + :type h: size_t + + .. seealso:: get_upper_bound(), get_lower_bound(), get_forward_map(), get_backward_map(), get_runtime(), quasimetric_cost() + .. note:: This function only compute the distance between two graphs, without returning a result. Use the differents function to see the result between the two graphs. + """ + self.c_env.runMethod(g, h) + + + def get_upper_bound(self, g, h) : + """ + Returns the upper bound of the edit distance cost between two graphs g and h. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The upper bound of the edit distance cost + :rtype: double + + .. seealso:: run_method(), get_lower_bound(), get_forward_map(), get_backward_map(), get_runtime(), quasimetric_cost() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: The upper bound is equivalent to the result of the pessimist edit distance cost. Methods are heuristics so the library can't compute the real perfect result because it's NP-Hard problem. + """ + return self.c_env.getUpperBound(g, h) + + + def get_lower_bound(self, g, h) : + """ + Returns the lower bound of the edit distance cost between two graphs g and h. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The lower bound of the edit distance cost + :rtype: double + + .. seealso:: run_method(), get_upper_bound(), get_forward_map(), get_backward_map(), get_runtime(), quasimetric_cost() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: This function can be ignored, because lower bound doesn't have a crucial utility. + """ + return self.c_env.getLowerBound(g, h) + + + def get_forward_map(self, g, h) : + """ + Returns the forward map (or the half of the adjacence matrix) between nodes of the two indicated graphs. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The forward map to the adjacence matrix between nodes of the two graphs + :rtype: list[npy_uint32] + + .. seealso:: run_method(), get_upper_bound(), get_lower_bound(), get_backward_map(), get_runtime(), quasimetric_cost(), get_node_map(), get_assignment_matrix() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: I don't know how to connect the two map to reconstruct the adjacence matrix. 
Please come back when I know how it's work ! + """ + return self.c_env.getForwardMap(g, h) + + + def get_backward_map(self, g, h) : + """ + Returns the backward map (or the half of the adjacence matrix) between nodes of the two indicated graphs. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The backward map to the adjacence matrix between nodes of the two graphs + :rtype: list[npy_uint32] + + .. seealso:: run_method(), get_upper_bound(), get_lower_bound(), get_forward_map(), get_runtime(), quasimetric_cost(), get_node_map(), get_assignment_matrix() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: I don't know how to connect the two map to reconstruct the adjacence matrix. Please come back when I know how it's work ! + """ + return self.c_env.getBackwardMap(g, h) + + + def get_node_image(self, g, h, node_id) : + """ + Returns the node's image in the adjacence matrix, if it exists. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :param node_id: The ID of the node which you want to see the image + :type g: size_t + :type h: size_t + :type node_id: size_t + :return: The ID of the image node + :rtype: size_t + + .. seealso:: run_method(), get_forward_map(), get_backward_map(), get_node_pre_image(), get_node_map(), get_assignment_matrix() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: Use BackwardMap's Node to find its images ! You can also use get_forward_map() and get_backward_map(). + + """ + return self.c_env.getNodeImage(g, h, node_id) + + + def get_node_pre_image(self, g, h, node_id) : + """ + Returns the node's preimage in the adjacence matrix, if it exists. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :param node_id: The ID of the node which you want to see the preimage + :type g: size_t + :type h: size_t + :type node_id: size_t + :return: The ID of the preimage node + :rtype: size_t + + .. seealso:: run_method(), get_forward_map(), get_backward_map(), get_node_image(), get_node_map(), get_assignment_matrix() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: Use ForwardMap's Node to find its images ! You can also use get_forward_map() and get_backward_map(). + + """ + return self.c_env.getNodePreImage(g, h, node_id) + + + def get_induced_cost(self, g, h) : + """ + Returns the induced cost between the two indicated graphs. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The induced cost between the two indicated graphs + :rtype: double + + .. seealso:: run_method(), get_forward_map(), get_backward_map(), get_node_image(), get_node_map(), get_assignment_matrix() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: Use ForwardMap's Node to find its images ! You can also use get_forward_map() and get_backward_map(). + + """ + return self.c_env.getInducedCost(g, h) + + + def get_node_map(self, g, h) : + """ + Returns the Node Map, like C++ NodeMap. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The Node Map between the two selected graph. + :rtype: gklearn.ged.env.NodeMap. + + .. 
seealso:: run_method(), get_forward_map(), get_backward_map(), get_node_image(), get_node_pre_image(), get_assignment_matrix() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: This function creates datas so use it if necessary, however you can understand how assignement works with this example. + """ + map_as_relation = self.c_env.getNodeMap(g, h) + induced_cost = self.c_env.getInducedCost(g, h) # @todo: the C++ implementation for this function in GedLibBind.ipp re-call get_node_map() once more, this is not neccessary. + source_map = [item.first if item.first < len(map_as_relation) else np.inf for item in map_as_relation] # item.first < len(map_as_relation) is not exactly correct. +# print(source_map) + target_map = [item.second if item.second < len(map_as_relation) else np.inf for item in map_as_relation] +# print(target_map) + num_node_source = len([item for item in source_map if item != np.inf]) +# print(num_node_source) + num_node_target = len([item for item in target_map if item != np.inf]) +# print(num_node_target) + + node_map = NodeMap(num_node_source, num_node_target) +# print(node_map.get_forward_map(), node_map.get_backward_map()) + for i in range(len(source_map)): + node_map.add_assignment(source_map[i], target_map[i]) + node_map.set_induced_cost(induced_cost) + + return node_map + + + def get_assignment_matrix(self, g, h) : + """ + Returns the Assignment Matrix between two selected graphs g and h. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The Assignment Matrix between the two selected graph. + :rtype: list[list[int]] + + .. seealso:: run_method(), get_forward_map(), get_backward_map(), get_node_image(), get_node_pre_image(), get_node_map() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: This function creates datas so use it if necessary. + """ + return self.c_env.getAssignmentMatrix(g, h) + + + def get_all_map(self, g, h) : + """ + Returns a vector which contains the forward and the backward maps between nodes of the two indicated graphs. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The forward and backward maps to the adjacence matrix between nodes of the two graphs + :rtype: list[list[npy_uint32]] + + .. seealso:: run_method(), get_upper_bound(), get_lower_bound(), get_forward_map(), get_backward_map(), get_runtime(), quasimetric_cost() + .. warning:: run_method() between the same two graph must be called before this function. + .. note:: This function duplicates data so please don't use it. I also don't know how to connect the two map to reconstruct the adjacence matrix. Please come back when I know how it's work ! + """ + return self.c_env.getAllMap(g, h) + + + def get_runtime(self, g, h) : + """ + Returns the runtime to compute the edit distance cost between two graphs g and h + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: The runtime of the computation of edit distance cost between the two selected graphs + :rtype: double + + .. seealso:: run_method(), get_upper_bound(), get_lower_bound(), get_forward_map(), get_backward_map(), quasimetric_cost() + .. warning:: run_method() between the same two graph must be called before this function. + .. 
note:: Python is a bit longer than C++ due to the functions's encapsulate. + """ + return self.c_env.getRuntime(g,h) + + + def quasimetric_cost(self) : + """ + Checks and returns if the edit costs are quasimetric. + + :param g: The Id of the first compared graph + :param h: The Id of the second compared graph + :type g: size_t + :type h: size_t + :return: True if it's verified, False otherwise + :rtype: bool + + .. seealso:: run_method(), get_upper_bound(), get_lower_bound(), get_forward_map(), get_backward_map(), get_runtime() + .. warning:: run_method() between the same two graph must be called before this function. + """ + return self.c_env.quasimetricCosts() + + + def hungarian_LSAP(self, matrix_cost) : + """ + Applies the hungarian algorithm (LSAP) on a matrix Cost. + + :param matrix_cost: The matrix Cost + :type matrix_cost: vector[vector[size_t]] + :return: The values of rho, varrho, u and v, in this order + :rtype: vector[vector[size_t]] + + .. seealso:: hungarian_LSAPE() + """ + return self.c_env.hungarianLSAP(matrix_cost) + + + def hungarian_LSAPE(self, matrix_cost) : + """ + Applies the hungarian algorithm (LSAPE) on a matrix Cost. + + :param matrix_cost: The matrix Cost + :type matrix_cost: vector[vector[double]] + :return: The values of rho, varrho, u and v, in this order + :rtype: vector[vector[double]] + + .. seealso:: hungarian_LSAP() + """ + return self.c_env.hungarianLSAPE(matrix_cost) + + + def add_random_graph(self, name, classe, list_of_nodes, list_of_edges, ignore_duplicates=True) : + """ + Add a Graph (not GXL) on the environment. Be careful to respect the same format as GXL graphs for labelling nodes and edges. + + :param name: The name of the graph to add, can be an empty string + :param classe: The classe of the graph to add, can be an empty string + :param list_of_nodes: The list of nodes to add + :param list_of_edges: The list of edges to add + :param ignore_duplicates: If True, duplicate edges are ignored, otherwise it's raise an error if an existing edge is added. True by default + :type name: string + :type classe: string + :type list_of_nodes: list[tuple(size_t, dict{string : string})] + :type list_of_edges: list[tuple(tuple(size_t,size_t), dict{string : string})] + :type ignore_duplicates: bool + :return: The ID of the newly added graphe + :rtype: size_t + + .. note:: The graph must respect the GXL structure. Please see how a GXL graph is construct. + + """ + id = self.add_graph(name, classe) + for node in list_of_nodes: + self.add_node(id, node[0], node[1]) + for edge in list_of_edges: + self.add_edge(id, edge[0], edge[1], edge[2], ignore_duplicates) + return id + + + def add_nx_graph(self, g, classe, ignore_duplicates=True) : + """ + Add a Graph (made by networkx) on the environment. Be careful to respect the same format as GXL graphs for labelling nodes and edges. + + :param g: The graph to add (networkx graph) + :param ignore_duplicates: If True, duplicate edges are ignored, otherwise it's raise an error if an existing edge is added. True by default + :type g: networkx.graph + :type ignore_duplicates: bool + :return: The ID of the newly added graphe + :rtype: size_t + + .. note:: The NX graph must respect the GXL structure. Please see how a GXL graph is construct. 
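+
+		Example (a sketch with hypothetical string labels, as required by the
+		GXL format)::
+
+			>>> import networkx as nx
+			>>> g = nx.Graph(name='molecule_0')
+			>>> g.add_node(0, chem='C')
+			>>> g.add_node(1, chem='O')
+			>>> g.add_edge(0, 1, valence='1')
+			>>> gid = env.add_nx_graph(g, '')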
+ + """ + id = self.add_graph(g.name, classe) + for node in g.nodes: + self.add_node(id, str(node), g.nodes[node]) + for edge in g.edges: + self.add_edge(id, str(edge[0]), str(edge[1]), g.get_edge_data(edge[0], edge[1]), ignore_duplicates) + return id + + + def compute_ged_on_two_graphs(self, g1, g2, edit_cost, method, options, init_option="EAGER_WITHOUT_SHUFFLED_COPIES") : + """ + Computes the edit distance between two NX graphs. + + :param g1: The first graph to add and compute + :param g2: The second graph to add and compute + :param edit_cost: The name of the edit cost function + :param method: The name of the computation method + :param options: The options of the method (like bash options), an empty string by default + :param init_option: The name of the init option, "EAGER_WITHOUT_SHUFFLED_COPIES" by default + :type g1: networksx.graph + :type g2: networksx.graph + :type edit_cost: string + :type method: string + :type options: string + :type init_option: string + :return: The edit distance between the two graphs and the nodeMap between them. + :rtype: double, list[tuple(size_t, size_t)] + + .. seealso:: list_of_edit_cost_options, list_of_method_options, list_of_init_options + .. note:: Make sure each parameter exists with your architecture and these lists : list_of_edit_cost_options, list_of_method_options, list_of_init_options. The structure of graphs must be similar as GXL. + + """ + if self.is_initialized() : + self.restart_env() + + g = self.add_nx_graph(g1, "") + h = self.add_nx_graph(g2, "") + + self.set_edit_cost(edit_cost) + self.init(init_option) + + self.set_method(method, options) + self.init_method() + + resDistance = 0 + resMapping = [] + self.run_method(g, h) + resDistance = self.get_upper_bound(g, h) + resMapping = self.get_node_map(g, h) + + return resDistance, resMapping + + + def compute_edit_distance_on_nx_graphs(self, dataset, classes, edit_cost, method, options, init_option="EAGER_WITHOUT_SHUFFLED_COPIES") : + """ + + Computes all the edit distance between each NX graphs on the dataset. + + :param dataset: The list of graphs to add and compute + :param classes: The classe of all the graph, can be an empty string + :param edit_cost: The name of the edit cost function + :param method: The name of the computation method + :param options: The options of the method (like bash options), an empty string by default + :param init_option: The name of the init option, "EAGER_WITHOUT_SHUFFLED_COPIES" by default + :type dataset: list[networksx.graph] + :type classes: string + :type edit_cost: string + :type method: string + :type options: string + :type init_option: string + :return: Two matrix, the first with edit distances between graphs and the second the nodeMap between graphs. The result between g and h is one the [g][h] coordinates. + :rtype: list[list[double]], list[list[list[tuple(size_t, size_t)]]] + + .. seealso:: list_of_edit_cost_options, list_of_method_options, list_of_init_options + .. note:: Make sure each parameter exists with your architecture and these lists : list_of_edit_cost_options, list_of_method_options, list_of_init_options. The structure of graphs must be similar as GXL. + + """ + if self.is_initialized() : + self.restart_env() + + print("Loading graphs in progress...") + for graph in dataset : + self.add_nx_graph(graph, classes) + listID = self.graph_ids() + print("Graphs loaded ! 
") + print("Number of graphs = " + str(listID[1])) + + self.set_edit_cost(edit_cost) + print("Initialization in progress...") + self.init(init_option) + print("Initialization terminated !") + + self.set_method(method, options) + self.init_method() + + resDistance = [[]] + resMapping = [[]] + for g in range(listID[0], listID[1]) : + print("Computation between graph " + str(g) + " with all the others including himself.") + for h in range(listID[0], listID[1]) : + #print("Computation between graph " + str(g) + " and graph " + str(h)) + self.run_method(g, h) + resDistance[g][h] = self.get_upper_bound(g, h) + resMapping[g][h] = self.get_node_map(g, h) + + print("Finish ! The return contains edit distances and NodeMap but you can check the result with graphs'ID until you restart the environment") + return resDistance, resMapping + + + def compute_edit_distance_on_GXl_graphs(self, path_folder, path_XML, edit_cost, method, options="", init_option="EAGER_WITHOUT_SHUFFLED_COPIES") : + """ + Computes all the edit distance between each GXL graphs on the folder and the XMl file. + + :param path_folder: The folder's path which contains GXL graphs + :param path_XML: The XML's path which indicates which graphes you want to load + :param edit_cost: The name of the edit cost function + :param method: The name of the computation method + :param options: The options of the method (like bash options), an empty string by default + :param init_option: The name of the init option, "EAGER_WITHOUT_SHUFFLED_COPIES" by default + :type path_folder: string + :type path_XML: string + :type edit_cost: string + :type method: string + :type options: string + :type init_option: string + :return: The list of the first and last-1 ID of graphs + :rtype: tuple(size_t, size_t) + + .. seealso:: list_of_edit_cost_options, list_of_method_options, list_of_init_options + .. note:: Make sure each parameter exists with your architecture and these lists : list_of_edit_cost_options, list_of_method_options, list_of_init_options. + + """ + + if self.is_initialized() : + self.restart_env() + + print("Loading graphs in progress...") + self.load_GXL_graphs(path_folder, path_XML) + listID = self.graph_ids() + print("Graphs loaded ! ") + print("Number of graphs = " + str(listID[1])) + + self.set_edit_cost(edit_cost) + print("Initialization in progress...") + self.init(init_option) + print("Initialization terminated !") + + self.set_method(method, options) + self.init_method() + + #res = [] + for g in range(listID[0], listID[1]) : + print("Computation between graph " + str(g) + " with all the others including himself.") + for h in range(listID[0], listID[1]) : + #print("Computation between graph " + str(g) + " and graph " + str(h)) + self.run_method(g,h) + #res.append((get_upper_bound(g,h), get_node_map(g,h), get_runtime(g,h))) + + #return res + + print ("Finish ! You can check the result with each ID of graphs ! There are in the return") + print ("Please don't restart the environment or recall this function, you will lose your results !") + return listID + + + def get_num_node_labels(self): + """ + Returns the number of node labels. + + :return: Number of pairwise different node labels contained in the environment. + :rtype: size_t + + .. note:: If 1 is returned, the nodes are unlabeled. + """ + return self.c_env.getNumNodeLabels() + + + def get_node_label(self, label_id): + """ + Returns node label. + + :param label_id: ID of node label that should be returned. Must be between 1 and get_num_node_labels(). 
+ :type label_id: size_t + :return: Node label for selected label ID. + :rtype: dict{string : string} + """ + return decode_your_map(self.c_env.getNodeLabel(label_id)) + + + def get_num_edge_labels(self): + """ + Returns the number of edge labels. + + :return: Number of pairwise different edge labels contained in the environment. + :rtype: size_t + + .. note:: If 1 is returned, the edges are unlabeled. + """ + return self.c_env.getNumEdgeLabels() + + + def get_edge_label(self, label_id): + """ + Returns edge label. + + :param label_id: ID of edge label that should be returned. Must be between 1 and get_num_edge_labels(). + :type label_id: size_t + :return: Edge label for selected label ID. + :rtype: dict{string : string} + """ + return decode_your_map(self.c_env.getEdgeLabel(label_id)) + + +# def get_num_nodes(self, graph_id): +# """ +# Returns the number of nodes. +# +# :param graph_id: ID of an input graph that has been added to the environment. +# :type graph_id: size_t +# :return: Number of nodes in the graph. +# :rtype: size_t +# """ +# return self.c_env.getNumNodes(graph_id) + + def get_avg_num_nodes(self): + """ + Returns average number of nodes. + + :return: Average number of nodes of the graphs contained in the environment. + :rtype: double + """ + return self.c_env.getAvgNumNodes() + + def get_node_rel_cost(self, node_label_1, node_label_2): + """ + Returns node relabeling cost. + + :param node_label_1: First node label. + :param node_label_2: Second node label. + :type node_label_1: dict{string : string} + :type node_label_2: dict{string : string} + :return: Node relabeling cost for the given node labels. + :rtype: double + """ + return self.c_env.getNodeRelCost(encode_your_map(node_label_1), encode_your_map(node_label_2)) + + + def get_node_del_cost(self, node_label): + """ + Returns node deletion cost. + + :param node_label: Node label. + :type node_label: dict{string : string} + :return: Cost of deleting node with given label. + :rtype: double + """ + return self.c_env.getNodeDelCost(encode_your_map(node_label)) + + + def get_node_ins_cost(self, node_label): + """ + Returns node insertion cost. + + :param node_label: Node label. + :type node_label: dict{string : string} + :return: Cost of inserting node with given label. + :rtype: double + """ + return self.c_env.getNodeInsCost(encode_your_map(node_label)) + + + def get_median_node_label(self, node_labels): + """ + Computes median node label. + + :param node_labels: The node labels whose median should be computed. + :type node_labels: list[dict{string : string}] + :return: Median of the given node labels. + :rtype: dict{string : string} + """ + node_labels_b = [encode_your_map(node_label) for node_label in node_labels] + return decode_your_map(self.c_env.getMedianNodeLabel(node_labels_b)) + + + def get_edge_rel_cost(self, edge_label_1, edge_label_2): + """ + Returns edge relabeling cost. + + :param edge_label_1: First edge label. + :param edge_label_2: Second edge label. + :type edge_label_1: dict{string : string} + :type edge_label_2: dict{string : string} + :return: Edge relabeling cost for the given edge labels. + :rtype: double + """ + return self.c_env.getEdgeRelCost(encode_your_map(edge_label_1), encode_your_map(edge_label_2)) + + + def get_edge_del_cost(self, edge_label): + """ + Returns edge deletion cost. + + :param edge_label: Edge label. + :type edge_label: dict{string : string} + :return: Cost of deleting edge with given label. 
+		:rtype: double
+		"""
+		return self.c_env.getEdgeDelCost(encode_your_map(edge_label))
+
+
+	def get_edge_ins_cost(self, edge_label):
+		"""
+		Returns edge insertion cost.
+
+		:param edge_label: Edge label.
+		:type edge_label: dict{string : string}
+		:return: Cost of inserting edge with given label.
+		:rtype: double
+		"""
+		return self.c_env.getEdgeInsCost(encode_your_map(edge_label))
+
+
+	def get_median_edge_label(self, edge_labels):
+		"""
+		Computes median edge label.
+
+		:param edge_labels: The edge labels whose median should be computed.
+		:type edge_labels: list[dict{string : string}]
+		:return: Median of the given edge labels.
+		:rtype: dict{string : string}
+		"""
+		edge_labels_b = [encode_your_map(edge_label) for edge_label in edge_labels]
+		return decode_your_map(self.c_env.getMedianEdgeLabel(edge_labels_b))
+
+
+	def get_nx_graph(self, graph_id, adj_matrix=True, adj_lists=False, edge_list=False):  # @todo
+		"""
+		Gets the graph with ID `graph_id` in the form of a NetworkX Graph.
+
+		Parameters
+		----------
+		graph_id : int
+			ID of the selected graph.
+
+		adj_matrix : bool
+			Set to `True` to construct an adjacency matrix `adj_matrix` and a hash-map `edge_labels`, which has a key for each pair `(i,j)` such that `adj_matrix[i][j]` equals 1. No effect for now.
+
+		adj_lists : bool
+			No effect for now.
+
+		edge_list : bool
+			No effect for now.
+
+		Returns
+		-------
+		NetworkX Graph object
+			The obtained graph.
+		"""
+		graph = nx.Graph()
+		graph.graph['id'] = graph_id
+
+		nb_nodes = self.get_graph_num_nodes(graph_id)
+		original_node_ids = self.get_original_node_ids(graph_id)
+		node_labels = self.get_graph_node_labels(graph_id)
+#		print(original_node_ids)
+#		print(node_labels)
+		graph.graph['original_node_ids'] = original_node_ids
+
+		for node_id in range(0, nb_nodes):
+			graph.add_node(node_id, **node_labels[node_id])
+#			graph.nodes[node_id]['original_node_id'] = original_node_ids[node_id]
+
+		edges = self.get_graph_edges(graph_id)
+		for (head, tail), labels in edges.items():
+			graph.add_edge(head, tail, **labels)
+#		print(edges)
+
+		return graph
+
+
+	def get_init_type(self):
+		"""
+		Returns the initialization type of the last initialization as a string.
+
+		Returns
+		-------
+		string
+			Initialization type as a string.
+		"""
+		return self.c_env.getInitType().decode('utf-8')
+
+
+#	def get_node_cost(self, label1, label2):
+#		"""
+#		Returns node relabeling, insertion, or deletion cost.
+
+#		Parameters
+#		----------
+#		label1 : int
+#			First node label.
+#
+#		label2 : int
+#			Second node label.
+#
+#		Returns
+#		-------
+#		Node relabeling cost if `label1` and `label2` are both different from `ged::dummy_label()`, node insertion cost if `label1` equals `ged::dummy_label` and `label2` does not, node deletion cost if `label1` does not equal `ged::dummy_label` and `label2` does, and 0 otherwise.
+#		"""
+#		return self.c_env.getNodeCost(label1, label2)
+
+
+	def load_nx_graph(self, nx_graph, graph_id, graph_name='', graph_class=''):
+		"""
+		Loads a NetworkX Graph into the GED environment.
+
+		Parameters
+		----------
+		nx_graph : NetworkX Graph object
+			The graph that should be loaded.
+
+		graph_id : int or None
+			The ID of a graph contained in the environment (overwrites the existing graph), or adds a new graph if `None`.
+
+		graph_name : string, optional
+			The name of the newly added graph. The default is ''. Has no effect unless `graph_id` equals `None`.
+
+		graph_class : string, optional
+			The class of the newly added graph. The default is ''. Has no effect unless `graph_id` equals `None`.
+ + Returns + ------- + int + The ID of the newly loaded graph. + """ + if graph_id is None: + graph_id = self.add_graph(graph_name, graph_class) + else: + self.clear_graph(graph_id) + for node in nx_graph.nodes: + self.add_node(graph_id, str(node), nx_graph.nodes[node]) + for edge in nx_graph.edges: + self.add_edge(graph_id, str(edge[0]), str(edge[1]), nx_graph.get_edge_data(edge[0], edge[1])) + return graph_id + + + def compute_induced_cost(self, g_id, h_id, node_map): + """ + Computes the edit cost between two graphs induced by a node map. + + Parameters + ---------- + g_id : int + ID of input graph. + h_id : int + ID of input graph. + node_map: gklearn.ged.env.NodeMap. + The NodeMap instance whose reduced cost will be computed and re-assigned. + + Returns + ------- + None. + """ + relation = [] + node_map.as_relation(relation) +# print(relation) + dummy_node = get_dummy_node() +# print(dummy_node) + for i, val in enumerate(relation): + val1 = dummy_node if val[0] == np.inf else val[0] + val2 = dummy_node if val[1] == np.inf else val[1] + relation[i] = tuple((val1, val2)) +# print(relation) + induced_cost = self.c_env.computeInducedCost(g_id, h_id, relation) + node_map.set_induced_cost(induced_cost) + + +##################################################################### +##LISTS OF EDIT COST FUNCTIONS, METHOD COMPUTATION AND INIT OPTIONS## +##################################################################### + +list_of_edit_cost_options = get_edit_cost_options() +list_of_method_options = get_method_options() +list_of_init_options = get_init_options() + + +##################### +##ERRORS MANAGEMENT## +##################### + +class Error(Exception): + """ + Class for error's management. This one is general. + """ + pass + + +class EditCostError(Error) : + """ + Class for Edit Cost Error. Raise an error if an edit cost function doesn't exist in the library (not in list_of_edit_cost_options). + + :attribute message: The message to print when an error is detected. + :type message: string + """ + def __init__(self, message): + """ + Inits the error with its message. + + :param message: The message to print when the error is detected + :type message: string + """ + self.message = message + + +class MethodError(Error) : + """ + Class for Method Error. Raise an error if a computation method doesn't exist in the library (not in list_of_method_options). + + :attribute message: The message to print when an error is detected. + :type message: string + """ + def __init__(self, message): + """ + Inits the error with its message. + + :param message: The message to print when the error is detected + :type message: string + """ + self.message = message + + +class InitError(Error) : + """ + Class for Init Error. Raise an error if an init option doesn't exist in the library (not in list_of_init_options). + + :attribute message: The message to print when an error is detected. + :type message: string + """ + def __init__(self, message): + """ + Inits the error with its message. + + :param message: The message to print when the error is detected + :type message: string + """ + self.message = message + + +######################################### +##PYTHON FUNCTIONS FOR SOME COMPUTATION## +######################################### + +def encode_your_map(map_u): + """ + Encodes Python unicode strings in dictionnary `map` to utf-8 byte strings for C++ functions. + + :param map_b: The map to encode + :type map_b: dict{string : string} + :return: The encoded map + :rtype: dict{'b'string : 'b'string} + + .. 
+
+
+def decode_your_map(map_b):
+    """
+    Decodes utf-8 byte strings in the dictionary `map_b` from C++ functions to Python unicode strings.
+
+    :param map_b: The map to decode
+    :type map_b: dict{'b'string : 'b'string}
+    :return: The decoded map
+    :rtype: dict{string : string}
+
+    .. note:: This function is used for type conversion at the Python/C++ boundary.
+
+    """
+    res = {}
+    for key, value in map_b.items():
+        res[key.decode('utf-8')] = value.decode('utf-8')
+    return res
+
+
+def decode_graph_edges(map_edge_b):
+    """
+    Decodes utf-8 byte strings in the graph edge map `map_edge_b` from C++ functions to Python unicode strings.
+
+    Parameters
+    ----------
+    map_edge_b : dict{tuple(size_t, size_t) : dict{'b'string : 'b'string}}
+        The map to decode.
+
+    Returns
+    -------
+    dict{tuple(size_t, size_t) : dict{string : string}}
+        The decoded map.
+
+    Notes
+    -----
+    This is a helper function for `GEDEnv.get_graph_edges()`.
+    """
+    map_edges = {}
+    for key, value in map_edge_b.items():
+        map_edges[key] = decode_your_map(value)
+    return map_edges
diff --git a/gklearn/gedlib/test.py b/gklearn/gedlib/test.py
index 28ae2ae91f..338000cb76 100755
--- a/gklearn/gedlib/test.py
+++ b/gklearn/gedlib/test.py
@@ -2,7 +2,7 @@
 
 #So that "import script" finds the libraries that GedLib needs
 #Equivalent to setting the LD_LIBRARY_PATH environment variable in a bash shell
-import librariesImport
+import libraries_import
 import gedlibpy
 import networkx as nx
diff --git a/gklearn/preimage/median_preimage_generator.py b/gklearn/preimage/median_preimage_generator.py
index 7cb362fc96..449c42862c 100644
--- a/gklearn/preimage/median_preimage_generator.py
+++ b/gklearn/preimage/median_preimage_generator.py
@@ -16,7 +16,7 @@ from gklearn.ged.util import compute_geds, ged_options_to_string
 from gklearn.ged.median import MedianGraphEstimator
 from gklearn.ged.median import constant_node_costs,mge_options_to_string
-from gklearn.gedlib import librariesImport, gedlibpy
+from gklearn.gedlib import libraries_import, gedlibpy
 from 
gklearn.utils import Timer from gklearn.utils.utils import get_graph_kernel_by_name