diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 41ec4b1e..49b8deec 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -90,7 +90,7 @@ class RunnerConfig: Configuration for scoring, classifier setup, learning parameters, and optional features. Attributes: - classifier (str): Classifier type used for semi-supervised learning ('LDA', 'SVM' or 'XGBoost'). + classifier (str): Classifier type used for semi-supervised learning Can either be a single classifier ('LDA', 'SVM', 'XGBoost') or a multiclassifier ('LDA_XGBoost'). autotune (bool): Whether to autotune hyperparameters for the classifier (XGBoost / SVM) ss_main_score (str): Starting main score for semi-supervised learning (can be 'auto'). main_score_selection_report (bool): Whether to generate a report for main score selection. @@ -127,7 +127,7 @@ class RunnerConfig: """ # Scoring / classifier options - classifier: Literal["LDA", "SVM", "XGBoost"] = "LDA" + classifier: Literal["LDA", "SVM", "XGBoost", 'LDA_XGBoost'] = "LDA" autotune: bool = False ss_main_score: str = "auto" main_score_selection_report: bool = False diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py index 0ba72b51..54c47432 100644 --- a/pyprophet/cli/score.py +++ b/pyprophet/cli/score.py @@ -12,7 +12,7 @@ memray_profile, ) from .._config import RunnerIOConfig -from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier +from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier, LDA_XGBoostMultiLearner # PyProphet semi-supervised learning and scoring @@ -43,7 +43,7 @@ "--classifier", default="LDA", show_default=True, - type=click.Choice(["LDA", "SVM", "XGBoost"]), + type=click.Choice(["LDA", "SVM", "XGBoost", "LDA_XGBoost"]), help='Either a "LDA", "SVM" or "XGBoost" classifier is used for semi-supervised learning.', ) @click.option( @@ -360,7 +360,7 @@ def score( config.subsample_ratio = 1.0 if not apply_weights: - if config.subsample_ratio < 1.0: + if config.subsample_ratio < 1.0: # currently LDA_XGBoostMultiLearner does not support subsampling logger.info( f"Conducting {level} semi-supervised learning on {config.subsample_ratio * 100}% of the data.", ) @@ -399,11 +399,18 @@ def score( PyProphetWeightApplier(weights_path, run_config).run() else: PyProphetWeightApplier(weights_path, config).run() - else: - logger.info( + else: # No subsampling + if config.runner.classifier == "LDA_XGBoost": + logger.info( + f"Conducting {level} semi-supervised learning with LDA followed by XGBoost.", + ) + LDA_XGBoostMultiLearner(config).run() + + else: + logger.info( f"Conducting {level} semi-supervised learning.", - ) - PyProphetLearner(config).run() + ) + PyProphetLearner(config).run() else: logger.info( f"Applying {level} weights from {apply_weights} to the full data set.", diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 89e284d9..91c1e95c 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -152,7 +152,7 @@ def _finalize_feature_table(self, df, ss_main_score): f"Main score ({main_score}) not found in input columns: {df.columns}" ) - if self.classifier == "XGBoost" and self.level != "alignment": + if self.classifier in ["XGBoost", "LDA_XGBoost"] and self.level != "alignment": logger.info( "Enable number of transitions & precursor / product charge scores for XGBoost-based classifier" ) diff --git a/pyprophet/scoring/runner.py b/pyprophet/scoring/runner.py index 347a421d..0cad5a08 100644 --- a/pyprophet/scoring/runner.py +++ b/pyprophet/scoring/runner.py @@ -255,6 +255,60 @@ def print_summary(self, result): logger.opt(raw=True).info("\n") +class PyProphetMultiLearner(PyProphetRunner): + """ + Implements the learning and scoring workflow for PyProphet with multiple classifiers run sequentially. + """ + + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def run_algo(self, part=None): + if self.glyco: + raise click.ClickException( + "Multi-classifier learning is not supported for glycopeptide workflows." + ) + + +class LDA_XGBoostMultiLearner(PyProphetMultiLearner): + """ + Implements the learning and scoring workflow for PyProphet with multiple classifiers run sequentially + """ + + def run_algo(self, part=None): + """ + Runs the learning and scoring algorithm for multiple classifiers. + + Returns: + tuple: A tuple containing the result, scorer, and weights. + """ + + super(LDA_XGBoostMultiLearner, self).run_algo(part) + + config_lda = self.config.copy() + config_lda.runner.classifier = "LDA" + + # remove columns that are not needed for LDA + table_lda = self.table.drop(columns=["var_precursor_charge", "var_product_charge", "var_transition_count"], errors='ignore') + + (result_lda, scorer_lda, weights_lda) = PyProphet(config_lda).learn_and_apply(table_lda) + + # rename the column that was the main score + self.table.columns = self.table.columns.str.replace('^main', '', regex=True) + + self.table['main_var_lda_score'] = result_lda.scored_tables['d_score'] + + logger.info("LDA scores computed! Now running XGBoost using the LDA score as the main score") + + config_xgb = self.config.copy() + config_xgb.runner.ss_main_score = 'var_lda_score' # use lda score as the main score for XGBoost + config_xgb.runner.classifier = "XGBoost" + config_xgb.runner.ss_use_dynamic_main_score = False # since using lda score do not need to dynamically select the main score + self.config.runner.classifier = "XGBoost" # need to change to XGBoost for saving the weights + + (result_xgb, scorer_xgb, weights_xgb) = PyProphet(config_xgb).learn_and_apply(self.table) + return (result_xgb, scorer_xgb, weights_xgb) + class PyProphetLearner(PyProphetRunner): """ Implements the learning and scoring workflow for PyProphet. diff --git a/tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out b/tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out new file mode 100644 index 00000000..dabd5a1e --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out @@ -0,0 +1,14 @@ + feature_id ms1_precursor_pep ms2_peakgroup_pep ms2_precursor_pep +0 -9078977811506172301 0.0063 0.0022 0.0025 +1 -9009602369958523731 0.0063 0.0022 0.0325 +2 -8990894093332793487 0.0063 0.0022 0.0025 +3 -8915955323477460297 0.0063 0.0022 0.0071 +4 -8858715981476206597 0.0063 0.0022 0.0025 +.. ... ... ... ... +95 -2912234918591861719 0.0063 0.0022 0.0025 +96 -2872329084347808160 0.0063 0.0022 0.0025 +97 -2789098353857361973 1.0000 0.0022 0.0025 +98 -2788620575140019858 0.0063 0.0022 0.0025 +99 -2741276427609241638 0.0063 0.0022 0.0325 + +[100 rows x 4 columns] diff --git a/tests/test_pyprophet_score.py b/tests/test_pyprophet_score.py index 14e6ccfb..2fe7e144 100644 --- a/tests/test_pyprophet_score.py +++ b/tests/test_pyprophet_score.py @@ -190,6 +190,8 @@ def execute(self, levels=None, **kwargs): level_cmd += " --classifier=XGBoost" if kwargs.get("xgboost_tune"): level_cmd += " --autotune" + if kwargs.get("lda_xgboost"): + level_cmd += " --classifier=LDA_XGBoost" if kwargs.get("score_filter"): level_cmd = self.config.add_score_filter(level_cmd, level) @@ -770,6 +772,19 @@ def test_osw_9(test_runner, test_config, regtest): def test_osw_10(test_runner, test_config, regtest): run_metabo_test(test_runner, test_config, regtest, ms1ms2=True, score_filter=True) +# Tests LDA then XGBoost +def test_osw_11(test_runner, test_config, regtest): + run_generic_test( + test_runner, + test_config, + OSWTestStrategy, + regtest, + pfdr=True, + pi0_lambda="0 0 0", + ms1ms2=True, + lda_xgboost=True, + ) + # Parquet Tests def test_parquet_0(test_runner, test_config, regtest):