From 15f0e9c7619057e1fa45e53dadc10e5b316dfc66 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Wed, 30 Jul 2025 17:50:10 -0400 Subject: [PATCH 1/8] feature: LDA then XGBoost New workflow which first runs LDA and then runs XGBoost using the LDA results as the main score. This helps prevent overfitting with XGBoost; results are pretty comparable to XGBoost --- pyprophet/cli/score.py | 18 ++++++++++----- pyprophet/io/_base.py | 2 +- pyprophet/scoring/runner.py | 44 +++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py index 0ba72b51..49ea355c 100644 --- a/pyprophet/cli/score.py +++ b/pyprophet/cli/score.py @@ -12,7 +12,7 @@ memray_profile, ) from .._config import RunnerIOConfig -from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier +from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier, PyProphetMultiLearner # PyProphet semi-supervised learning and scoring @@ -43,7 +43,7 @@ "--classifier", default="LDA", show_default=True, - type=click.Choice(["LDA", "SVM", "XGBoost"]), + type=click.Choice(["LDA", "SVM", "XGBoost", "LDA_XGBoost"]), help='Either a "LDA", "SVM" or "XGBoost" classifier is used for semi-supervised learning.', ) @click.option( @@ -400,10 +400,18 @@ def score( else: PyProphetWeightApplier(weights_path, config).run() else: - logger.info( + if config.runner.classifier == "LDA_XGBoost": + logger.info( + f"Conducting {level} semi-supervised learning with LDA followed by XGBoost.", + ) + PyProphetMultiLearner(config).run() + + + else: + logger.info( f"Conducting {level} semi-supervised learning.", - ) - PyProphetLearner(config).run() + ) + PyProphetLearner(config).run() else: logger.info( f"Applying {level} weights from {apply_weights} to the full data set.", diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 89e284d9..91c1e95c 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -152,7 +152,7 @@ def
_finalize_feature_table(self, df, ss_main_score): f"Main score ({main_score}) not found in input columns: {df.columns}" ) - if self.classifier == "XGBoost" and self.level != "alignment": + if self.classifier in ["XGBoost", "LDA_XGBoost"] and self.level != "alignment": logger.info( "Enable number of transitions & precursor / product charge scores for XGBoost-based classifier" ) diff --git a/pyprophet/scoring/runner.py b/pyprophet/scoring/runner.py index 347a421d..64374027 100644 --- a/pyprophet/scoring/runner.py +++ b/pyprophet/scoring/runner.py @@ -255,6 +255,50 @@ def print_summary(self, result): logger.opt(raw=True).info("\n") +class PyProphetMultiLearner(PyProphetRunner): + """ + Implements the learning and scoring workflow for PyProphet with multiple classifiers run sequentially + """ + + def run_algo(self, part=None): + """ + Runs the learning and scoring algorithm for multiple classifiers. + + Returns: + tuple: A tuple containing the result, scorer, and weights. + """ + if self.glyco: + raise click.ClickException( + "Multi-classifier learning is not supported for glycopeptide workflows." + ) + else: + config_lda = self.config.copy() + config_lda.runner.classifier = "LDA" + + # remove columns that are not needed for LDA + table_lda = self.table.drop(columns=["var_precursor_charge", "var_product_charge", "var_transition_count"], errors='ignore') + + (result_lda, scorer_lda, weights_lda) = PyProphet(config_lda).learn_and_apply(table_lda) + self.table['main_var_lda_score'] = result_lda.scored_tables['d_score'] + + logger.info("LDA scores computed! 
Now running XGBoost on top of LDA scores.") + + # rename the column that was the main score + found = False + for col in self.table.columns: + if col.startswith("main") and not found: + self.table = self.table.rename(columns={col:col[5:]}) + found = True + + config_xgb = self.config.copy() + config_xgb.runner.ss_main_score = 'var_lda_score' # use lda score as the main score for XGBoost + config_xgb.runner.classifier = "XGBoost" + config_xgb.runner.ss_use_dynamic_main_score = False # since using lda score do not ned to dynamically select the main score + self.config.runner.classifier = "XGBoost" # need to change to XGBoost for saving the weights + + (result_xgb, scorer_xgb, weights_xgb) = PyProphet(config_xgb).learn_and_apply(self.table) + return (result_xgb, scorer_xgb, weights_xgb) + class PyProphetLearner(PyProphetRunner): """ Implements the learning and scoring workflow for PyProphet. From 038899117d669c23695933bc0ea2e925826debb7 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 7 Aug 2025 17:03:35 -0400 Subject: [PATCH 2/8] test: add test for pyprophet_lda --- .../test_pyprophet_score.test_osw_11.out | 14 ++++++++++++++ tests/test_pyprophet_score.py | 15 +++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out diff --git a/tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out b/tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out new file mode 100644 index 00000000..dabd5a1e --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out @@ -0,0 +1,14 @@ + feature_id ms1_precursor_pep ms2_peakgroup_pep ms2_precursor_pep +0 -9078977811506172301 0.0063 0.0022 0.0025 +1 -9009602369958523731 0.0063 0.0022 0.0325 +2 -8990894093332793487 0.0063 0.0022 0.0025 +3 -8915955323477460297 0.0063 0.0022 0.0071 +4 -8858715981476206597 0.0063 0.0022 0.0025 +.. ... ... ... ... 
+95 -2912234918591861719 0.0063 0.0022 0.0025 +96 -2872329084347808160 0.0063 0.0022 0.0025 +97 -2789098353857361973 1.0000 0.0022 0.0025 +98 -2788620575140019858 0.0063 0.0022 0.0025 +99 -2741276427609241638 0.0063 0.0022 0.0325 + +[100 rows x 4 columns] diff --git a/tests/test_pyprophet_score.py b/tests/test_pyprophet_score.py index 14e6ccfb..2fe7e144 100644 --- a/tests/test_pyprophet_score.py +++ b/tests/test_pyprophet_score.py @@ -190,6 +190,8 @@ def execute(self, levels=None, **kwargs): level_cmd += " --classifier=XGBoost" if kwargs.get("xgboost_tune"): level_cmd += " --autotune" + if kwargs.get("lda_xgboost"): + level_cmd += " --classifier=LDA_XGBoost" if kwargs.get("score_filter"): level_cmd = self.config.add_score_filter(level_cmd, level) @@ -770,6 +772,19 @@ def test_osw_9(test_runner, test_config, regtest): def test_osw_10(test_runner, test_config, regtest): run_metabo_test(test_runner, test_config, regtest, ms1ms2=True, score_filter=True) +# Tests LDA then XGBoost +def test_osw_11(test_runner, test_config, regtest): + run_generic_test( + test_runner, + test_config, + OSWTestStrategy, + regtest, + pfdr=True, + pi0_lambda="0 0 0", + ms1ms2=True, + lda_xgboost=True, + ) + # Parquet Tests def test_parquet_0(test_runner, test_config, regtest): From 8df4a91dd0e361592a205133463b3bd5dbd4d897 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Thu, 7 Aug 2025 17:06:10 -0400 Subject: [PATCH 3/8] minor: make log more verbose --- pyprophet/scoring/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyprophet/scoring/runner.py b/pyprophet/scoring/runner.py index 64374027..aa8215c1 100644 --- a/pyprophet/scoring/runner.py +++ b/pyprophet/scoring/runner.py @@ -281,7 +281,7 @@ def run_algo(self, part=None): (result_lda, scorer_lda, weights_lda) = PyProphet(config_lda).learn_and_apply(table_lda) self.table['main_var_lda_score'] = result_lda.scored_tables['d_score'] - logger.info("LDA scores computed! 
Now running XGBoost on top of LDA scores.") + logger.info("LDA scores computed! Now running XGBoost using the LDA score as the main score") # rename the column that was the main score found = False From f6d48dcaa187f0133d2112b2baeb1611d005b850 Mon Sep 17 00:00:00 2001 From: Joshua Charkow <47336288+jcharkow@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:57:38 -0400 Subject: [PATCH 4/8] remove extra line Co-authored-by: Justin Sing <32938975+singjc@users.noreply.github.com> --- pyprophet/cli/score.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py index 49ea355c..51303924 100644 --- a/pyprophet/cli/score.py +++ b/pyprophet/cli/score.py @@ -406,7 +406,6 @@ def score( ) PyProphetMultiLearner(config).run() - else: logger.info( f"Conducting {level} semi-supervised learning.", From ac82df15f6ba8afc3780db0de42ed8860af70b82 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 12 Aug 2025 17:22:30 -0400 Subject: [PATCH 5/8] refactor: make MultiLearner an abstract class --- pyprophet/cli/score.py | 4 +- pyprophet/scoring/runner.py | 75 ++++++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py index 51303924..a30e5136 100644 --- a/pyprophet/cli/score.py +++ b/pyprophet/cli/score.py @@ -12,7 +12,7 @@ memray_profile, ) from .._config import RunnerIOConfig -from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier, PyProphetMultiLearner +from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier, LDA_XGBoostMultiLearner # PyProphet semi-supervised learning and scoring @@ -404,7 +404,7 @@ def score( logger.info( f"Conducting {level} semi-supervised learning with LDA followed by XGBoost.", ) - PyProphetMultiLearner(config).run() + LDA_XGBoostMultiLearner(config).run() else: logger.info( diff --git a/pyprophet/scoring/runner.py b/pyprophet/scoring/runner.py index aa8215c1..4c0e7175 100644 --- 
a/pyprophet/scoring/runner.py +++ b/pyprophet/scoring/runner.py @@ -256,6 +256,21 @@ def print_summary(self, result): class PyProphetMultiLearner(PyProphetRunner): + """ + Implements the learning and scoring workflow for PyProphet with multiple classifiers run sequentially. + """ + + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def run_algo(self, part=None): + if self.glyco: + raise click.ClickException( + "Multi-classifier learning is not supported for glycopeptide workflows." + ) + + +class LDA_XGBoostMultiLearner(PyProphetMultiLearner): """ Implements the learning and scoring workflow for PyProphet with multiple classifiers run sequentially """ @@ -267,37 +282,35 @@ def run_algo(self, part=None): Returns: tuple: A tuple containing the result, scorer, and weights. """ - if self.glyco: - raise click.ClickException( - "Multi-classifier learning is not supported for glycopeptide workflows." - ) - else: - config_lda = self.config.copy() - config_lda.runner.classifier = "LDA" - - # remove columns that are not needed for LDA - table_lda = self.table.drop(columns=["var_precursor_charge", "var_product_charge", "var_transition_count"], errors='ignore') - - (result_lda, scorer_lda, weights_lda) = PyProphet(config_lda).learn_and_apply(table_lda) - self.table['main_var_lda_score'] = result_lda.scored_tables['d_score'] - - logger.info("LDA scores computed! 
Now running XGBoost using the LDA score as the main score") - - # rename the column that was the main score - found = False - for col in self.table.columns: - if col.startswith("main") and not found: - self.table = self.table.rename(columns={col:col[5:]}) - found = True - - config_xgb = self.config.copy() - config_xgb.runner.ss_main_score = 'var_lda_score' # use lda score as the main score for XGBoost - config_xgb.runner.classifier = "XGBoost" - config_xgb.runner.ss_use_dynamic_main_score = False # since using lda score do not ned to dynamically select the main score - self.config.runner.classifier = "XGBoost" # need to change to XGBoost for saving the weights - - (result_xgb, scorer_xgb, weights_xgb) = PyProphet(config_xgb).learn_and_apply(self.table) - return (result_xgb, scorer_xgb, weights_xgb) + + super(LDA_XGBoostMultiLearner, self).run_algo(part) + + config_lda = self.config.copy() + config_lda.runner.classifier = "LDA" + + # remove columns that are not needed for LDA + table_lda = self.table.drop(columns=["var_precursor_charge", "var_product_charge", "var_transition_count"], errors='ignore') + + (result_lda, scorer_lda, weights_lda) = PyProphet(config_lda).learn_and_apply(table_lda) + self.table['main_var_lda_score'] = result_lda.scored_tables['d_score'] + + logger.info("LDA scores computed! 
Now running XGBoost using the LDA score as the main score") + + # rename the column that was the main score + found = False + for col in self.table.columns: + if col.startswith("main") and not found: + self.table = self.table.rename(columns={col:col[5:]}) + found = True + + config_xgb = self.config.copy() + config_xgb.runner.ss_main_score = 'var_lda_score' # use lda score as the main score for XGBoost + config_xgb.runner.classifier = "XGBoost" + config_xgb.runner.ss_use_dynamic_main_score = False # since using lda score do not ned to dynamically select the main score + self.config.runner.classifier = "XGBoost" # need to change to XGBoost for saving the weights + + (result_xgb, scorer_xgb, weights_xgb) = PyProphet(config_xgb).learn_and_apply(self.table) + return (result_xgb, scorer_xgb, weights_xgb) class PyProphetLearner(PyProphetRunner): """ From 493c343581e26a5f39a7aa59eff336f9ada8c4ef Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 12 Aug 2025 17:31:09 -0400 Subject: [PATCH 6/8] apply comments: main score column renaming --- pyprophet/scoring/runner.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pyprophet/scoring/runner.py b/pyprophet/scoring/runner.py index 4c0e7175..93d23624 100644 --- a/pyprophet/scoring/runner.py +++ b/pyprophet/scoring/runner.py @@ -292,17 +292,14 @@ def run_algo(self, part=None): table_lda = self.table.drop(columns=["var_precursor_charge", "var_product_charge", "var_transition_count"], errors='ignore') (result_lda, scorer_lda, weights_lda) = PyProphet(config_lda).learn_and_apply(table_lda) + + # rename the column that was the main score + self.table.columns = self.table.columns.str.replace('^main', '', regex=True) + self.table['main_var_lda_score'] = result_lda.scored_tables['d_score'] logger.info("LDA scores computed! 
Now running XGBoost using the LDA score as the main score") - # rename the column that was the main score - found = False - for col in self.table.columns: - if col.startswith("main") and not found: - self.table = self.table.rename(columns={col:col[5:]}) - found = True - config_xgb = self.config.copy() config_xgb.runner.ss_main_score = 'var_lda_score' # use lda score as the main score for XGBoost config_xgb.runner.classifier = "XGBoost" From 2961a67e798142ece0da81dece273ba50802a15c Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 12 Aug 2025 17:34:32 -0400 Subject: [PATCH 7/8] apply comments: add LDA_XGBoost to config --- pyprophet/_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 41ec4b1e..49b8deec 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -90,7 +90,7 @@ class RunnerConfig: Configuration for scoring, classifier setup, learning parameters, and optional features. Attributes: - classifier (str): Classifier type used for semi-supervised learning ('LDA', 'SVM' or 'XGBoost'). + classifier (str): Classifier type used for semi-supervised learning Can either be a single classifier ('LDA', 'SVM', 'XGBoost') or a multiclassifier ('LDA_XGBoost'). autotune (bool): Whether to autotune hyperparameters for the classifier (XGBoost / SVM) ss_main_score (str): Starting main score for semi-supervised learning (can be 'auto'). main_score_selection_report (bool): Whether to generate a report for main score selection. 
@@ -127,7 +127,7 @@ class RunnerConfig: """ # Scoring / classifier options - classifier: Literal["LDA", "SVM", "XGBoost"] = "LDA" + classifier: Literal["LDA", "SVM", "XGBoost", 'LDA_XGBoost'] = "LDA" autotune: bool = False ss_main_score: str = "auto" main_score_selection_report: bool = False From 224d45b85c1e55c61f8946a7c640a28c0d93dc27 Mon Sep 17 00:00:00 2001 From: Joshua Charkow Date: Tue, 19 Aug 2025 16:26:18 -0400 Subject: [PATCH 8/8] apply copilot suggestions --- pyprophet/cli/score.py | 4 ++-- pyprophet/scoring/runner.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyprophet/cli/score.py b/pyprophet/cli/score.py index a30e5136..54c47432 100644 --- a/pyprophet/cli/score.py +++ b/pyprophet/cli/score.py @@ -360,7 +360,7 @@ def score( config.subsample_ratio = 1.0 if not apply_weights: - if config.subsample_ratio < 1.0: + if config.subsample_ratio < 1.0: # currently LDA_XGBoostMultiLearner does not support subsampling logger.info( f"Conducting {level} semi-supervised learning on {config.subsample_ratio * 100}% of the data.", ) @@ -399,7 +399,7 @@ def score( PyProphetWeightApplier(weights_path, run_config).run() else: PyProphetWeightApplier(weights_path, config).run() - else: + else: # No subsampling if config.runner.classifier == "LDA_XGBoost": logger.info( f"Conducting {level} semi-supervised learning with LDA followed by XGBoost.", diff --git a/pyprophet/scoring/runner.py b/pyprophet/scoring/runner.py index 93d23624..0cad5a08 100644 --- a/pyprophet/scoring/runner.py +++ b/pyprophet/scoring/runner.py @@ -303,7 +303,7 @@ def run_algo(self, part=None): config_xgb = self.config.copy() config_xgb.runner.ss_main_score = 'var_lda_score' # use lda score as the main score for XGBoost config_xgb.runner.classifier = "XGBoost" - config_xgb.runner.ss_use_dynamic_main_score = False # since using lda score do not ned to dynamically select the main score + config_xgb.runner.ss_use_dynamic_main_score = False # since using lda score do not need to 
dynamically select the main score self.config.runner.classifier = "XGBoost" # need to change to XGBoost for saving the weights (result_xgb, scorer_xgb, weights_xgb) = PyProphet(config_xgb).learn_and_apply(self.table)