ml_pid_cbm.binary.validate_binary_model

  1import argparse
  2import os
  3import sys
  4from collections import defaultdict
  5from typing import List, Tuple
  6
  7import numpy as np
  8import pandas as pd
  9from hipe4ml.model_handler import ModelHandler
 10from sklearn.metrics import confusion_matrix
 11
 12from ml_pid_cbm.tools import json_tools, plotting_tools
 13from ml_pid_cbm.tools.load_data import LoadData
 14from ml_pid_cbm.tools.particles_id import ParticlesId as Pid
 15from ml_pid_cbm.validate_model import ValidateModel
 16
 17
 18class ValidateBinaryModel(ValidateModel):
 19    """
 20    Class for testing the ml model
 21    """
 22
 23    def __init__(
 24        self,
 25        lower_p_cut: float,
 26        upper_p_cut: float,
 27        anti_particles: bool,
 28        json_file_name: str,
 29        particles_df: pd.DataFrame,
 30    ):
 31        super().__init__(
 32            lower_p_cut, upper_p_cut, anti_particles, json_file_name, particles_df
 33        )
 34        self.classes_names = ["kaons", "bckgr"]
 35
 36    def get_n_classes(self):
 37        return len(self.classes_names)
 38
 39    def xgb_preds(self, proba: float):
 40        """Gets particle type as selected by xgboost model if above probability threshold.
 41
 42        Args:
 43            proba(float): Probablity threshold to classify particle as signal.
 44        """
 45        df = self.particles_df
 46        df["xgb_preds"] = 0
 47        # setting to bckgr if smaller than probability threshold
 48        particle = df["model_output"] < proba
 49        df.loc[~(particle), "xgb_preds"] = 1
 50
 51        self.particles_df = df
 52
 53    def remap_names(self):
 54        """
 55        Remaps Pid of particles to output format from XGBoost Model.
 56        Kaons: 0; Other: 1
 57
 58        """
 59        df = self.particles_df
 60        if self.anti_particles:
 61            df[self.pid_variable_name] = (
 62                df[self.pid_variable_name]
 63                .map(
 64                    defaultdict(
 65                        lambda: 1.0,
 66                        {
 67                            Pid.NEG_KAON.value: 0.0,
 68                        },
 69                    ),
 70                    na_action="ignore",
 71                )
 72                .astype(float)
 73            )
 74        else:
 75            df[self.pid_variable_name] = (
 76                df[self.pid_variable_name]
 77                .map(
 78                    defaultdict(
 79                        lambda: 1.0,
 80                        {
 81                            Pid.POS_KAON.value: 0.0,
 82                        },
 83                    ),
 84                    na_action="ignore",
 85                )
 86                .astype(float)
 87            )
 88        self.particles_df = df
 89
 90    def evaluate_probas(
 91        self,
 92        start: float = 0.3,
 93        stop: float = 0.98,
 94        n_steps: int = 30,
 95        purity_cut: float = 0.0,
 96        save_fig: bool = True,
 97    ) -> Tuple[float, float, float]:
 98        """Method for evaluating probability (BDT) cut effect on efficency and purity.
 99
100        Args:
101            start (float, optional): Lower range of probablity cuts. Defaults to 0.3.
102            stop (float, optional): Upper range of probablity cuts. Defaults to 0.98.
103            n_steps (int, optional): Number of probability cuts to try. Defaults to 30.
104            pid_variable_name (str, optional): Name of the variable containing true Pid. Defaults to "Complex_pid".
105            purity_cut (float, optional): Minimal purity for automatic cuts selection. Defaults to 0..
106            save_fig (bool, optional): Saves figures (BDT cut vs efficiency and purity) to file if True. Defaults to True.
107
108        Returns:
109            Tuple[float, float, float]: Probability cut for each variable.
110        """
111        print(
112            f"Checking efficiency and purity for {int(n_steps)} probablity cuts between {start}, and {stop}..."
113        )
114        probas = np.linspace(start, stop, n_steps)
115        efficiencies = []
116        purities = []
117        best_cut = 0.0
118        max_efficiency = 0.0
119        max_purity = 0.0
120
121        for proba in probas:
122            self.xgb_preds(proba)
123            # confusion matrix
124            cnf_matrix = confusion_matrix(
125                self.particles_df[self.pid_variable_name],
126                self.particles_df["xgb_preds"],
127            )
128            pid = 0
129            efficiency, purity = self.efficiency_stats(
130                cnf_matrix, pid, print_output=False
131            )
132            efficiencies.append(efficiency)
133            purities.append(purity)
134            if purity_cut > 0.0:
135                # Minimal purity for automatic threshold selection.
136                # Will choose the highest efficiency for purity above this value.
137                if purity >= purity_cut:
138                    if efficiency > max_efficiency:
139                        best_cut = proba
140                        max_efficiency = efficiency
141                        max_purity = purity
142                # If max purity is below this value, will choose the highest purity available.
143                else:
144                    if purity > max_purity:
145                        best_cut = proba
146                        max_efficiency = efficiency
147                        max_purity = purity
148
149        plotting_tools.plot_efficiency_purity(
150            probas, [efficiencies], [purities], save_fig, particle_names=["kaons"]
151        )
152        if save_fig:
153            print("Plots ready!")
154        if purity_cut > 0:
155            print(f"Selected probaility cuts: {best_cut}")
156            return best_cut
157        else:
158            return -1.0
159
160    def confusion_matrix_and_stats(
161        self, efficiency_filename: str = "efficiency_stats.txt", save_fig: bool = True
162    ):
163        """
164        Generates confusion matrix and efficiency/purity stats.
165        """
166        cnf_matrix = confusion_matrix(
167            self.particles_df[self.pid_variable_name], self.particles_df["xgb_preds"]
168        )
169        plotting_tools.plot_confusion_matrix(
170            cnf_matrix, classes=self.classes_names, save_fig=save_fig
171        )
172        plotting_tools.plot_confusion_matrix(
173            cnf_matrix, classes=self.classes_names, normalize=True, save_fig=save_fig
174        )
175        txt_file = open(efficiency_filename, "w+")
176        for pid in range(self.get_n_classes() - 1):
177            self.efficiency_stats(cnf_matrix, pid, txt_file)
178        txt_file.close()
179
180    def generate_plots(self):
181        """
182        Generate tof, mass2, vars, and pT-rapidity plots
183        """
184        self._tof_plots()
185        self._mass2_plots()
186        self._vars_distributions_plots()
187
188    def _tof_plots(self):
189        """
190        Generates tof plots.
191        """
192        for pid, particle_name in enumerate(self.classes_names):
193            # simulated:
194            try:
195                plotting_tools.tof_plot(
196                    self.particles_df[self.particles_df[self.pid_variable_name] == pid],
197                    self.json_file_name,
198                    f"{particle_name} (all simulated)",
199                )
200            except ValueError:
201                print(f"No simulated {particle_name}s")
202            # xgb selected
203            try:
204                plotting_tools.tof_plot(
205                    self.particles_df[self.particles_df["xgb_preds"] == pid],
206                    self.json_file_name,
207                    f"{particle_name} (XGB-selected)",
208                )
209            except ValueError:
210                print(f"No XGB-selected {particle_name}s")
211
212    def _mass2_plots(self):
213        """
214        Generates mass2 plots.
215        """
216        kaons_range = (-0.2, 0.6)
217        for pid, particle_name in enumerate(self.classes_names):
218            plotting_tools.plot_mass2(
219                self.particles_df[self.particles_df["xgb_preds"] == pid][
220                    self.mass2_variable_name
221                ],
222                self.particles_df[self.particles_df[self.pid_variable_name] == pid][
223                    self.mass2_variable_name
224                ],
225                particle_name,
226                kaons_range,
227            )
228            plotting_tools.plot_all_particles_mass2(
229                self.particles_df[self.particles_df["xgb_preds"] == pid],
230                self.mass2_variable_name,
231                self.pid_variable_name,
232                particle_name,
233                kaons_range,
234            )
235
236    def _vars_distributions_plots(self):
237        """
238        Generates distributions of variables and pT-rapidity graphs.
239        """
240        vars_to_draw = json_tools.load_vars_to_draw(self.json_file_name)
241        for pid, particle_name in enumerate(self.classes_names):
242            plotting_tools.var_distributions_plot(
243                vars_to_draw,
244                [
245                    self.particles_df[
246                        (self.particles_df[self.pid_variable_name] == pid)
247                    ],
248                    self.particles_df[
249                        (
250                            (self.particles_df[self.pid_variable_name] == pid)
251                            & (self.particles_df["xgb_preds"] == pid)
252                        )
253                    ],
254                    self.particles_df[
255                        (
256                            (self.particles_df[self.pid_variable_name] != pid)
257                            & (self.particles_df["xgb_preds"] == pid)
258                        )
259                    ],
260                ],
261                [
262                    f"true MC {particle_name}",
263                    f"true selected {particle_name}",
264                    f"false selected {particle_name}",
265                ],
266                filename=f"vars_dist_{particle_name}",
267            )
268            plotting_tools.plot_eff_pT_rap(self.particles_df, pid)
269            plotting_tools.plot_pt_rapidity(self.particles_df, pid)
270
271
def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Builds the command line parser of the binary validation script and runs it.

    Exactly one of --probabilitycuts / --evaluateproba has to be given;
    --interactive and --automatic exclude each other. Options declared with
    ``nargs`` are returned as lists.

    Args:
        args (List[str]): Arguments from the command line, should be sys.argv[1:].

    Returns:
        argparse.Namespace: Namespace containing the parsed arguments.
    """
    cli = argparse.ArgumentParser(
        prog="ML_PID_CBM ValidateBinaryModel",
        description="Program for validating Binary PID ML models",
    )
    # mandatory inputs
    cli.add_argument(
        "--config",
        "-c",
        nargs=1,
        required=True,
        type=str,
        help="Filename of path of config json file.",
    )
    cli.add_argument(
        "--modelname",
        "-m",
        nargs=1,
        required=True,
        type=str,
        help="Name of folder containing trained ml model.",
    )
    # source of the probability cut: fixed value or a scan
    cut_source = cli.add_mutually_exclusive_group(required=True)
    cut_source.add_argument(
        "--probabilitycuts",
        "-p",
        nargs=1,
        type=float,
        help="Probability cut value e.g., 0.2",
    )
    cut_source.add_argument(
        "--evaluateproba",
        "-e",
        nargs=3,
        type=float,
        help="""Minimal probability cut, maximal, and number of steps to investigate. 
        Contrary to multi model, the lower the value of probability, the higher probability of signal""",
    )
    cli.add_argument(
        "--nworkers",
        "-n",
        type=int,
        default=1,
        help="Max number of workers for ThreadPoolExecutor which reads Root tree with data.",
    )
    # how the final cut is picked after a scan
    selection_mode = cli.add_mutually_exclusive_group()
    selection_mode.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interactive mode allows selection of probability cuts after evaluating them.",
    )
    selection_mode.add_argument(
        "--automatic",
        "-a",
        nargs=1,
        type=float,
        help="""Minimal purity for automatic threshold selection (in percent) e.g., 90.
        Will choose the highest efficiency for purity above this value.
        If max purity is below this value, will choose the highest purity available.""",
    )
    parsed = cli.parse_args(args)
    return parsed
342
343
if __name__ == "__main__":
    # parser for main class
    args = parse_args(sys.argv[1:])
    # config  arguments to be loaded from args
    json_file_name = args.config[0]
    model_name = args.modelname[0]
    # Cut given on the command line; -1.0 means "not chosen yet".
    # (Previously stored under another name, which left `proba` undefined
    # when --probabilitycuts was used.)
    proba = args.probabilitycuts[0] if args.probabilitycuts is not None else -1.0

    n_workers = args.nworkers
    purity_cut = args.automatic[0] if args.automatic is not None else 0.0
    lower_p, upper_p, is_anti = ValidateModel.parse_model_name(model_name)
    # loading test data
    data_file_name = json_tools.load_file_name(json_file_name, "test")

    loader = LoadData(data_file_name, json_file_name, lower_p, upper_p, is_anti)
    # loading model handler and applying on dataset
    print(
        f"\nLoading data from {data_file_name}\nApplying model handler from {model_name}"
    )
    # NOTE(review): everything below runs inside the model folder, so relative
    # paths (config file, output files) are resolved against it.
    os.chdir(f"{model_name}")
    model_hdl = ModelHandler()
    model_hdl.load_model_handler(model_name)
    test_particles = loader.load_tree(model_handler=model_hdl, max_workers=n_workers)
    # validate model object
    validate = ValidateBinaryModel(
        lower_p, upper_p, is_anti, json_file_name, test_particles.get_data_frame()
    )
    # remap Pid to match output XGBoost format
    validate.remap_names()
    pid_variable_name = json_tools.load_var_name(json_file_name, "pid")
    # scan the probability cuts; returns -1.0 unless --automatic was given
    if args.evaluateproba is not None:
        proba = validate.evaluate_probas(
            args.evaluateproba[0],
            args.evaluateproba[1],
            int(args.evaluateproba[2]),
            purity_cut,
            not args.interactive,
        )
    # interactive mode: ask for the cut if none has been chosen so far
    if args.interactive:
        while proba < 0 or proba > 1:
            try:
                proba = float(
                    input("Enter the probability cut for kaon (between 0 and 1): ")
                )
            except ValueError:
                print("Please provide a number.")
    # apply probabilty cuts
    print(f"\nApplying probability cuts for kaon: {proba}")
    validate.xgb_preds(proba)
    # graphs
    validate.confusion_matrix_and_stats()
    print("Generating plots...")
    validate.generate_plots()
    # save validated dataset
    validate.save_df()
class ValidateBinaryModel(ml_pid_cbm.validate_model.ValidateModel):
 19class ValidateBinaryModel(ValidateModel):
 20    """
 21    Class for testing the ml model
 22    """
 23
 24    def __init__(
 25        self,
 26        lower_p_cut: float,
 27        upper_p_cut: float,
 28        anti_particles: bool,
 29        json_file_name: str,
 30        particles_df: pd.DataFrame,
 31    ):
 32        super().__init__(
 33            lower_p_cut, upper_p_cut, anti_particles, json_file_name, particles_df
 34        )
 35        self.classes_names = ["kaons", "bckgr"]
 36
 37    def get_n_classes(self):
 38        return len(self.classes_names)
 39
 40    def xgb_preds(self, proba: float):
 41        """Gets particle type as selected by xgboost model if above probability threshold.
 42
 43        Args:
 44            proba(float): Probablity threshold to classify particle as signal.
 45        """
 46        df = self.particles_df
 47        df["xgb_preds"] = 0
 48        # setting to bckgr if smaller than probability threshold
 49        particle = df["model_output"] < proba
 50        df.loc[~(particle), "xgb_preds"] = 1
 51
 52        self.particles_df = df
 53
 54    def remap_names(self):
 55        """
 56        Remaps Pid of particles to output format from XGBoost Model.
 57        Kaons: 0; Other: 1
 58
 59        """
 60        df = self.particles_df
 61        if self.anti_particles:
 62            df[self.pid_variable_name] = (
 63                df[self.pid_variable_name]
 64                .map(
 65                    defaultdict(
 66                        lambda: 1.0,
 67                        {
 68                            Pid.NEG_KAON.value: 0.0,
 69                        },
 70                    ),
 71                    na_action="ignore",
 72                )
 73                .astype(float)
 74            )
 75        else:
 76            df[self.pid_variable_name] = (
 77                df[self.pid_variable_name]
 78                .map(
 79                    defaultdict(
 80                        lambda: 1.0,
 81                        {
 82                            Pid.POS_KAON.value: 0.0,
 83                        },
 84                    ),
 85                    na_action="ignore",
 86                )
 87                .astype(float)
 88            )
 89        self.particles_df = df
 90
 91    def evaluate_probas(
 92        self,
 93        start: float = 0.3,
 94        stop: float = 0.98,
 95        n_steps: int = 30,
 96        purity_cut: float = 0.0,
 97        save_fig: bool = True,
 98    ) -> Tuple[float, float, float]:
 99        """Method for evaluating probability (BDT) cut effect on efficency and purity.
100
101        Args:
102            start (float, optional): Lower range of probablity cuts. Defaults to 0.3.
103            stop (float, optional): Upper range of probablity cuts. Defaults to 0.98.
104            n_steps (int, optional): Number of probability cuts to try. Defaults to 30.
105            pid_variable_name (str, optional): Name of the variable containing true Pid. Defaults to "Complex_pid".
106            purity_cut (float, optional): Minimal purity for automatic cuts selection. Defaults to 0..
107            save_fig (bool, optional): Saves figures (BDT cut vs efficiency and purity) to file if True. Defaults to True.
108
109        Returns:
110            Tuple[float, float, float]: Probability cut for each variable.
111        """
112        print(
113            f"Checking efficiency and purity for {int(n_steps)} probablity cuts between {start}, and {stop}..."
114        )
115        probas = np.linspace(start, stop, n_steps)
116        efficiencies = []
117        purities = []
118        best_cut = 0.0
119        max_efficiency = 0.0
120        max_purity = 0.0
121
122        for proba in probas:
123            self.xgb_preds(proba)
124            # confusion matrix
125            cnf_matrix = confusion_matrix(
126                self.particles_df[self.pid_variable_name],
127                self.particles_df["xgb_preds"],
128            )
129            pid = 0
130            efficiency, purity = self.efficiency_stats(
131                cnf_matrix, pid, print_output=False
132            )
133            efficiencies.append(efficiency)
134            purities.append(purity)
135            if purity_cut > 0.0:
136                # Minimal purity for automatic threshold selection.
137                # Will choose the highest efficiency for purity above this value.
138                if purity >= purity_cut:
139                    if efficiency > max_efficiency:
140                        best_cut = proba
141                        max_efficiency = efficiency
142                        max_purity = purity
143                # If max purity is below this value, will choose the highest purity available.
144                else:
145                    if purity > max_purity:
146                        best_cut = proba
147                        max_efficiency = efficiency
148                        max_purity = purity
149
150        plotting_tools.plot_efficiency_purity(
151            probas, [efficiencies], [purities], save_fig, particle_names=["kaons"]
152        )
153        if save_fig:
154            print("Plots ready!")
155        if purity_cut > 0:
156            print(f"Selected probaility cuts: {best_cut}")
157            return best_cut
158        else:
159            return -1.0
160
161    def confusion_matrix_and_stats(
162        self, efficiency_filename: str = "efficiency_stats.txt", save_fig: bool = True
163    ):
164        """
165        Generates confusion matrix and efficiency/purity stats.
166        """
167        cnf_matrix = confusion_matrix(
168            self.particles_df[self.pid_variable_name], self.particles_df["xgb_preds"]
169        )
170        plotting_tools.plot_confusion_matrix(
171            cnf_matrix, classes=self.classes_names, save_fig=save_fig
172        )
173        plotting_tools.plot_confusion_matrix(
174            cnf_matrix, classes=self.classes_names, normalize=True, save_fig=save_fig
175        )
176        txt_file = open(efficiency_filename, "w+")
177        for pid in range(self.get_n_classes() - 1):
178            self.efficiency_stats(cnf_matrix, pid, txt_file)
179        txt_file.close()
180
181    def generate_plots(self):
182        """
183        Generate tof, mass2, vars, and pT-rapidity plots
184        """
185        self._tof_plots()
186        self._mass2_plots()
187        self._vars_distributions_plots()
188
189    def _tof_plots(self):
190        """
191        Generates tof plots.
192        """
193        for pid, particle_name in enumerate(self.classes_names):
194            # simulated:
195            try:
196                plotting_tools.tof_plot(
197                    self.particles_df[self.particles_df[self.pid_variable_name] == pid],
198                    self.json_file_name,
199                    f"{particle_name} (all simulated)",
200                )
201            except ValueError:
202                print(f"No simulated {particle_name}s")
203            # xgb selected
204            try:
205                plotting_tools.tof_plot(
206                    self.particles_df[self.particles_df["xgb_preds"] == pid],
207                    self.json_file_name,
208                    f"{particle_name} (XGB-selected)",
209                )
210            except ValueError:
211                print(f"No XGB-selected {particle_name}s")
212
213    def _mass2_plots(self):
214        """
215        Generates mass2 plots.
216        """
217        kaons_range = (-0.2, 0.6)
218        for pid, particle_name in enumerate(self.classes_names):
219            plotting_tools.plot_mass2(
220                self.particles_df[self.particles_df["xgb_preds"] == pid][
221                    self.mass2_variable_name
222                ],
223                self.particles_df[self.particles_df[self.pid_variable_name] == pid][
224                    self.mass2_variable_name
225                ],
226                particle_name,
227                kaons_range,
228            )
229            plotting_tools.plot_all_particles_mass2(
230                self.particles_df[self.particles_df["xgb_preds"] == pid],
231                self.mass2_variable_name,
232                self.pid_variable_name,
233                particle_name,
234                kaons_range,
235            )
236
237    def _vars_distributions_plots(self):
238        """
239        Generates distributions of variables and pT-rapidity graphs.
240        """
241        vars_to_draw = json_tools.load_vars_to_draw(self.json_file_name)
242        for pid, particle_name in enumerate(self.classes_names):
243            plotting_tools.var_distributions_plot(
244                vars_to_draw,
245                [
246                    self.particles_df[
247                        (self.particles_df[self.pid_variable_name] == pid)
248                    ],
249                    self.particles_df[
250                        (
251                            (self.particles_df[self.pid_variable_name] == pid)
252                            & (self.particles_df["xgb_preds"] == pid)
253                        )
254                    ],
255                    self.particles_df[
256                        (
257                            (self.particles_df[self.pid_variable_name] != pid)
258                            & (self.particles_df["xgb_preds"] == pid)
259                        )
260                    ],
261                ],
262                [
263                    f"true MC {particle_name}",
264                    f"true selected {particle_name}",
265                    f"false selected {particle_name}",
266                ],
267                filename=f"vars_dist_{particle_name}",
268            )
269            plotting_tools.plot_eff_pT_rap(self.particles_df, pid)
270            plotting_tools.plot_pt_rapidity(self.particles_df, pid)

Class for testing the ml model

ValidateBinaryModel( lower_p_cut: float, upper_p_cut: float, anti_particles: bool, json_file_name: str, particles_df: pandas.core.frame.DataFrame)
    def __init__(
        self,
        lower_p_cut: float,
        upper_p_cut: float,
        anti_particles: bool,
        json_file_name: str,
        particles_df: pd.DataFrame,
    ):
        """Creates the validator for the binary model.

        All arguments are forwarded unchanged, in this order, to the parent
        class constructor; their exact meaning is defined there.

        Args:
            lower_p_cut (float): Presumably the lower momentum cut of the model - confirm in the parent class.
            upper_p_cut (float): Presumably the upper momentum cut of the model - confirm in the parent class.
            anti_particles (bool): Presumably selects the anti-particle model - confirm in the parent class.
            json_file_name (str): Config file name handed to the parent class.
            particles_df (pd.DataFrame): Data frame handed to the parent class.
        """
        super().__init__(
            lower_p_cut, upper_p_cut, anti_particles, json_file_name, particles_df
        )
        # Order defines the class index: 0 -> "kaons", 1 -> "bckgr".
        self.classes_names = ["kaons", "bckgr"]
def get_n_classes(self):
37    def get_n_classes(self):
38        return len(self.classes_names)
def xgb_preds(self, proba: float):
40    def xgb_preds(self, proba: float):
41        """Gets particle type as selected by xgboost model if above probability threshold.
42
43        Args:
44            proba(float): Probablity threshold to classify particle as signal.
45        """
46        df = self.particles_df
47        df["xgb_preds"] = 0
48        # setting to bckgr if smaller than probability threshold
49        particle = df["model_output"] < proba
50        df.loc[~(particle), "xgb_preds"] = 1
51
52        self.particles_df = df

Gets particle type as selected by the xgboost model: particles whose model output is below the probability threshold are labelled as kaons (0), all others as background (1).

Args: proba (float): Probability threshold; a model output below it classifies the particle as signal.

def remap_names(self):
54    def remap_names(self):
55        """
56        Remaps Pid of particles to output format from XGBoost Model.
57        Kaons: 0; Other: 1
58
59        """
60        df = self.particles_df
61        if self.anti_particles:
62            df[self.pid_variable_name] = (
63                df[self.pid_variable_name]
64                .map(
65                    defaultdict(
66                        lambda: 1.0,
67                        {
68                            Pid.NEG_KAON.value: 0.0,
69                        },
70                    ),
71                    na_action="ignore",
72                )
73                .astype(float)
74            )
75        else:
76            df[self.pid_variable_name] = (
77                df[self.pid_variable_name]
78                .map(
79                    defaultdict(
80                        lambda: 1.0,
81                        {
82                            Pid.POS_KAON.value: 0.0,
83                        },
84                    ),
85                    na_action="ignore",
86                )
87                .astype(float)
88            )
89        self.particles_df = df

Remaps Pid of particles to output format from XGBoost Model. Kaons: 0; Other: 1

def evaluate_probas( self, start: float = 0.3, stop: float = 0.98, n_steps: int = 30, purity_cut: float = 0.0, save_fig: bool = True) -> Tuple[float, float, float]:
 91    def evaluate_probas(
 92        self,
 93        start: float = 0.3,
 94        stop: float = 0.98,
 95        n_steps: int = 30,
 96        purity_cut: float = 0.0,
 97        save_fig: bool = True,
 98    ) -> Tuple[float, float, float]:
 99        """Method for evaluating probability (BDT) cut effect on efficency and purity.
100
101        Args:
102            start (float, optional): Lower range of probablity cuts. Defaults to 0.3.
103            stop (float, optional): Upper range of probablity cuts. Defaults to 0.98.
104            n_steps (int, optional): Number of probability cuts to try. Defaults to 30.
105            pid_variable_name (str, optional): Name of the variable containing true Pid. Defaults to "Complex_pid".
106            purity_cut (float, optional): Minimal purity for automatic cuts selection. Defaults to 0..
107            save_fig (bool, optional): Saves figures (BDT cut vs efficiency and purity) to file if True. Defaults to True.
108
109        Returns:
110            Tuple[float, float, float]: Probability cut for each variable.
111        """
112        print(
113            f"Checking efficiency and purity for {int(n_steps)} probablity cuts between {start}, and {stop}..."
114        )
115        probas = np.linspace(start, stop, n_steps)
116        efficiencies = []
117        purities = []
118        best_cut = 0.0
119        max_efficiency = 0.0
120        max_purity = 0.0
121
122        for proba in probas:
123            self.xgb_preds(proba)
124            # confusion matrix
125            cnf_matrix = confusion_matrix(
126                self.particles_df[self.pid_variable_name],
127                self.particles_df["xgb_preds"],
128            )
129            pid = 0
130            efficiency, purity = self.efficiency_stats(
131                cnf_matrix, pid, print_output=False
132            )
133            efficiencies.append(efficiency)
134            purities.append(purity)
135            if purity_cut > 0.0:
136                # Minimal purity for automatic threshold selection.
137                # Will choose the highest efficiency for purity above this value.
138                if purity >= purity_cut:
139                    if efficiency > max_efficiency:
140                        best_cut = proba
141                        max_efficiency = efficiency
142                        max_purity = purity
143                # If max purity is below this value, will choose the highest purity available.
144                else:
145                    if purity > max_purity:
146                        best_cut = proba
147                        max_efficiency = efficiency
148                        max_purity = purity
149
150        plotting_tools.plot_efficiency_purity(
151            probas, [efficiencies], [purities], save_fig, particle_names=["kaons"]
152        )
153        if save_fig:
154            print("Plots ready!")
155        if purity_cut > 0:
156            print(f"Selected probaility cuts: {best_cut}")
157            return best_cut
158        else:
159            return -1.0

Method for evaluating the effect of the probability (BDT) cut on efficiency and purity.

Args: start (float, optional): Lower range of probability cuts. Defaults to 0.3. stop (float, optional): Upper range of probability cuts. Defaults to 0.98. n_steps (int, optional): Number of probability cuts to try. Defaults to 30. pid_variable_name (str, optional): Name of the variable containing true Pid. Defaults to "Complex_pid". purity_cut (float, optional): Minimal purity for automatic cuts selection. Defaults to 0. save_fig (bool, optional): Saves figures (BDT cut vs efficiency and purity) to file if True. Defaults to True.

Returns: float: The selected probability cut, or -1.0 when no positive purity_cut is given.

def confusion_matrix_and_stats( self, efficiency_filename: str = 'efficiency_stats.txt', save_fig: bool = True):
161    def confusion_matrix_and_stats(
162        self, efficiency_filename: str = "efficiency_stats.txt", save_fig: bool = True
163    ):
164        """
165        Generates confusion matrix and efficiency/purity stats.
166        """
167        cnf_matrix = confusion_matrix(
168            self.particles_df[self.pid_variable_name], self.particles_df["xgb_preds"]
169        )
170        plotting_tools.plot_confusion_matrix(
171            cnf_matrix, classes=self.classes_names, save_fig=save_fig
172        )
173        plotting_tools.plot_confusion_matrix(
174            cnf_matrix, classes=self.classes_names, normalize=True, save_fig=save_fig
175        )
176        txt_file = open(efficiency_filename, "w+")
177        for pid in range(self.get_n_classes() - 1):
178            self.efficiency_stats(cnf_matrix, pid, txt_file)
179        txt_file.close()

Generates confusion matrix and efficiency/purity stats.

def generate_plots(self):
181    def generate_plots(self):
182        """
183        Generate tof, mass2, vars, and pT-rapidity plots
184        """
185        self._tof_plots()
186        self._mass2_plots()
187        self._vars_distributions_plots()

Generate tof, mass2, vars, and pT-rapidity plots

def parse_args(args: List[str]) -> argparse.Namespace:
def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Builds the command line parser of the binary validation script and runs it.

    Exactly one of --probabilitycuts / --evaluateproba has to be given;
    --interactive and --automatic exclude each other. Options declared with
    ``nargs`` are returned as lists.

    Args:
        args (List[str]): Arguments from the command line, should be sys.argv[1:].

    Returns:
        argparse.Namespace: Namespace containing the parsed arguments.
    """
    cli = argparse.ArgumentParser(
        prog="ML_PID_CBM ValidateBinaryModel",
        description="Program for validating Binary PID ML models",
    )
    # mandatory inputs
    cli.add_argument(
        "--config",
        "-c",
        nargs=1,
        required=True,
        type=str,
        help="Filename of path of config json file.",
    )
    cli.add_argument(
        "--modelname",
        "-m",
        nargs=1,
        required=True,
        type=str,
        help="Name of folder containing trained ml model.",
    )
    # source of the probability cut: fixed value or a scan
    cut_source = cli.add_mutually_exclusive_group(required=True)
    cut_source.add_argument(
        "--probabilitycuts",
        "-p",
        nargs=1,
        type=float,
        help="Probability cut value e.g., 0.2",
    )
    cut_source.add_argument(
        "--evaluateproba",
        "-e",
        nargs=3,
        type=float,
        help="""Minimal probability cut, maximal, and number of steps to investigate. 
        Contrary to multi model, the lower the value of probability, the higher probability of signal""",
    )
    cli.add_argument(
        "--nworkers",
        "-n",
        type=int,
        default=1,
        help="Max number of workers for ThreadPoolExecutor which reads Root tree with data.",
    )
    # how the final cut is picked after a scan
    selection_mode = cli.add_mutually_exclusive_group()
    selection_mode.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interactive mode allows selection of probability cuts after evaluating them.",
    )
    selection_mode.add_argument(
        "--automatic",
        "-a",
        nargs=1,
        type=float,
        help="""Minimal purity for automatic threshold selection (in percent) e.g., 90.
        Will choose the highest efficiency for purity above this value.
        If max purity is below this value, will choose the highest purity available.""",
    )
    parsed = cli.parse_args(args)
    return parsed

Arguments parser for the main method.

Args: args (List[str]): Arguments from the command line, should be sys.argv[1:].

Returns: argparse.Namespace: Namespace containing the parsed arguments.