# ml_pid_cbm.binary.validate_binary_model
"""Validation of a trained binary (kaon vs background) PID ML model."""
import argparse
import os
import sys
from collections import defaultdict
from typing import List

import numpy as np
import pandas as pd
from hipe4ml.model_handler import ModelHandler
from sklearn.metrics import confusion_matrix

from ml_pid_cbm.tools import json_tools, plotting_tools
from ml_pid_cbm.tools.load_data import LoadData
from ml_pid_cbm.tools.particles_id import ParticlesId as Pid
from ml_pid_cbm.validate_model import ValidateModel


class ValidateBinaryModel(ValidateModel):
    """
    Validates a trained binary PID model (signal = kaons, background = rest).

    Class labels follow the XGBoost output convention: 0 = kaons, 1 = bckgr.
    """

    def __init__(
        self,
        lower_p_cut: float,
        upper_p_cut: float,
        anti_particles: bool,
        json_file_name: str,
        particles_df: pd.DataFrame,
    ):
        super().__init__(
            lower_p_cut, upper_p_cut, anti_particles, json_file_name, particles_df
        )
        # class 0 = signal (kaons), class 1 = background
        self.classes_names = ["kaons", "bckgr"]

    def get_n_classes(self) -> int:
        """Returns the number of classes (always 2 for the binary model)."""
        return len(self.classes_names)

    def xgb_preds(self, proba: float):
        """Labels each candidate with the class predicted by the XGBoost model.

        NOTE: contrary to the multiclass model, a LOWER model output means a
        more signal-like candidate: rows with model_output < proba are labelled
        kaons (0); all other rows (including NaN outputs) become background (1).

        Args:
            proba (float): Probability threshold to classify a candidate as signal.
        """
        df = self.particles_df
        df["xgb_preds"] = 0
        # signal only where the output is strictly below the threshold;
        # NaN comparisons are False, so NaN outputs end up as background
        is_signal = df["model_output"] < proba
        df.loc[~is_signal, "xgb_preds"] = 1
        self.particles_df = df

    def remap_names(self):
        """
        Remaps the true Pid of particles to the XGBoost output format.
        Kaons: 0; everything else: 1.
        """
        # negative kaons are the signal for the anti-particle dataset
        kaon_pid = Pid.NEG_KAON.value if self.anti_particles else Pid.POS_KAON.value
        df = self.particles_df
        df[self.pid_variable_name] = (
            df[self.pid_variable_name]
            .map(
                # defaultdict maps the kaon pid to 0.0 and every other pid to 1.0
                defaultdict(lambda: 1.0, {kaon_pid: 0.0}),
                na_action="ignore",
            )
            .astype(float)
        )
        self.particles_df = df

    def evaluate_probas(
        self,
        start: float = 0.3,
        stop: float = 0.98,
        n_steps: int = 30,
        purity_cut: float = 0.0,
        save_fig: bool = True,
    ) -> float:
        """Evaluates the effect of the probability (BDT) cut on efficiency and purity.

        Args:
            start (float, optional): Lower range of probability cuts. Defaults to 0.3.
            stop (float, optional): Upper range of probability cuts. Defaults to 0.98.
            n_steps (int, optional): Number of probability cuts to try. Defaults to 30.
            purity_cut (float, optional): Minimal purity (in percent) for automatic
                cut selection; 0 disables automatic selection. Defaults to 0.0.
            save_fig (bool, optional): Saves figures (BDT cut vs efficiency and
                purity) to file if True. Defaults to True.

        Returns:
            float: Automatically selected probability cut, or -1.0 when
                purity_cut == 0 (no automatic selection requested).
        """
        print(
            f"Checking efficiency and purity for {int(n_steps)} probablity cuts between {start}, and {stop}..."
        )
        probas = np.linspace(start, stop, n_steps)
        efficiencies = []
        purities = []
        best_cut = 0.0
        max_efficiency = 0.0
        max_purity = 0.0
        # True once at least one cut satisfies the purity requirement
        qualifying_found = False

        for proba in probas:
            self.xgb_preds(proba)
            cnf_matrix = confusion_matrix(
                self.particles_df[self.pid_variable_name],
                self.particles_df["xgb_preds"],
            )
            pid = 0  # stats are computed for the signal (kaon) class only
            efficiency, purity = self.efficiency_stats(
                cnf_matrix, pid, print_output=False
            )
            efficiencies.append(efficiency)
            purities.append(purity)
            if purity_cut > 0.0:
                if purity >= purity_cut:
                    # Highest efficiency among cuts meeting the purity requirement.
                    # The first qualifying cut always replaces any fallback choice.
                    if not qualifying_found or efficiency > max_efficiency:
                        qualifying_found = True
                        best_cut = proba
                        max_efficiency = efficiency
                        max_purity = purity
                elif not qualifying_found and purity > max_purity:
                    # No cut meets the requirement yet: track the purest one.
                    best_cut = proba
                    max_efficiency = efficiency
                    max_purity = purity

        plotting_tools.plot_efficiency_purity(
            probas, [efficiencies], [purities], save_fig, particle_names=["kaons"]
        )
        if save_fig:
            print("Plots ready!")
        if purity_cut > 0:
            print(f"Selected probaility cuts: {best_cut}")
            return best_cut
        return -1.0

    def confusion_matrix_and_stats(
        self, efficiency_filename: str = "efficiency_stats.txt", save_fig: bool = True
    ):
        """
        Generates confusion matrix plots and writes efficiency/purity stats.

        Args:
            efficiency_filename (str, optional): Output text file for the stats.
            save_fig (bool, optional): Saves the matrix plots to file if True.
        """
        cnf_matrix = confusion_matrix(
            self.particles_df[self.pid_variable_name], self.particles_df["xgb_preds"]
        )
        plotting_tools.plot_confusion_matrix(
            cnf_matrix, classes=self.classes_names, save_fig=save_fig
        )
        plotting_tools.plot_confusion_matrix(
            cnf_matrix, classes=self.classes_names, normalize=True, save_fig=save_fig
        )
        # context manager guarantees the file is closed even if stats raise
        with open(efficiency_filename, "w+") as txt_file:
            # only the signal class (skips the last, background, class)
            for pid in range(self.get_n_classes() - 1):
                self.efficiency_stats(cnf_matrix, pid, txt_file)

    def generate_plots(self):
        """
        Generate tof, mass2, vars, and pT-rapidity plots.
        """
        self._tof_plots()
        self._mass2_plots()
        self._vars_distributions_plots()

    def _tof_plots(self):
        """
        Generates tof plots for simulated and XGB-selected candidates.
        """
        for pid, particle_name in enumerate(self.classes_names):
            # all simulated candidates of this class
            try:
                plotting_tools.tof_plot(
                    self.particles_df[self.particles_df[self.pid_variable_name] == pid],
                    self.json_file_name,
                    f"{particle_name} (all simulated)",
                )
            except ValueError:
                # empty selection: nothing to plot
                print(f"No simulated {particle_name}s")
            # candidates the model assigned to this class
            try:
                plotting_tools.tof_plot(
                    self.particles_df[self.particles_df["xgb_preds"] == pid],
                    self.json_file_name,
                    f"{particle_name} (XGB-selected)",
                )
            except ValueError:
                print(f"No XGB-selected {particle_name}s")

    def _mass2_plots(self):
        """
        Generates mass2 plots for each class.
        """
        # plotting range tuned for the kaon mass2 region
        kaons_range = (-0.2, 0.6)
        for pid, particle_name in enumerate(self.classes_names):
            plotting_tools.plot_mass2(
                self.particles_df[self.particles_df["xgb_preds"] == pid][
                    self.mass2_variable_name
                ],
                self.particles_df[self.particles_df[self.pid_variable_name] == pid][
                    self.mass2_variable_name
                ],
                particle_name,
                kaons_range,
            )
            plotting_tools.plot_all_particles_mass2(
                self.particles_df[self.particles_df["xgb_preds"] == pid],
                self.mass2_variable_name,
                self.pid_variable_name,
                particle_name,
                kaons_range,
            )

    def _vars_distributions_plots(self):
        """
        Generates distributions of variables and pT-rapidity graphs.
        """
        vars_to_draw = json_tools.load_vars_to_draw(self.json_file_name)
        for pid, particle_name in enumerate(self.classes_names):
            true_mask = self.particles_df[self.pid_variable_name] == pid
            selected_mask = self.particles_df["xgb_preds"] == pid
            plotting_tools.var_distributions_plot(
                vars_to_draw,
                [
                    self.particles_df[true_mask],
                    self.particles_df[true_mask & selected_mask],
                    self.particles_df[~true_mask & selected_mask],
                ],
                [
                    f"true MC {particle_name}",
                    f"true selected {particle_name}",
                    f"false selected {particle_name}",
                ],
                filename=f"vars_dist_{particle_name}",
            )
            plotting_tools.plot_eff_pT_rap(self.particles_df, pid)
            plotting_tools.plot_pt_rapidity(self.particles_df, pid)


def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Arguments parser for the main method.

    Args:
        args (List[str]): Arguments from the command line, should be sys.argv[1:].

    Returns:
        argparse.Namespace: argparse.Namespace containing parsed args.
    """
    parser = argparse.ArgumentParser(
        prog="ML_PID_CBM ValidateBinaryModel",
        description="Program for validating Binary PID ML models",
    )
    parser.add_argument(
        "--config",
        "-c",
        nargs=1,
        required=True,
        type=str,
        help="Filename of path of config json file.",
    )
    parser.add_argument(
        "--modelname",
        "-m",
        nargs=1,
        required=True,
        type=str,
        help="Name of folder containing trained ml model.",
    )
    # exactly one way of choosing the probability cut must be given
    proba_group = parser.add_mutually_exclusive_group(required=True)
    proba_group.add_argument(
        "--probabilitycuts",
        "-p",
        nargs=1,
        type=float,
        help="Probability cut value e.g., 0.2",
    )
    proba_group.add_argument(
        "--evaluateproba",
        "-e",
        nargs=3,
        type=float,
        help="""Minimal probability cut, maximal, and number of steps to investigate.
        Contrary to multi model, the lower the value of probability, the higher probability of signal""",
    )
    parser.add_argument(
        "--nworkers",
        "-n",
        type=int,
        default=1,
        help="Max number of workers for ThreadPoolExecutor which reads Root tree with data.",
    )
    decision_group = parser.add_mutually_exclusive_group()
    decision_group.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interactive mode allows selection of probability cuts after evaluating them.",
    )
    decision_group.add_argument(
        "--automatic",
        "-a",
        nargs=1,
        type=float,
        help="""Minimal purity for automatic threshold selection (in percent) e.g., 90.
        Will choose the highest efficiency for purity above this value.
        If max purity is below this value, will choose the highest purity available.""",
    )
    return parser.parse_args(args)


if __name__ == "__main__":
    # parser for main class
    args = parse_args(sys.argv[1:])
    # config arguments to be loaded from args
    json_file_name = args.config[0]
    model_name = args.modelname[0]
    # BUG FIX: the -p value was previously stored in `proba_proton`, leaving
    # `proba` undefined (NameError) whenever -p was used instead of -e.
    proba = args.probabilitycuts[0] if args.probabilitycuts is not None else -1.0
    n_workers = args.nworkers
    purity_cut = args.automatic[0] if args.automatic is not None else 0.0
    lower_p, upper_p, is_anti = ValidateModel.parse_model_name(model_name)
    # loading test data
    data_file_name = json_tools.load_file_name(json_file_name, "test")

    loader = LoadData(data_file_name, json_file_name, lower_p, upper_p, is_anti)
    # loading model handler and applying on dataset
    print(
        f"\nLoading data from {data_file_name}\nApplying model handler from {model_name}"
    )
    os.chdir(f"{model_name}")
    model_hdl = ModelHandler()
    model_hdl.load_model_handler(model_name)
    test_particles = loader.load_tree(model_handler=model_hdl, max_workers=n_workers)
    # validate model object
    validate = ValidateBinaryModel(
        lower_p, upper_p, is_anti, json_file_name, test_particles.get_data_frame()
    )
    # remap Pid to match output XGBoost format
    validate.remap_names()
    pid_variable_name = json_tools.load_var_name(json_file_name, "pid")
    # set probability cut by scanning a range when -e was given
    if args.evaluateproba is not None:
        proba = validate.evaluate_probas(
            args.evaluateproba[0],
            args.evaluateproba[1],
            int(args.evaluateproba[2]),
            purity_cut,
            not args.interactive,
        )
    # apply probability cut (from -p directly, or from the -e scan above)
    print(f"\nApplying probability cuts for kaon: {proba}")
    validate.xgb_preds(proba)
    # graphs
    validate.confusion_matrix_and_stats()
    print("Generating plots...")
    validate.generate_plots()
    # save validated dataset
    validate.save_df()
class ValidateBinaryModel(ValidateModel):
    """
    Validates a trained binary PID model (signal = kaons, background = rest).

    Class labels follow the XGBoost output convention: 0 = kaons, 1 = bckgr.
    """

    def __init__(
        self,
        lower_p_cut: float,
        upper_p_cut: float,
        anti_particles: bool,
        json_file_name: str,
        particles_df: pd.DataFrame,
    ):
        super().__init__(
            lower_p_cut, upper_p_cut, anti_particles, json_file_name, particles_df
        )
        # class 0 = signal (kaons), class 1 = background
        self.classes_names = ["kaons", "bckgr"]

    def get_n_classes(self) -> int:
        """Returns the number of classes (always 2 for the binary model)."""
        return len(self.classes_names)

    def xgb_preds(self, proba: float):
        """Labels each candidate with the class predicted by the XGBoost model.

        NOTE: contrary to the multiclass model, a LOWER model output means a
        more signal-like candidate: rows with model_output < proba are labelled
        kaons (0); all other rows (including NaN outputs) become background (1).

        Args:
            proba (float): Probability threshold to classify a candidate as signal.
        """
        df = self.particles_df
        df["xgb_preds"] = 0
        # signal only where the output is strictly below the threshold;
        # NaN comparisons are False, so NaN outputs end up as background
        is_signal = df["model_output"] < proba
        df.loc[~is_signal, "xgb_preds"] = 1
        self.particles_df = df

    def remap_names(self):
        """
        Remaps the true Pid of particles to the XGBoost output format.
        Kaons: 0; everything else: 1.
        """
        # negative kaons are the signal for the anti-particle dataset
        kaon_pid = Pid.NEG_KAON.value if self.anti_particles else Pid.POS_KAON.value
        df = self.particles_df
        df[self.pid_variable_name] = (
            df[self.pid_variable_name]
            .map(
                # defaultdict maps the kaon pid to 0.0 and every other pid to 1.0
                defaultdict(lambda: 1.0, {kaon_pid: 0.0}),
                na_action="ignore",
            )
            .astype(float)
        )
        self.particles_df = df

    def evaluate_probas(
        self,
        start: float = 0.3,
        stop: float = 0.98,
        n_steps: int = 30,
        purity_cut: float = 0.0,
        save_fig: bool = True,
    ) -> float:
        """Evaluates the effect of the probability (BDT) cut on efficiency and purity.

        Args:
            start (float, optional): Lower range of probability cuts. Defaults to 0.3.
            stop (float, optional): Upper range of probability cuts. Defaults to 0.98.
            n_steps (int, optional): Number of probability cuts to try. Defaults to 30.
            purity_cut (float, optional): Minimal purity (in percent) for automatic
                cut selection; 0 disables automatic selection. Defaults to 0.0.
            save_fig (bool, optional): Saves figures (BDT cut vs efficiency and
                purity) to file if True. Defaults to True.

        Returns:
            float: Automatically selected probability cut, or -1.0 when
                purity_cut == 0 (no automatic selection requested).
        """
        print(
            f"Checking efficiency and purity for {int(n_steps)} probablity cuts between {start}, and {stop}..."
        )
        probas = np.linspace(start, stop, n_steps)
        efficiencies = []
        purities = []
        best_cut = 0.0
        max_efficiency = 0.0
        max_purity = 0.0
        # True once at least one cut satisfies the purity requirement
        qualifying_found = False

        for proba in probas:
            self.xgb_preds(proba)
            cnf_matrix = confusion_matrix(
                self.particles_df[self.pid_variable_name],
                self.particles_df["xgb_preds"],
            )
            pid = 0  # stats are computed for the signal (kaon) class only
            efficiency, purity = self.efficiency_stats(
                cnf_matrix, pid, print_output=False
            )
            efficiencies.append(efficiency)
            purities.append(purity)
            if purity_cut > 0.0:
                if purity >= purity_cut:
                    # Highest efficiency among cuts meeting the purity requirement.
                    # The first qualifying cut always replaces any fallback choice.
                    if not qualifying_found or efficiency > max_efficiency:
                        qualifying_found = True
                        best_cut = proba
                        max_efficiency = efficiency
                        max_purity = purity
                elif not qualifying_found and purity > max_purity:
                    # No cut meets the requirement yet: track the purest one.
                    best_cut = proba
                    max_efficiency = efficiency
                    max_purity = purity

        plotting_tools.plot_efficiency_purity(
            probas, [efficiencies], [purities], save_fig, particle_names=["kaons"]
        )
        if save_fig:
            print("Plots ready!")
        if purity_cut > 0:
            print(f"Selected probaility cuts: {best_cut}")
            return best_cut
        return -1.0

    def confusion_matrix_and_stats(
        self, efficiency_filename: str = "efficiency_stats.txt", save_fig: bool = True
    ):
        """
        Generates confusion matrix plots and writes efficiency/purity stats.

        Args:
            efficiency_filename (str, optional): Output text file for the stats.
            save_fig (bool, optional): Saves the matrix plots to file if True.
        """
        cnf_matrix = confusion_matrix(
            self.particles_df[self.pid_variable_name], self.particles_df["xgb_preds"]
        )
        plotting_tools.plot_confusion_matrix(
            cnf_matrix, classes=self.classes_names, save_fig=save_fig
        )
        plotting_tools.plot_confusion_matrix(
            cnf_matrix, classes=self.classes_names, normalize=True, save_fig=save_fig
        )
        # context manager guarantees the file is closed even if stats raise
        with open(efficiency_filename, "w+") as txt_file:
            # only the signal class (skips the last, background, class)
            for pid in range(self.get_n_classes() - 1):
                self.efficiency_stats(cnf_matrix, pid, txt_file)

    def generate_plots(self):
        """
        Generate tof, mass2, vars, and pT-rapidity plots.
        """
        self._tof_plots()
        self._mass2_plots()
        self._vars_distributions_plots()

    def _tof_plots(self):
        """
        Generates tof plots for simulated and XGB-selected candidates.
        """
        for pid, particle_name in enumerate(self.classes_names):
            # all simulated candidates of this class
            try:
                plotting_tools.tof_plot(
                    self.particles_df[self.particles_df[self.pid_variable_name] == pid],
                    self.json_file_name,
                    f"{particle_name} (all simulated)",
                )
            except ValueError:
                # empty selection: nothing to plot
                print(f"No simulated {particle_name}s")
            # candidates the model assigned to this class
            try:
                plotting_tools.tof_plot(
                    self.particles_df[self.particles_df["xgb_preds"] == pid],
                    self.json_file_name,
                    f"{particle_name} (XGB-selected)",
                )
            except ValueError:
                print(f"No XGB-selected {particle_name}s")

    def _mass2_plots(self):
        """
        Generates mass2 plots for each class.
        """
        # plotting range tuned for the kaon mass2 region
        kaons_range = (-0.2, 0.6)
        for pid, particle_name in enumerate(self.classes_names):
            plotting_tools.plot_mass2(
                self.particles_df[self.particles_df["xgb_preds"] == pid][
                    self.mass2_variable_name
                ],
                self.particles_df[self.particles_df[self.pid_variable_name] == pid][
                    self.mass2_variable_name
                ],
                particle_name,
                kaons_range,
            )
            plotting_tools.plot_all_particles_mass2(
                self.particles_df[self.particles_df["xgb_preds"] == pid],
                self.mass2_variable_name,
                self.pid_variable_name,
                particle_name,
                kaons_range,
            )

    def _vars_distributions_plots(self):
        """
        Generates distributions of variables and pT-rapidity graphs.
        """
        vars_to_draw = json_tools.load_vars_to_draw(self.json_file_name)
        for pid, particle_name in enumerate(self.classes_names):
            true_mask = self.particles_df[self.pid_variable_name] == pid
            selected_mask = self.particles_df["xgb_preds"] == pid
            plotting_tools.var_distributions_plot(
                vars_to_draw,
                [
                    self.particles_df[true_mask],
                    self.particles_df[true_mask & selected_mask],
                    self.particles_df[~true_mask & selected_mask],
                ],
                [
                    f"true MC {particle_name}",
                    f"true selected {particle_name}",
                    f"false selected {particle_name}",
                ],
                filename=f"vars_dist_{particle_name}",
            )
            plotting_tools.plot_eff_pT_rap(self.particles_df, pid)
            plotting_tools.plot_pt_rapidity(self.particles_df, pid)
Class for testing the ml model
def __init__(
    self,
    lower_p_cut: float,
    upper_p_cut: float,
    anti_particles: bool,
    json_file_name: str,
    particles_df: pd.DataFrame,
):
    """Initializes the binary validator.

    Args:
        lower_p_cut (float): Lower momentum cut.
        upper_p_cut (float): Upper momentum cut.
        anti_particles (bool): True when validating the anti-particle dataset.
        json_file_name (str): Path of the config json file.
        particles_df (pd.DataFrame): Dataframe with candidates to validate.
    """
    # all configuration handling lives in the ValidateModel base class
    super().__init__(
        lower_p_cut, upper_p_cut, anti_particles, json_file_name, particles_df
    )
    # class 0 = signal (kaons), class 1 = background
    self.classes_names = ["kaons", "bckgr"]
def xgb_preds(self, proba: float):
    """Labels each candidate with the class predicted by the XGBoost model.

    NOTE: contrary to the multiclass model, a LOWER model output means a more
    signal-like candidate: rows with model_output < proba are labelled kaons
    (0); all other rows (including NaN outputs) become background (1).
    (The original docstring said "above probability threshold", which
    contradicted the implementation.)

    Args:
        proba (float): Probability threshold to classify a candidate as signal.
    """
    df = self.particles_df
    df["xgb_preds"] = 0
    # signal only where the output is strictly below the threshold;
    # NaN comparisons are False, so NaN outputs end up as background
    is_signal = df["model_output"] < proba
    df.loc[~is_signal, "xgb_preds"] = 1
    self.particles_df = df
Gets the particle type selected by the XGBoost model: outputs below the probability threshold are classified as signal (kaons), all others as background.
Args: proba (float): Probability threshold to classify a particle as signal.
def remap_names(self):
    """
    Remaps the true Pid of particles to the XGBoost output format.
    Kaons: 0; everything else: 1.

    The two original branches differed only in the kaon Pid constant, so they
    are collapsed into a single map with the constant selected up front.
    """
    # negative kaons are the signal for the anti-particle dataset
    kaon_pid = Pid.NEG_KAON.value if self.anti_particles else Pid.POS_KAON.value
    df = self.particles_df
    df[self.pid_variable_name] = (
        df[self.pid_variable_name]
        .map(
            # defaultdict maps the kaon pid to 0.0 and every other pid to 1.0;
            # na_action="ignore" leaves NaN entries untouched
            defaultdict(lambda: 1.0, {kaon_pid: 0.0}),
            na_action="ignore",
        )
        .astype(float)
    )
    self.particles_df = df
Remaps Pid of particles to output format from XGBoost Model. Kaons: 0; Other: 1
def evaluate_probas(
    self,
    start: float = 0.3,
    stop: float = 0.98,
    n_steps: int = 30,
    purity_cut: float = 0.0,
    save_fig: bool = True,
) -> float:
    """Evaluates the effect of the probability (BDT) cut on efficiency and purity.

    Fixes vs the original: the return annotation said Tuple[float, float, float]
    although a single float is returned, and the fallback (highest-purity) choice
    could block a later qualifying cut from being selected.

    Args:
        start (float, optional): Lower range of probability cuts. Defaults to 0.3.
        stop (float, optional): Upper range of probability cuts. Defaults to 0.98.
        n_steps (int, optional): Number of probability cuts to try. Defaults to 30.
        purity_cut (float, optional): Minimal purity (in percent) for automatic
            cut selection; 0 disables automatic selection. Defaults to 0.0.
        save_fig (bool, optional): Saves figures (BDT cut vs efficiency and
            purity) to file if True. Defaults to True.

    Returns:
        float: Automatically selected probability cut, or -1.0 when
            purity_cut == 0 (no automatic selection requested).
    """
    print(
        f"Checking efficiency and purity for {int(n_steps)} probablity cuts between {start}, and {stop}..."
    )
    probas = np.linspace(start, stop, n_steps)
    efficiencies = []
    purities = []
    best_cut = 0.0
    max_efficiency = 0.0
    max_purity = 0.0
    # True once at least one cut satisfies the purity requirement
    qualifying_found = False

    for proba in probas:
        self.xgb_preds(proba)
        # confusion matrix of true pid vs predicted class at this cut
        cnf_matrix = confusion_matrix(
            self.particles_df[self.pid_variable_name],
            self.particles_df["xgb_preds"],
        )
        pid = 0  # stats are computed for the signal (kaon) class only
        efficiency, purity = self.efficiency_stats(
            cnf_matrix, pid, print_output=False
        )
        efficiencies.append(efficiency)
        purities.append(purity)
        if purity_cut > 0.0:
            if purity >= purity_cut:
                # Highest efficiency among cuts meeting the purity requirement.
                # The first qualifying cut always replaces any fallback choice.
                if not qualifying_found or efficiency > max_efficiency:
                    qualifying_found = True
                    best_cut = proba
                    max_efficiency = efficiency
                    max_purity = purity
            elif not qualifying_found and purity > max_purity:
                # No cut meets the requirement yet: track the purest one.
                best_cut = proba
                max_efficiency = efficiency
                max_purity = purity

    plotting_tools.plot_efficiency_purity(
        probas, [efficiencies], [purities], save_fig, particle_names=["kaons"]
    )
    if save_fig:
        print("Plots ready!")
    if purity_cut > 0:
        print(f"Selected probaility cuts: {best_cut}")
        return best_cut
    return -1.0
Method for evaluating the probability (BDT) cut effect on efficiency and purity.
Args: start (float, optional): Lower range of probability cuts. Defaults to 0.3. stop (float, optional): Upper range of probability cuts. Defaults to 0.98. n_steps (int, optional): Number of probability cuts to try. Defaults to 30. purity_cut (float, optional): Minimal purity for automatic cut selection. Defaults to 0.0. save_fig (bool, optional): Saves figures (BDT cut vs efficiency and purity) to file if True. Defaults to True.
Returns: float: The selected probability cut (a single cut, since the binary model has only one signal class), or -1.0 when no automatic selection was requested.
def confusion_matrix_and_stats(
    self, efficiency_filename: str = "efficiency_stats.txt", save_fig: bool = True
):
    """
    Generates confusion matrix plots and writes efficiency/purity stats.

    Fix: the stats file is now opened via a context manager, so it is closed
    even when efficiency_stats raises (the original leaked the handle).

    Args:
        efficiency_filename (str, optional): Output text file for the stats.
            Defaults to "efficiency_stats.txt".
        save_fig (bool, optional): Saves the matrix plots to file if True.
    """
    cnf_matrix = confusion_matrix(
        self.particles_df[self.pid_variable_name], self.particles_df["xgb_preds"]
    )
    plotting_tools.plot_confusion_matrix(
        cnf_matrix, classes=self.classes_names, save_fig=save_fig
    )
    plotting_tools.plot_confusion_matrix(
        cnf_matrix, classes=self.classes_names, normalize=True, save_fig=save_fig
    )
    with open(efficiency_filename, "w+") as txt_file:
        # only the signal class (skips the last, background, class)
        for pid in range(self.get_n_classes() - 1):
            self.efficiency_stats(cnf_matrix, pid, txt_file)
Generates confusion matrix and efficiency/purity stats.
def parse_args(args: List[str]) -> argparse.Namespace:
    """
    Arguments parser for the main method.

    Exactly one of -p/--probabilitycuts (single cut) or -e/--evaluateproba
    (scan a range of cuts) is required; -i/--interactive and -a/--automatic
    are mutually exclusive ways of choosing the cut after a scan.

    Args:
        args (List[str]): Arguments from the command line, should be sys.argv[1:].

    Returns:
        argparse.Namespace: argparse.Namespace containing parsed args.
    """
    parser = argparse.ArgumentParser(
        prog="ML_PID_CBM ValidateBinaryModel",
        description="Program for validating Binary PID ML models",
    )
    parser.add_argument(
        "--config",
        "-c",
        nargs=1,
        required=True,
        type=str,
        help="Filename of path of config json file.",
    )
    parser.add_argument(
        "--modelname",
        "-m",
        nargs=1,
        required=True,
        type=str,
        help="Name of folder containing trained ml model.",
    )
    # exactly one way of providing the probability cut must be given
    proba_group = parser.add_mutually_exclusive_group(required=True)
    proba_group.add_argument(
        "--probabilitycuts",
        "-p",
        nargs=1,
        type=float,
        help="Probability cut value e.g., 0.2",
    )
    proba_group.add_argument(
        "--evaluateproba",
        "-e",
        nargs=3,
        type=float,
        help="""Minimal probability cut, maximal, and number of steps to investigate.
        Contrary to multi model, the lower the value of probability, the higher probability of signal""",
    )
    parser.add_argument(
        "--nworkers",
        "-n",
        type=int,
        default=1,
        help="Max number of workers for ThreadPoolExecutor which reads Root tree with data.",
    )
    decision_group = parser.add_mutually_exclusive_group()
    decision_group.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interactive mode allows selection of probability cuts after evaluating them.",
    )
    decision_group.add_argument(
        "--automatic",
        "-a",
        nargs=1,
        type=float,
        help="""Minimal purity for automatic threshold selection (in percent) e.g., 90.
        Will choose the highest efficiency for purity above this value.
        If max purity is below this value, will choose the highest purity available.""",
    )
    return parser.parse_args(args)
Arguments parser for the main method.
Args: args (List[str]): Arguments from the command line, should be sys.argv[1:].
Returns: argparse.Namespace: argparse.Namespace containing the parsed arguments.