ml_pid_cbm.tools.prepare_model
This module is used for preparing the handler of the ML model.
1""" 2This module is used for preparing the handler of the ML model. 3""" 4import json 5from typing import Dict, List, Tuple, Union 6 7import xgboost as xgb 8from hipe4ml.analysis_utils import train_test_generator 9from hipe4ml.model_handler import ModelHandler 10from hipe4ml.tree_handler import TreeHandler 11from optuna.study import Study 12 13from . import json_tools 14 15 16class PrepareModel: 17 """ 18 Class for preparing the ML pid ModelHandler 19 """ 20 21 def __init__(self, json_file_name: str, optimize_hyper_params: bool, use_gpu: bool): 22 self.json_file_name = json_file_name 23 self.optimize_hyper_params = optimize_hyper_params 24 self.use_gpu = use_gpu 25 26 def prepare_model_handler( 27 self, 28 json_file_name: str = None, 29 train_test_data=None, 30 ) -> Tuple[ModelHandler, Union[None, Study]]: 31 """ 32 Prepares model handler for training 33 34 Args: 35 json_file_name (str, optional): Name of json file containing list of features. Defaults to None. 36 train_test_data (_type_, optional): Trainig_test_dataset using hipe4ml.train_test_generator. Only used for 37 optimization of hyper params if this option is set to True. Defaults to None. 38 39 Raises: 40 TypeError: Raises error if optimize_hyper_params is set to True, but no train_test_data was provided. 41 42 Returns: 43 Tuple[ModelHandler, Union[None, Study]]: Tuple of Hipe4ml model handler ready for training, 44 and optuna optuna.study.Study if was performed. 45 """ 46 json_file_name = json_file_name or self.json_file_name 47 features_for_train = json_tools.load_features_for_train(json_file_name) 48 if self.use_gpu: 49 tree_method = "gpu_hist" 50 else: 51 tree_method = "hist" 52 if self.optimize_hyper_params is True and train_test_data is not None: 53 model_clf = xgb.XGBClassifier(tree_method=tree_method) 54 model_hdl = ModelHandler(model_clf, features_for_train) 55 hyper_params_ranges = self.load_hyper_params_ranges(json_file_name) 56 study = model_hdl.optimize_params_optuna( 57 train_test_data, 58 hyper_params_ranges, 59 cross_val_scoring="roc_auc_ovo", 60 timeout=120, 61 n_jobs=-1, 62 n_trials=3, 63 direction="maximize", 64 ) 65 elif self.optimize_hyper_params is True and train_test_data is None: 66 raise TypeError("train_test_data must be defined to optimize hyper params") 67 elif self.optimize_hyper_params is False: 68 n_estimators, max_depth, learning_rate = json_tools.load_hyper_params_vals( 69 json_file_name 70 ) 71 model_clf = xgb.XGBClassifier( 72 n_estimators=n_estimators, 73 max_depth=max_depth, 74 learning_rate=learning_rate, 75 tree_method=tree_method, 76 ) 77 model_hdl = ModelHandler(model_clf, features_for_train) 78 print(f"\nModelHandler ready using configuration from {json_file_name}") 79 if self.optimize_hyper_params: 80 return model_hdl, study 81 else: 82 return model_hdl, None 83 84 def load_hyper_params_ranges( 85 self, json_file_name: str = None 86 ) -> Dict[str, Tuple[float, float]]: 87 """Loads XGBoost hyper parameters ranges for optimization from json file. 88 89 Args: 90 json_file_name (str, optional): Name of json file. Defaults to None. 91 92 Returns: 93 Dict[str, Tuple[float, float]]: Dictionary containg variables names 94 for optimization and their ranges. 
95 """ 96 json_file_name = json_file_name or self.json_file_name 97 with open(json_file_name, "r") as json_file: 98 hyper_params_ranges = json.load(json_file)["hyper_params"]["ranges"] 99 # json accepts only lists, so they need to be transformed back into tuples 100 hyper_params_ranges = {k: tuple(v) for k, v in hyper_params_ranges.items()} 101 return hyper_params_ranges 102 103 @staticmethod 104 def prepare_train_test_data( 105 tree_handlers: List[TreeHandler], 106 test_size: float = 0.2, 107 ): 108 """Prepares trainig_test_dataset using hipe4ml.train_test_generator 109 110 Args: 111 protons_th (TreeHandler): TreeHandler containg protons. 112 kaons_th (TreeHandler): TreeHandler containg kaons. 113 pions_th (TreeHandler): TreeHandler containg pions. 114 test_size(float, optional): Size of created test dataset. Defaults to 0.1. 115 116 Returns: 117 List containing respectively training set dataframe, 118 training label array, test set dataframe, test label array. 119 """ 120 indexes = [index for index, _ in enumerate(tree_handlers)] 121 train_test_data = train_test_generator( 122 tree_handlers, indexes, test_size=test_size 123 ) 124 return train_test_data
PrepareModel

Class for preparing the ML PID ModelHandler.
prepare_model_handler

Prepares the model handler for training.

Args:
    json_file_name (str, optional): Name of the JSON file containing the list of features. Defaults to None.
    train_test_data (list, optional): Training/test dataset created with hipe4ml's train_test_generator. Only used for hyperparameter optimization if that option is set to True. Defaults to None.

Raises:
    TypeError: If optimize_hyper_params is set to True but no train_test_data was provided.

Returns:
    Tuple[ModelHandler, Union[None, Study]]: Hipe4ml model handler ready for training, and the optuna.study.Study if optimization was performed, otherwise None.
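As a sketch of the two remaining code paths, reusing the hypothetical `prepare` instance from the example above:

```python
# Fixed hyperparameters are read from the JSON file; no Study is produced
prepare.optimize_hyper_params = False
model_hdl, study = prepare.prepare_model_handler()
assert study is None

# With optimization enabled but no dataset, the documented TypeError is raised
prepare.optimize_hyper_params = True
try:
    prepare.prepare_model_handler()
except TypeError as err:
    print(err)  # train_test_data must be defined to optimize hyper params
```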
load_hyper_params_ranges

Loads XGBoost hyperparameter ranges for optimization from a JSON file.

Args:
    json_file_name (str, optional): Name of the JSON file. Defaults to None.

Returns:
    Dict[str, Tuple[float, float]]: Dictionary containing the names of the variables to optimize and their ranges.
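The expected JSON layout is only partially pinned down by the code: the "hyper_params" → "ranges" nesting and the list-to-tuple conversion are fixed, while the parameter names below are illustrative. A sketch:

```python
import json

# Hypothetical config fragment; only the "hyper_params" -> "ranges" nesting
# is dictated by load_hyper_params_ranges, the parameter names are examples
config = {
    "hyper_params": {
        "ranges": {
            "n_estimators": [100, 1000],
            "max_depth": [2, 6],
            "learning_rate": [0.01, 0.3],
        }
    }
}
with open("config.json", "w") as json_file:
    json.dump(config, json_file)

ranges = prepare.load_hyper_params_ranges("config.json")
# JSON has no tuple type, so each [low, high] list comes back as a tuple,
# which is the range format optimize_params_optuna expects
print(ranges["max_depth"])  # (2, 6)
```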
prepare_train_test_data

Prepares the training/test dataset using hipe4ml's train_test_generator.

Args:
    tree_handlers (List[TreeHandler]): TreeHandlers, one per particle species (e.g., protons, kaons, pions). The class label of each species is its index in this list.
    test_size (float, optional): Size of the created test dataset. Defaults to 0.2.

Returns:
    List containing, respectively, the training set dataframe, the training label array, the test set dataframe, and the test label array.
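Continuing the first sketch, the returned list unpacks into four parts, with labels assigned by input order:

```python
# train_test_generator returns [train_df, y_train, test_df, y_test];
# labels follow the input order (0 = protons, 1 = kaons, 2 = pions here)
train_df, y_train, test_df, y_test = PrepareModel.prepare_train_test_data(
    tree_handlers, test_size=0.2
)
print(sorted(set(y_train)))  # [0, 1, 2]
```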