ml_pid_cbm.tools.prepare_model

This module is used for preparing the handler of the ML model.

  1"""
  2This module is used for preparing the handler of the ML model.
  3"""
  4import json
  5from typing import Dict, List, Tuple, Union
  6
  7import xgboost as xgb
  8from hipe4ml.analysis_utils import train_test_generator
  9from hipe4ml.model_handler import ModelHandler
 10from hipe4ml.tree_handler import TreeHandler
 11from optuna.study import Study
 12
 13from . import json_tools
 14
 15
 16class PrepareModel:
 17    """
 18    Class for preparing the ML pid ModelHandler
 19    """
 20
 21    def __init__(self, json_file_name: str, optimize_hyper_params: bool, use_gpu: bool):
 22        self.json_file_name = json_file_name
 23        self.optimize_hyper_params = optimize_hyper_params
 24        self.use_gpu = use_gpu
 25
 26    def prepare_model_handler(
 27        self,
 28        json_file_name: str = None,
 29        train_test_data=None,
 30    ) -> Tuple[ModelHandler, Union[None, Study]]:
 31        """
 32        Prepares model handler for training
 33
 34        Args:
 35            json_file_name (str, optional): Name of json file containing list of features. Defaults to None.
 36            train_test_data (_type_, optional): Trainig_test_dataset using hipe4ml.train_test_generator. Only used for
 37            optimization of hyper params if this option is set to True. Defaults to None.
 38
 39        Raises:
 40            TypeError: Raises error if optimize_hyper_params is set to True, but no train_test_data was provided.
 41
 42        Returns:
 43            Tuple[ModelHandler, Union[None, Study]]: Tuple of Hipe4ml model handler ready for training,
 44            and optuna optuna.study.Study if was performed.
 45        """
 46        json_file_name = json_file_name or self.json_file_name
 47        features_for_train = json_tools.load_features_for_train(json_file_name)
 48        if self.use_gpu:
 49            tree_method = "gpu_hist"
 50        else:
 51            tree_method = "hist"
 52        if self.optimize_hyper_params is True and train_test_data is not None:
 53            model_clf = xgb.XGBClassifier(tree_method=tree_method)
 54            model_hdl = ModelHandler(model_clf, features_for_train)
 55            hyper_params_ranges = self.load_hyper_params_ranges(json_file_name)
 56            study = model_hdl.optimize_params_optuna(
 57                train_test_data,
 58                hyper_params_ranges,
 59                cross_val_scoring="roc_auc_ovo",
 60                timeout=120,
 61                n_jobs=-1,
 62                n_trials=3,
 63                direction="maximize",
 64            )
 65        elif self.optimize_hyper_params is True and train_test_data is None:
 66            raise TypeError("train_test_data must be defined to optimize hyper params")
 67        elif self.optimize_hyper_params is False:
 68            n_estimators, max_depth, learning_rate = json_tools.load_hyper_params_vals(
 69                json_file_name
 70            )
 71            model_clf = xgb.XGBClassifier(
 72                n_estimators=n_estimators,
 73                max_depth=max_depth,
 74                learning_rate=learning_rate,
 75                tree_method=tree_method,
 76            )
 77            model_hdl = ModelHandler(model_clf, features_for_train)
 78        print(f"\nModelHandler ready using configuration from {json_file_name}")
 79        if self.optimize_hyper_params:
 80            return model_hdl, study
 81        else:
 82            return model_hdl, None
 83
 84    def load_hyper_params_ranges(
 85        self, json_file_name: str = None
 86    ) -> Dict[str, Tuple[float, float]]:
 87        """Loads XGBoost hyper parameters ranges for optimization from json file.
 88
 89        Args:
 90            json_file_name (str, optional): Name of json file. Defaults to None.
 91
 92        Returns:
 93            Dict[str, Tuple[float, float]]: Dictionary containg variables names
 94            for optimization and their ranges.
 95        """
 96        json_file_name = json_file_name or self.json_file_name
 97        with open(json_file_name, "r") as json_file:
 98            hyper_params_ranges = json.load(json_file)["hyper_params"]["ranges"]
 99        # json accepts only lists, so they need to be transformed back into tuples
100        hyper_params_ranges = {k: tuple(v) for k, v in hyper_params_ranges.items()}
101        return hyper_params_ranges
102
103    @staticmethod
104    def prepare_train_test_data(
105        tree_handlers: List[TreeHandler],
106        test_size: float = 0.2,
107    ):
108        """Prepares trainig_test_dataset using hipe4ml.train_test_generator
109
110        Args:
111            protons_th (TreeHandler): TreeHandler containg protons.
112            kaons_th (TreeHandler): TreeHandler containg kaons.
113            pions_th (TreeHandler): TreeHandler containg pions.
114            test_size(float, optional): Size of created test dataset. Defaults to 0.1.
115
116        Returns:
117            List containing respectively training set dataframe,
118            training label array, test set dataframe, test label array.
119        """
120        indexes = [index for index, _ in enumerate(tree_handlers)]
121        train_test_data = train_test_generator(
122            tree_handlers, indexes, test_size=test_size
123        )
124        return train_test_data
class PrepareModel:
    """
    Class for preparing the ML pid ModelHandler
    """

    def __init__(self, json_file_name: str, optimize_hyper_params: bool, use_gpu: bool):
        self.json_file_name = json_file_name
        self.optimize_hyper_params = optimize_hyper_params
        self.use_gpu = use_gpu

    def prepare_model_handler(
        self,
        json_file_name: str = None,
        train_test_data=None,
    ) -> Tuple[ModelHandler, Union[None, Study]]:
        """Builds a hipe4ml ModelHandler ready for training.

        Args:
            json_file_name (str, optional): Name of json file containing the list
                of features; falls back to self.json_file_name. Defaults to None.
            train_test_data (optional): Training/test dataset from
                hipe4ml.train_test_generator; only consumed when hyper-parameter
                optimization is enabled. Defaults to None.

        Raises:
            TypeError: When optimize_hyper_params is True but train_test_data
                is None.

        Returns:
            Tuple[ModelHandler, Union[None, Study]]: The handler, plus the
            optuna Study when optimization was performed (None otherwise).
        """
        target_json = json_file_name or self.json_file_name
        train_features = json_tools.load_features_for_train(target_json)
        tree_method = "gpu_hist" if self.use_gpu else "hist"
        if self.optimize_hyper_params is True and train_test_data is not None:
            classifier = xgb.XGBClassifier(tree_method=tree_method)
            handler = ModelHandler(classifier, train_features)
            ranges = self.load_hyper_params_ranges(target_json)
            study = handler.optimize_params_optuna(
                train_test_data,
                ranges,
                cross_val_scoring="roc_auc_ovo",
                timeout=120,
                n_jobs=-1,
                n_trials=3,
                direction="maximize",
            )
        elif self.optimize_hyper_params is True and train_test_data is None:
            raise TypeError("train_test_data must be defined to optimize hyper params")
        elif self.optimize_hyper_params is False:
            n_estimators, max_depth, learning_rate = json_tools.load_hyper_params_vals(
                target_json
            )
            classifier = xgb.XGBClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                tree_method=tree_method,
            )
            handler = ModelHandler(classifier, train_features)
        print(f"\nModelHandler ready using configuration from {target_json}")
        return (handler, study) if self.optimize_hyper_params else (handler, None)

    def load_hyper_params_ranges(
        self, json_file_name: str = None
    ) -> Dict[str, Tuple[float, float]]:
        """Loads XGBoost hyper-parameter ranges for optimization from a json file.

        Args:
            json_file_name (str, optional): Name of json file; falls back to
                self.json_file_name. Defaults to None.

        Returns:
            Dict[str, Tuple[float, float]]: Mapping of parameter names to their
            optimization ranges.
        """
        target_json = json_file_name or self.json_file_name
        with open(target_json, "r") as config:
            raw_ranges = json.load(config)["hyper_params"]["ranges"]
        # json only supports lists, so convert each range back to a tuple
        return {name: tuple(bounds) for name, bounds in raw_ranges.items()}

    @staticmethod
    def prepare_train_test_data(
        tree_handlers: List[TreeHandler],
        test_size: float = 0.2,
    ):
        """Prepares the training/test dataset via hipe4ml.train_test_generator.

        Args:
            tree_handlers (List[TreeHandler]): Handlers, one per particle
                species; each handler's positional index is its class label.
            test_size (float, optional): Size of created test dataset.
                Defaults to 0.2.

        Returns:
            List containing respectively training set dataframe,
            training label array, test set dataframe, test label array.
        """
        labels = [position for position, _ in enumerate(tree_handlers)]
        return train_test_generator(tree_handlers, labels, test_size=test_size)

Class for preparing the ML pid ModelHandler

PrepareModel(json_file_name: str, optimize_hyper_params: bool, use_gpu: bool)
22    def __init__(self, json_file_name: str, optimize_hyper_params: bool, use_gpu: bool):
23        self.json_file_name = json_file_name
24        self.optimize_hyper_params = optimize_hyper_params
25        self.use_gpu = use_gpu
def prepare_model_handler( self, json_file_name: str = None, train_test_data=None) -> Tuple[hipe4ml.model_handler.ModelHandler, Union[NoneType, optuna.study.study.Study]]:
27    def prepare_model_handler(
28        self,
29        json_file_name: str = None,
30        train_test_data=None,
31    ) -> Tuple[ModelHandler, Union[None, Study]]:
32        """
33        Prepares model handler for training
34
35        Args:
36            json_file_name (str, optional): Name of json file containing list of features. Defaults to None.
37            train_test_data (_type_, optional): Trainig_test_dataset using hipe4ml.train_test_generator. Only used for
38            optimization of hyper params if this option is set to True. Defaults to None.
39
40        Raises:
41            TypeError: Raises error if optimize_hyper_params is set to True, but no train_test_data was provided.
42
43        Returns:
44            Tuple[ModelHandler, Union[None, Study]]: Tuple of Hipe4ml model handler ready for training,
45            and optuna optuna.study.Study if was performed.
46        """
47        json_file_name = json_file_name or self.json_file_name
48        features_for_train = json_tools.load_features_for_train(json_file_name)
49        if self.use_gpu:
50            tree_method = "gpu_hist"
51        else:
52            tree_method = "hist"
53        if self.optimize_hyper_params is True and train_test_data is not None:
54            model_clf = xgb.XGBClassifier(tree_method=tree_method)
55            model_hdl = ModelHandler(model_clf, features_for_train)
56            hyper_params_ranges = self.load_hyper_params_ranges(json_file_name)
57            study = model_hdl.optimize_params_optuna(
58                train_test_data,
59                hyper_params_ranges,
60                cross_val_scoring="roc_auc_ovo",
61                timeout=120,
62                n_jobs=-1,
63                n_trials=3,
64                direction="maximize",
65            )
66        elif self.optimize_hyper_params is True and train_test_data is None:
67            raise TypeError("train_test_data must be defined to optimize hyper params")
68        elif self.optimize_hyper_params is False:
69            n_estimators, max_depth, learning_rate = json_tools.load_hyper_params_vals(
70                json_file_name
71            )
72            model_clf = xgb.XGBClassifier(
73                n_estimators=n_estimators,
74                max_depth=max_depth,
75                learning_rate=learning_rate,
76                tree_method=tree_method,
77            )
78            model_hdl = ModelHandler(model_clf, features_for_train)
79        print(f"\nModelHandler ready using configuration from {json_file_name}")
80        if self.optimize_hyper_params:
81            return model_hdl, study
82        else:
83            return model_hdl, None

Prepares model handler for training

Args: json_file_name (str, optional): Name of json file containing list of features. Defaults to None. train_test_data (optional): Training/test dataset produced by hipe4ml.train_test_generator. Only used for optimization of hyper params if this option is set to True. Defaults to None.

Raises: TypeError: Raises error if optimize_hyper_params is set to True, but no train_test_data was provided.

Returns: Tuple[ModelHandler, Union[None, Study]]: Tuple of Hipe4ml model handler ready for training, and an optuna.study.Study if optimization was performed.

def load_hyper_params_ranges(self, json_file_name: str = None) -> Dict[str, Tuple[float, float]]:
 85    def load_hyper_params_ranges(
 86        self, json_file_name: str = None
 87    ) -> Dict[str, Tuple[float, float]]:
 88        """Loads XGBoost hyper parameters ranges for optimization from json file.
 89
 90        Args:
 91            json_file_name (str, optional): Name of json file. Defaults to None.
 92
 93        Returns:
 94            Dict[str, Tuple[float, float]]: Dictionary containg variables names
 95            for optimization and their ranges.
 96        """
 97        json_file_name = json_file_name or self.json_file_name
 98        with open(json_file_name, "r") as json_file:
 99            hyper_params_ranges = json.load(json_file)["hyper_params"]["ranges"]
100        # json accepts only lists, so they need to be transformed back into tuples
101        hyper_params_ranges = {k: tuple(v) for k, v in hyper_params_ranges.items()}
102        return hyper_params_ranges

Loads XGBoost hyper parameters ranges for optimization from json file.

Args: json_file_name (str, optional): Name of json file. Defaults to None.

Returns: Dict[str, Tuple[float, float]]: Dictionary containing variable names for optimization and their ranges.

@staticmethod
def prepare_train_test_data( tree_handlers: List[hipe4ml.tree_handler.TreeHandler], test_size: float = 0.2):
104    @staticmethod
105    def prepare_train_test_data(
106        tree_handlers: List[TreeHandler],
107        test_size: float = 0.2,
108    ):
109        """Prepares trainig_test_dataset using hipe4ml.train_test_generator
110
111        Args:
112            protons_th (TreeHandler): TreeHandler containg protons.
113            kaons_th (TreeHandler): TreeHandler containg kaons.
114            pions_th (TreeHandler): TreeHandler containg pions.
115            test_size(float, optional): Size of created test dataset. Defaults to 0.1.
116
117        Returns:
118            List containing respectively training set dataframe,
119            training label array, test set dataframe, test label array.
120        """
121        indexes = [index for index, _ in enumerate(tree_handlers)]
122        train_test_data = train_test_generator(
123            tree_handlers, indexes, test_size=test_size
124        )
125        return train_test_data

Prepares the training/test dataset using hipe4ml.train_test_generator

Args: tree_handlers (List[TreeHandler]): TreeHandlers, one per particle species (e.g. protons, kaons, pions); each handler's positional index is used as its class label. test_size (float, optional): Size of created test dataset. Defaults to 0.2.

Returns: List containing respectively training set dataframe, training label array, test set dataframe, test label array.