# Source code for mafese.embedded.tree

#!/usr/bin/env python
# Created by "Thieu" at 16:43, 31/05/2023 ----------%                                                                               
#       Email: nguyenthieu2102@gmail.com            %                                                    
#       Github: https://github.com/thieu1995        %                         
# --------------------------------------------------%

import numpy as np
from sklearn.feature_selection import SelectFromModel
from mafese.selector import Selector
from mafese.utils import validator
from mafese.utils.estimator import get_tree_based_estimator


class TreeSelector(Selector):
    """
    Defines a TreeSelector class that holds all Tree-based Feature Selection methods for feature selection problems

    The selection itself is delegated to scikit-learn's ``SelectFromModel``, wrapped around the
    tree-based estimator built from the ``problem`` / ``estimator`` / ``estimator_paras`` arguments.

    Parameters
    ----------
    problem: str, default = "classification"
        The problem you are trying to solve (or type of dataset), "classification" or "regression"

    estimator: str, default = 'tree'
        We currently support:
            - rf: random forest
            - adaboost: AdaBoost
            - xgb: Gradient Boosting
            - tree: Extra Trees

    estimator_paras: None or dict, default = None
        The parameters of the estimator, please see the official document of scikit-learn to selected estimator.
        If None, we use the best parameter for selected estimator

    threshold : str or float, default=None
        The threshold value to use for feature selection. Features whose absolute importance value is greater or equal
        are kept while the others are discarded. If "median" (resp. "mean"), then the ``threshold`` value is the median
        (resp. the mean) of the feature importances. A scaling factor (e.g., "1.25*mean") may also be used. If None and if the
        estimator has a parameter penalty set to l1, either explicitly or implicitly (e.g, Lasso), the threshold used is 1e-5.
        Otherwise, "mean" is used by default.

    norm_order : non-zero int, inf, -inf, default=1
        Order of the norm used to filter the vectors of coefficients below ``threshold`` in the case where the
        ``coef_`` attribute of the estimator is of dimension 2.

    max_features : int, callable, default=None
        The maximum number of features to select.

        - If an integer, then it specifies the maximum number of features to
          allow.
        - If a callable, then it specifies how to calculate the maximum number of
          features allowed by using the output of `max_features(X)`.
        - If `None`, then all features are kept.

        To only select based on ``max_features``, set ``threshold=-np.inf``.

    Attributes
    ----------
    selected_feature_masks : numpy.ndarray
        Mask returned by ``SelectFromModel.get_support()``; set by :meth:`fit`.

    selected_feature_solution : numpy.ndarray
        The same mask converted to integers (1 is selected, 0 is not selected); set by :meth:`fit`.

    selected_feature_indexes : numpy.ndarray
        Indexes of the selected features; set by :meth:`fit`.

    Examples
    --------
    The following example shows how to retrieve the most informative features in the Tree-based FS method

    >>> import pandas as pd
    >>> from mafese.embedded.tree import TreeSelector
    >>> # load dataset
    >>> dataset = pd.read_csv('your_path/dataset.csv', index_col=0).values
    >>> X, y = dataset[:, 0:-1], dataset[:, -1]     # Assumption that the last column is label column
    >>> # define mafese feature selection method
    >>> feat_selector = TreeSelector(problem="classification", estimator="tree")
    >>> # find all relevant features
    >>> feat_selector.fit(X, y)
    >>> # check selected features - True (or 1) is selected, False (or 0) is not selected
    >>> print(feat_selector.selected_feature_masks)
    array([ True, True, True, False, False, True, False, False, False, True])
    >>> print(feat_selector.selected_feature_solution)
    array([ 1, 1, 1, 0, 0, 1, 0, 0, 0, 1])
    >>> # check the index of selected features
    >>> print(feat_selector.selected_feature_indexes)
    array([ 0, 1, 2, 5, 9])
    >>> # call transform() on X to filter it down to selected features
    >>> X_filtered = feat_selector.transform(X)
    """

    # Estimator short names accepted by ``_set_estimator``.
    SUPPORTED = ["rf", "adaboost", "xgb", "tree"]

    def __init__(self, problem="classification", estimator="tree", estimator_paras=None, threshold=None, norm_order=1, max_features=None):
        """
        Build the tree-based estimator and the ``SelectFromModel`` wrapper around it.

        See the class docstring for the meaning of each parameter.
        """
        super().__init__(problem)
        # ``_set_estimator`` reads ``self.problem``, so it is called only after the base initializer
        # (assumes ``Selector.__init__`` stores ``problem`` on the instance -- TODO confirm).
        self.estimator = self._set_estimator(estimator, estimator_paras)
        self.estimator_paras = estimator_paras
        self.threshold = threshold
        self.norm_order = norm_order
        self.max_features = max_features
        # prefit=False: the estimator is trained when ``fit`` is called on this selector.
        self.selector = SelectFromModel(estimator=self.estimator, threshold=self.threshold, prefit=False,
                                        norm_order=self.norm_order, max_features=self.max_features)

    def _set_estimator(self, estimator=None, paras=None):
        """
        Turn an estimator short name into an estimator object.

        Parameters
        ----------
        estimator : str
            Name of the estimator; checked against ``SUPPORTED`` through ``validator.check_str``.

        paras : None or dict, default = None
            Parameters forwarded to ``get_tree_based_estimator``.

        Returns
        -------
        The object returned by ``get_tree_based_estimator`` for the current problem and estimator name.

        Raises
        ------
        TypeError
            If ``estimator`` is not a string.
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses such as numpy.str_.
        if not isinstance(estimator, str):
            raise TypeError("Estimator should be a string.")
        estimator_name = validator.check_str("estimator", estimator, self.SUPPORTED)
        return get_tree_based_estimator(self.problem, estimator_name, paras)

    def fit(self, X, y=None):
        """
        Fit the underlying ``SelectFromModel`` and record which features were selected.

        Parameters
        ----------
        X : array-like
            Training data, passed unchanged to ``SelectFromModel.fit``.

        y : array-like, default = None
            Target values, passed unchanged to ``SelectFromModel.fit``.

        Notes
        -----
        Sets ``selected_feature_masks``, ``selected_feature_solution`` and ``selected_feature_indexes``.
        """
        self.selector.fit(X, y)
        self.selected_feature_masks = self.selector.get_support()
        self.selected_feature_solution = np.array(self.selected_feature_masks, dtype=int)
        # np.where returns a tuple with one index array per dimension; take the first one.
        self.selected_feature_indexes = np.where(self.selected_feature_masks)[0]