Source code for mafese.filter

#!/usr/bin/env python
# Created by "Thieu" at 22:14, 24/05/2023 ----------%
#       Email: nguyenthieu2102@gmail.com            %
#       Github: https://github.com/thieu1995        %
# --------------------------------------------------%

import numpy as np
from mafese.selector import Selector
from mafese.utils import validator, correlation


class FilterSelector(Selector):
    """
    Defines a FilterSelector class that holds all filter methods for feature selection problems

    Parameters
    ----------
    problem: str, default = "classification"
        The problem you are trying to solve (or type of dataset), "classification" or "regression"

    method: str, default = "ANOVA"
        If the problem = "classification", FilterSelector's supported method can be one of these values:

            - "CHI": Chi-Squared statistic
            - "ANOVA": ANOVA F-score
            - "MI": Mutual information
            - "KENDALL":  Kendall Tau correlation
            - "SPEARMAN": Spearman’s Rho correlation
            - "POINT": Point-biserial correlation
            - "RELIEF": Original Relief method
            - "RELIEF-F": Weighted average Relief based on the frequency of each class
            - "VLS-RELIEF-F": Very Large Scale ReliefF

        If the problem = "regression", FilterSelector's supported method can be one of these values:

            - "PEARSON": Pearson correlation
            - "ANOVA": ANOVA F-score
            - "MI": Mutual information
            - "KENDALL": Kendall Tau correlation
            - "SPEARMAN": Spearman’s Rho correlation
            - "POINT": Point-biserial correlation
            - "RELIEF": Original Relief method
            - "RELIEF-F": Weighted average Relief based on the frequency of each class
            - "VLS-RELIEF-F": Very Large Scale ReliefF

    n_features: int or float, default=3
        If integer, the parameter is the absolute number of features to select.
        If float between 0 and 1, it is the fraction of features to select.

    n_neighbors : int, default=5, Optional
        Number of neighbors to use for computing feature importance scores of Relief-based family

    n_bins : int, default=10, Optional
        Number of bins to use for discretizing the target variable of Relief-based family in regression problems.

    normalized: bool, default=True, Optional
        Normalize feature importance scores by the number of instances in the dataset.
        Only forwarded to the Relief-based family.

    Attributes
    ----------

    n_features: int or float
        The number (or fraction) of features to select, stored exactly as passed to the constructor.

    supported_methods: dict
        Key: is the supported method name
        Value: is the name of the scoring function looked up in ``mafese.utils.correlation``

    method: callable
        The scoring function resolved from ``method_name``

    method_name: str
        The method that will be used

    selected_feature_masks: array-like
        Set by ``fit``. Mask returned by ``correlation.select_bests``; truthy entries mark selected features.

    selected_feature_solution: np.ndarray
        Set by ``fit``. Integer (0/1) version of ``selected_feature_masks``.

    selected_feature_indexes: np.ndarray
        Set by ``fit``. Column indexes of the selected features.

    Examples
    --------
    The following example shows how to retrieve the most informative features in the FilterSelector FS method

    >>> import pandas as pd
    >>> from mafese.filter import FilterSelector
    >>> # load dataset
    >>> dataset = pd.read_csv('your_path/dataset.csv', index_col=0).values
    >>> X, y = dataset[:, 0:-1], dataset[:, -1]     # Assumption that the last column is label column
    >>> # define mafese feature selection method
    >>> feat_selector = FilterSelector(problem='classification', method='SPEARMAN', n_features=5)
    >>> # find all relevant features
    >>> feat_selector.fit(X, y)
    >>> # check selected features - True (or 1) is selected, False (or 0) is not selected
    >>> print(feat_selector.selected_feature_masks)
    array([ True, True, True, False, False, True, False, False, False, True])
    >>> print(feat_selector.selected_feature_solution)
    array([ 1, 1, 1, 0, 0, 1, 0, 0, 0, 1])
    >>> # check the index of selected features
    >>> print(feat_selector.selected_feature_indexes)
    array([ 0, 1, 2, 5, 9])
    >>> # call transform() on X to filter it down to selected features
    >>> X_filtered = feat_selector.transform(X)
    """

    # Maps problem type -> {public method name: function name inside mafese.utils.correlation}
    SUPPORT = {
        "classification": {"CHI": "chi2_func", "ANOVA": "f_classification_func", "MI": "mutual_info_classif",
                           "KENDALL": "kendall_func", "SPEARMAN": "spearman_func", "POINT": "point_func",
                           "RELIEF": "relief_func", "RELIEF-F": "relief_f_func",
                           "VLS-RELIEF-F": "vls_relief_f_func"},
        "regression": {"PEARSON": "r_regression", "ANOVA": "f_regression_func", "MI": "mutual_info_regression",
                       "KENDALL": "kendall_func", "SPEARMAN": "spearman_func", "POINT": "point_func",
                       "RELIEF": "relief_func", "RELIEF-F": "relief_f_func",
                       "VLS-RELIEF-F": "vls_relief_f_func"}
    }

    # Methods whose scoring functions take the extra Relief keyword arguments in fit()
    _RELIEF_METHODS = ("RELIEF", "RELIEF-F", "VLS-RELIEF-F")

    def __init__(self, problem="classification", method="ANOVA", n_features=3, n_neighbors=5, n_bins=10, normalized=True):
        super().__init__(problem)
        # self.problem is expected to be set by Selector.__init__ (not visible here)
        self.supported_methods = self.SUPPORT[self.problem]
        self.method, self.method_name = self._set_method(method)
        self.n_features = n_features
        self.n_neighbors = n_neighbors
        self.n_bins = n_bins
        self.normalized = normalized

    def _set_method(self, method=None):
        """
        Resolve a method name into its scoring function.

        Parameters
        ----------
        method: str
            Name of the filter method; checked against ``self.supported_methods`` by ``validator.check_str``.

        Returns
        -------
        tuple
            ``(scoring_function, method_name)`` where ``method_name`` is the value returned by the validator.

        Raises
        ------
        TypeError
            If ``method`` is not a string.
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if not isinstance(method, str):
            raise TypeError("Your method needs to be a string.")
        method_name = validator.check_str("method", method, list(self.supported_methods.keys()))
        return getattr(correlation, self.supported_methods[method_name]), method_name

    def fit(self, X, y=None):
        """
        Score every feature with the configured method and record which features are selected.

        Parameters
        ----------
        X: array-like
            Input data passed to the scoring function.

        y: array-like, default=None
            Target values passed to the scoring function.
        """
        if self.method_name in self._RELIEF_METHODS:
            # Only the Relief family receives the neighbor / binning / normalization settings
            importance_scores = self.method(X, y, n_neighbors=self.n_neighbors, n_bins=self.n_bins,
                                            problem=self.problem, normalized=self.normalized)
        else:
            importance_scores = self.method(X, y)
        self.selected_feature_masks = correlation.select_bests(importance_scores, n_features=self.n_features)
        self.selected_feature_solution = np.array(self.selected_feature_masks, dtype=int)
        self.selected_feature_indexes = np.where(self.selected_feature_masks)[0]

    def transform(self, X):
        """
        Reduce ``X`` to the columns selected by a previous call to ``fit``.

        Parameters
        ----------
        X: array-like
            Data indexed as ``X[:, indexes]``, i.e. it must support 2-D column indexing.

        Returns
        -------
        array-like
            ``X`` restricted to the selected feature columns.

        Raises
        ------
        AttributeError
            If the selector has not been fitted yet.
        """
        selected = getattr(self, "selected_feature_indexes", None)
        if selected is None:
            # Without this guard an unfitted selector would either fail with an opaque
            # attribute error or index with None, which adds an axis instead of selecting columns.
            raise AttributeError("This FilterSelector instance is not fitted yet. "
                                 "Call 'fit' before using 'transform'.")
        return X[:, selected]

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit the selector on ``(X, y)`` and return ``X`` reduced to the selected features.

        ``fit_params`` is accepted but not used by this implementation.
        """
        self.fit(X, y)
        return self.transform(X)