Source code for mafese.utils.correlation

#!/usr/bin/env python
# Created by "Thieu" at 08:38, 25/05/2023 ----------%                                                                               
#       Email: nguyenthieu2102@gmail.com            %                                                    
#       Github: https://github.com/thieu1995        %                         
# --------------------------------------------------%

"""
Refs:
1. https://docs.scipy.org/doc/scipy/reference/stats.html#correlation-functions
2. https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
"""

import numpy as np
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import r_regression, f_regression, mutual_info_regression
from scipy import stats


def chi2_func(X, y):
    """
    Return the chi-squared statistic of every feature in X with respect to y.

    Thin wrapper around ``sklearn.feature_selection.chi2`` that keeps only the
    first element of its result (the statistics) and drops the p-values.
    """
    statistics, _p_values = chi2(X, y)
    return statistics
def f_classification_func(X, y):
    """
    Return the ANOVA F-statistic of every feature in X with respect to y.

    Thin wrapper around ``sklearn.feature_selection.f_classif`` that keeps only
    the first element of its result (the statistics) and drops the p-values.
    """
    statistics, _p_values = f_classif(X, y)
    return statistics
def f_regression_func(X, y, center=True, force_finite=True):
    """
    Return the univariate regression F-statistic of every feature in X.

    Thin wrapper around ``sklearn.feature_selection.f_regression``; ``center``
    and ``force_finite`` are forwarded unchanged. Only the first element of
    the result (the statistics) is returned, the p-values are dropped.
    """
    statistics, _p_values = f_regression(
        X, y, center=center, force_finite=force_finite
    )
    return statistics
def kendall_func(X, y):
    """
    Return Kendall's tau between each column of X and the target y.

    X is indexed as a 2-D array (``X[:, col]``); the result is a 1-D array
    with one correlation value per column.
    """
    taus = []
    for col in range(X.shape[1]):
        outcome = stats.kendalltau(X[:, col], y)
        taus.append(outcome.correlation)
    return np.array(taus)
def spearman_func(X, y):
    """
    Return Spearman's rank correlation between each column of X and y.

    X is indexed as a 2-D array (``X[:, col]``); the result is a 1-D array
    with one correlation value per column.
    """
    rhos = []
    for col in range(X.shape[1]):
        outcome = stats.spearmanr(X[:, col], y)
        rhos.append(outcome.correlation)
    return np.array(rhos)
def point_func(X, y):
    """
    Return the point-biserial correlation between each column of X and y.

    X is indexed as a 2-D array (``X[:, col]``); the result is a 1-D array
    with one correlation value per column.
    """
    coefficients = []
    for col in range(X.shape[1]):
        outcome = stats.pointbiserialr(X[:, col], y)
        coefficients.append(outcome.correlation)
    return np.array(coefficients)
def select_bests(importance_scores=None, n_features=3):
    """
    Select features according to the k highest scores or a percentile of the highest scores.

    Parameters
    ----------
    importance_scores : array-like of shape (n_features,)
        Scores of features. Lists/tuples are accepted and converted with ``np.asarray``.

    n_features : int, float. default=3
        Number of selected features.

        - If `float`, it should be in range of (0, 1). That represent the percentile of the highest scores;
          ``int(n_features * N) + 1`` features are kept.
        - If `int`, it should be in range of [1, N]. N is total number of features in your dataset.

        NumPy integer / floating scalars are accepted as well; booleans are rejected.

    Returns
    -------
    mask : np.ndarray of bool, same shape as ``importance_scores``
        True at the positions of the selected (highest scoring) features.

    Raises
    ------
    ValueError
        If ``n_features`` has an unsupported type or is outside the allowed range.
    """
    scores = np.asarray(importance_scores)
    max_features = len(scores)

    # bool is a subclass of int, but True/False is never a meaningful count.
    if isinstance(n_features, (bool, np.bool_)):
        raise ValueError("n_features should be int if based on k highest scores, or float if based on percentile of highest scores.")

    if isinstance(n_features, (float, np.floating)):
        if not 0 < n_features < 1:
            raise ValueError("n_features based on percentile should has value in range (0, 1).")
        n_features = int(n_features * max_features) + 1
    elif isinstance(n_features, (int, np.integer)):
        n_features = int(n_features)
    else:
        raise ValueError("n_features should be int if based on k highest scores, or float if based on percentile of highest scores.")

    if n_features < 1 or n_features > max_features:
        raise ValueError(f"n_features should has value in range [1, {max_features}].")

    mask = np.zeros(scores.shape, dtype=bool)
    # A stable sort keeps tie-breaking deterministic: among equal scores the
    # later indices end up at the tail and are therefore preferred.
    mask[np.argsort(scores, kind="mergesort")[-n_features:]] = True
    return mask
def relief_func(X, y, n_neighbors=5, n_bins=10, problem="classification", normalized=True, **kwargs):
    """
    Compute Relief-style feature importance scores for dataset X and target y.

    For every instance, the nearest instances (Euclidean distance) that carry a
    different label are collected, up to `n_neighbors` of them. Each feature's
    score grows by the summed absolute difference to those instances divided
    by `n_neighbors`.

    Parameters
    ----------
    X : numpy array
        Input dataset of shape (n_samples, n_features).
    y : numpy array
        Target variable of shape (n_samples,).
    n_neighbors : int, default=5
        Number of different-label neighbors used per instance.
    n_bins : int, default=10
        Number of bin edges used to discretize the target in regression problems.
    problem : str
        Either "regression" or "classification". With "regression" the target
        is discretized into classes before scoring.
    normalized : bool, default=True
        Divide the final scores by the number of instances.

    Returns
    -------
    importance score : np.ndarray
        Vector of feature importance scores, with shape (n_features,).
    """
    sample_count, feature_count = X.shape[0], X.shape[1]
    scores = np.zeros(feature_count)

    # Regression targets are turned into class labels by binning.
    if problem == "regression":
        edges = np.linspace(np.min(y), np.max(y), n_bins)
        y = np.digitize(y, edges) - 1

    # Full pairwise Euclidean distance matrix between instances.
    pairwise_delta = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    distances = np.sqrt(np.sum(pairwise_delta ** 2, axis=-1))

    for row in range(sample_count):
        label = y[row]

        # Walk the instances from nearest to farthest and keep those whose
        # label differs, stopping once n_neighbors have been collected.
        misses = []
        for candidate in np.argsort(distances[row]):
            if len(misses) == n_neighbors:
                break
            if y[candidate] != label:
                misses.append(candidate)

        # Accumulate the per-feature contribution of this instance.
        for col in range(feature_count):
            gap = np.abs(X[row, col] - X[misses, col])
            scores[col] += np.sum(gap) / n_neighbors

    if normalized:
        scores /= sample_count
    return scores
def relief_f_func(X, y, n_neighbors=5, n_bins=10, problem="classification", normalized=True, **kwargs):
    """
    Performs Relief-F feature selection on the input dataset X and target variable y.
    Returns a vector of feature importance scores

    For every instance the `n_neighbors` nearest instances with a different
    label ("misses") and the `n_neighbors` nearest instances with the same
    label ("hits") are looked up by Euclidean distance. Per class, a feature's
    score is increased by its summed absolute difference to the misses and
    decreased by its summed absolute difference to the hits (both divided by
    `n_neighbors`). The per-class scores are then combined with the class
    frequencies as weights.

    Parameters
    ----------
    X : numpy array
        Input dataset of shape (n_samples, n_features).
    y : numpy array
        Target variable of shape (n_samples,). Labels may be arbitrary values
        (they are mapped to 0..n_classes-1 internally).
    n_neighbors : int, default=5
        Number of neighbors to use for computing feature importance scores.
    n_bins : int, default=10
        Number of bins to use for discretizing the target variable in regression problems.
    problem : str
        The problem of dataset, either regression or classification
        If `regression`, discretize the target variable into n_bins classes
    normalized : bool, default=True
        Normalize feature importance scores by the number of instances in the dataset

    Returns
    -------
    importance score : np.ndarray
        Vector of feature importance scores, with shape (n_features,).
    """
    n_samples, n_features = X.shape[0], X.shape[1]

    # Regression problem: discretize the target variable into n_bins classes
    if problem == "regression":
        y_bins = np.linspace(np.min(y), np.max(y), n_bins)
        y = np.digitize(y, y_bins) - 1

    # Map labels onto contiguous indices 0..n_classes-1. Using the raw labels
    # as column indices breaks as soon as they are not exactly 0..C-1, e.g.
    # labels {1, 2}, string labels, or regression bins that stay empty.
    classes, y_idx = np.unique(np.asarray(y).ravel(), return_inverse=True)
    n_classes = len(classes)
    importance_scores = np.zeros((n_features, n_classes))

    # Compute distance matrix between instances in the dataset
    dist_matrix = np.sqrt(np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1))

    for i in range(n_samples):
        label = y_idx[i]
        order = np.argsort(dist_matrix[i])
        is_miss = y_idx[order] != label

        # Keep at most n_neighbors of each kind, nearest first. The lists must
        # be capped individually; otherwise one of them overshoots while the
        # other is still filling up and far-away instances leak into the score.
        # NOTE: instance i itself has distance 0 and the same label, so it
        # normally occupies one of the hit slots (contributing a zero diff).
        misses = order[is_miss][:n_neighbors]
        hits = order[~is_miss][:n_neighbors]

        # Update all features of the current class at once.
        importance_scores[:, label] += np.abs(X[i] - X[misses]).sum(axis=0) / n_neighbors
        importance_scores[:, label] -= np.abs(X[i] - X[hits]).sum(axis=0) / n_neighbors

    # Combine feature importance scores for each class using a weighted
    # average based on the frequency of each class
    class_freq = np.bincount(y_idx, minlength=n_classes) / n_samples
    importance_scores_weighted = np.dot(importance_scores, class_freq)

    # Normalize feature importance scores by the number of instances in the dataset
    if normalized:
        importance_scores_weighted /= n_samples
    return importance_scores_weighted
def vls_relief_f_func(X, y, n_neighbors=5, n_bins=10, problem="classification", normalized=True, **kwargs):
    """
    Compute "Very Large Scale ReliefF"-style feature importance scores.

    For every instance, `n_neighbors` distinct instances are drawn at random
    (``np.random.choice`` without replacement, global NumPy random state).
    Per feature, a relevance term sums the absolute differences to the drawn
    instances whose label differs, and a redundancy term sums all pairwise
    absolute differences among the drawn instances. The returned score is the
    normalized relevance minus the normalized redundancy.

    Parameters
    ----------
    X : numpy array
        Input dataset of shape (n_samples, n_features).
    y : numpy array
        Target variable of shape (n_samples,).
    n_neighbors : int, default=5
        Number of randomly drawn instances per sample.
    n_bins : int, default=10
        Number of bin edges used to discretize the target in regression problems.
    problem : str
        Either "regression" or "classification". With "regression" the target
        is discretized into classes before scoring.
    normalized : bool, default=True
        Additionally divide the final scores by the number of instances.

    Returns
    -------
    importance score : np.ndarray
        Vector of feature importance scores, with shape (n_features,).
    """
    sample_count, feature_count = X.shape

    # Regression targets are turned into class labels by binning.
    if problem == "regression":
        edges = np.linspace(np.min(y), np.max(y), n_bins)
        y = np.digitize(y, edges) - 1

    relevance = np.zeros(feature_count)
    redundancy = np.zeros(feature_count)

    for row in range(sample_count):
        # One random draw per instance, shared by all features.
        picked = np.random.choice(sample_count, n_neighbors, replace=False)
        for col in range(feature_count):
            column_vals = X[picked, col]

            # Differences to drawn instances count only when labels differ.
            to_current = np.abs(column_vals - X[row, col])
            relevance[col] += np.sum(to_current * (y[picked] != y[row]))

            # All ordered pairwise differences among the drawn instances.
            among_picked = np.abs(column_vals - column_vals[:, np.newaxis])
            redundancy[col] += np.sum(among_picked)

    relevance /= (sample_count * n_neighbors)
    # NOTE(review): the divisor uses (n_samples - 1); (n_neighbors - 1) may
    # have been intended given the pairwise sum above - confirm before changing.
    redundancy /= (sample_count * n_neighbors * (sample_count - 1))

    feature_importance = relevance - redundancy
    if normalized:
        feature_importance /= X.shape[0]
    return feature_importance