#!/usr/bin/env python
# Created by "Thieu" at 08:38, 25/05/2023 ----------%
# Email: nguyenthieu2102@gmail.com %
# Github: https://github.com/thieu1995 %
# --------------------------------------------------%
"""
Refs:
1. https://docs.scipy.org/doc/scipy/reference/stats.html#correlation-functions
2. https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
"""
import numpy as np
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import r_regression, f_regression, mutual_info_regression
from scipy import stats
def chi2_func(X, y):
    """
    Score every feature with scikit-learn's ``chi2`` test.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded unchanged to ``sklearn.feature_selection.chi2``.
    y : array-like
        Target labels, forwarded unchanged to ``sklearn.feature_selection.chi2``.

    Returns
    -------
    scores
        The first element of the pair returned by ``chi2`` (the per-feature
        statistics); the second element is discarded.
    """
    statistics, _ = chi2(X, y)
    return statistics
def f_classification_func(X, y):
    """
    Score every feature with scikit-learn's ``f_classif`` (ANOVA F-test).

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded unchanged to ``sklearn.feature_selection.f_classif``.
    y : array-like
        Target labels, forwarded unchanged to ``sklearn.feature_selection.f_classif``.

    Returns
    -------
    scores
        The first element of the pair returned by ``f_classif`` (the per-feature
        statistics); the second element is discarded.
    """
    statistics, _ = f_classif(X, y)
    return statistics
def f_regression_func(X, y, center=True, force_finite=True):
    """
    Score every feature with scikit-learn's ``f_regression`` test.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded unchanged to ``sklearn.feature_selection.f_regression``.
    y : array-like
        Target values, forwarded unchanged to ``sklearn.feature_selection.f_regression``.
    center : bool, default=True
        Passed through as the ``center`` keyword of ``f_regression``.
    force_finite : bool, default=True
        Passed through as the ``force_finite`` keyword of ``f_regression``.

    Returns
    -------
    scores
        The first element of the pair returned by ``f_regression`` (the
        per-feature statistics); the second element is discarded.
    """
    statistics, _ = f_regression(X, y, center=center, force_finite=force_finite)
    return statistics
def kendall_func(X, y):
    """
    Compute Kendall's tau between every column of X and the target y.

    Parameters
    ----------
    X : numpy array
        Two-dimensional input; each column is correlated with ``y`` separately.
    y : numpy array
        Target variable, passed as the second argument of ``scipy.stats.kendalltau``.

    Returns
    -------
    scores : np.ndarray
        One ``correlation`` value per column of X, in column order.
    """
    taus = []
    # Iterating over the transpose walks the columns of X one by one.
    for column in X.T:
        outcome = stats.kendalltau(column, y)
        taus.append(outcome.correlation)
    return np.array(taus)
def spearman_func(X, y):
    """
    Compute Spearman's rank correlation between every column of X and the target y.

    Parameters
    ----------
    X : numpy array
        Two-dimensional input; each column is correlated with ``y`` separately.
    y : numpy array
        Target variable, passed as the second argument of ``scipy.stats.spearmanr``.

    Returns
    -------
    scores : np.ndarray
        One ``correlation`` value per column of X, in column order.
    """
    rhos = []
    # Iterating over the transpose walks the columns of X one by one.
    for column in X.T:
        outcome = stats.spearmanr(column, y)
        rhos.append(outcome.correlation)
    return np.array(rhos)
def point_func(X, y):
    """
    Compute the point-biserial correlation between every column of X and the target y.

    Parameters
    ----------
    X : numpy array
        Two-dimensional input; each column is passed as the first argument of
        ``scipy.stats.pointbiserialr``.
    y : numpy array
        Target variable, passed as the second argument of ``scipy.stats.pointbiserialr``.

    Returns
    -------
    scores : np.ndarray
        One ``correlation`` value per column of X, in column order.
    """
    coefficients = []
    # Iterating over the transpose walks the columns of X one by one.
    for column in X.T:
        outcome = stats.pointbiserialr(column, y)
        coefficients.append(outcome.correlation)
    return np.array(coefficients)
def select_bests(importance_scores=None, n_features=3):
    """
    Select features according to the k highest scores or a fraction of the highest scores.

    Parameters
    ----------
    importance_scores : array-like of shape (n_features,)
        Scores of features. Any array-like (list, tuple, ndarray) is accepted;
        it is converted with ``np.asarray``.
    n_features : int, float. default=3
        Number of selected features.

        - If `float` (Python or NumPy float), it must lie strictly in (0, 1) and is
          read as a fraction of the N available features;
          ``int(n_features * N) + 1`` features are kept.
        - If `int` (Python or NumPy integer), it must lie in [1, N], where N is
          the total number of scores. ``bool`` values are rejected.

    Returns
    -------
    mask : np.ndarray of bool, same shape as ``importance_scores``
        ``True`` at the positions of the selected (highest-scoring) features.

    Raises
    ------
    ValueError
        If ``n_features`` is neither an int nor a float, if a float lies outside
        (0, 1), or if the resulting count lies outside [1, N].
    """
    scores = np.asarray(importance_scores)
    max_features = len(scores)

    # bool is a subclass of int, but a flag is never a meaningful feature count.
    if isinstance(n_features, bool):
        raise ValueError("n_features should be int if based on k highest scores, or float if based on percentile of highest scores.")

    # Floats are tested first: np.float64 is also a subclass of Python float.
    if isinstance(n_features, (float, np.floating)):
        if not 0 < n_features < 1:
            raise ValueError("n_features based on percentile should has value in range (0, 1).")
        n_features = int(n_features * max_features) + 1
    elif isinstance(n_features, (int, np.integer)):
        n_features = int(n_features)
    else:
        raise ValueError("n_features should be int if based on k highest scores, or float if based on percentile of highest scores.")

    if n_features < 1 or n_features > max_features:
        raise ValueError(f"n_features should has value in range [1, {max_features}].")

    mask = np.zeros(scores.shape, dtype=bool)
    # Stable sort keeps a deterministic order among tied scores; the last
    # n_features entries of the ascending order are the highest scores.
    mask[np.argsort(scores, kind="mergesort")[-n_features:]] = True
    return mask
def relief_func(X, y, n_neighbors=5, n_bins=10, problem="classification", normalized=True, **kwargs):
    """
    Performs Relief-style feature scoring on the input dataset X and target variable y.

    For every instance, the ``n_neighbors`` closest instances (Euclidean distance)
    that carry a *different* label are collected, and each feature's score is
    increased by the summed absolute difference to those neighbors divided by
    ``n_neighbors``. Only different-label neighbors are used by this function.

    Parameters
    ----------
    X : array-like
        Input dataset of shape (n_samples, n_features). Converted to a float array.
    y : array-like
        Target variable of shape (n_samples,).
    n_neighbors : int, default=5
        Number of different-label neighbors used per instance. When fewer exist,
        the available ones are summed but the divisor stays ``n_neighbors``.
    n_bins : int, default=10
        Number of edges used for discretizing the target variable in regression
        problems (``np.linspace(min, max, n_bins)``). The last resulting class
        holds only the samples equal to ``max(y)``.
    problem : str
        The problem of dataset, either regression or classification.
        If `regression`, the target is discretized before scoring; any other
        value leaves ``y`` untouched.
    normalized : bool, default=True
        Divide the feature importance scores by the number of instances.
    **kwargs
        Accepted for signature compatibility and ignored.

    Returns
    -------
    importance score : np.ndarray
        Vector of feature importance scores, with shape (n_features,).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    n_samples, n_feats = X.shape[0], X.shape[1]

    # Regression problem: discretize the target variable into classes
    if problem == "regression":
        y_bins = np.linspace(np.min(y), np.max(y), n_bins)
        y = np.digitize(y, y_bins) - 1

    importance_scores = np.zeros(n_feats)
    for i in range(n_samples):
        # Distances from instance i only: avoids materialising the full
        # (n_samples, n_samples, n_features) difference tensor.
        distances = np.sqrt(np.sum((X - X[i]) ** 2, axis=1))
        order = np.argsort(distances)
        # Closest instances whose label differs from the current one.
        misses = order[y[order] != y[i]][:n_neighbors]
        importance_scores += np.abs(X[misses] - X[i]).sum(axis=0) / n_neighbors

    # Normalize feature importance scores by the number of instances in the dataset
    if normalized:
        importance_scores /= n_samples
    return importance_scores
def relief_f_func(X, y, n_neighbors=5, n_bins=10, problem="classification", normalized=True, **kwargs):
    """
    Performs Relief-F feature scoring on the input dataset X and target variable y.

    For every instance, up to ``n_neighbors`` nearest same-label instances (hits)
    and up to ``n_neighbors`` nearest different-label instances (misses) are
    selected by Euclidean distance. Each feature's per-class score is increased
    by the summed absolute difference to the misses and decreased by the summed
    absolute difference to the hits, both divided by ``n_neighbors``. Per-class
    scores are then combined with the class frequencies as weights.

    Parameters
    ----------
    X : array-like
        Input dataset of shape (n_samples, n_features). Converted to a float array.
    y : array-like
        Target variable of shape (n_samples,). Labels may be of any type that
        ``np.unique`` can sort; they do not have to be 0-based integers.
    n_neighbors : int, default=5
        Maximum number of hits and of misses used per instance. When fewer
        exist, the available ones are summed but the divisor stays ``n_neighbors``.
    n_bins : int, default=10
        Number of edges used for discretizing the target variable in regression
        problems (``np.linspace(min, max, n_bins)``). Empty bins are allowed.
    problem : str
        The problem of dataset, either regression or classification.
        If `regression`, the target is discretized before scoring; any other
        value leaves ``y`` untouched.
    normalized : bool, default=True
        Divide the feature importance scores by the number of instances.
    **kwargs
        Accepted for signature compatibility and ignored.

    Returns
    -------
    importance score : np.ndarray
        Vector of feature importance scores, with shape (n_features,).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    n_samples, n_features = X.shape[0], X.shape[1]

    # Regression problem: discretize the target variable into classes
    if problem == "regression":
        y_bins = np.linspace(np.min(y), np.max(y), n_bins)
        y = np.digitize(y, y_bins) - 1

    # Map labels onto 0..n_classes-1 so they can index the score matrix.
    # Raw labels cannot be used directly: they may be non-contiguous (empty
    # regression bins), not start at zero, or not be integers at all.
    classes, class_idx = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    importance_scores = np.zeros((n_features, n_classes))

    for i in range(n_samples):
        # Distances from instance i only: avoids materialising the full
        # (n_samples, n_samples, n_features) difference tensor.
        distances = np.sqrt(np.sum((X - X[i]) ** 2, axis=1))
        order = np.argsort(distances)
        # An instance is not its own neighbor; otherwise it would occupy a
        # hit slot while contributing a zero difference.
        order = order[order != i]
        same_label = class_idx[order] == class_idx[i]
        # Cap hits and misses independently at n_neighbors.
        hits = order[same_label][:n_neighbors]
        misses = order[~same_label][:n_neighbors]
        miss_term = np.abs(X[misses] - X[i]).sum(axis=0)
        hit_term = np.abs(X[hits] - X[i]).sum(axis=0)
        importance_scores[:, class_idx[i]] += (miss_term - hit_term) / n_neighbors

    # Combine per-class scores using the frequency of each class as weight
    class_freq = np.bincount(class_idx, minlength=n_classes) / n_samples
    importance_scores_weighted = np.dot(importance_scores, class_freq)

    # Normalize feature importance scores by the number of instances in the dataset
    if normalized:
        importance_scores_weighted /= n_samples
    return importance_scores_weighted
def vls_relief_f_func(X, y, n_neighbors=5, n_bins=10, problem="classification", normalized=True, seed=None, **kwargs):
    """
    Performs a randomized ("Very Large Scale") ReliefF-style scoring on X and y.

    For every instance, ``n_neighbors`` instances are drawn at random without
    replacement from the whole dataset (the instance itself may be drawn).
    A relevance score accumulates the absolute feature differences to the drawn
    instances that carry a different label; a redundancy score accumulates the
    absolute pairwise feature differences among the drawn instances. The
    returned importance is relevance minus redundancy.

    Parameters
    ----------
    X : array-like
        Input dataset of shape (n_samples, n_features). Converted to a float array.
    y : array-like
        Target variable of shape (n_samples,).
    n_neighbors : int, default=5
        Number of instances drawn per sample. Must not exceed n_samples,
        because sampling is done without replacement.
    n_bins : int, default=10
        Number of edges used for discretizing the target variable in regression
        problems (``np.linspace(min, max, n_bins)``).
    problem : str
        The problem of dataset, either regression or classification.
        If `regression`, the target is discretized before scoring; any other
        value leaves ``y`` untouched.
    normalized : bool, default=True
        Divide the feature importance scores by the number of instances.
    seed : int or None, default=None
        If given, neighbors are drawn from a private ``np.random.RandomState(seed)``
        so results are reproducible. If None, the global ``np.random`` state is used.
    **kwargs
        Accepted for signature compatibility and ignored.

    Returns
    -------
    importance score : np.ndarray
        Vector of feature importance scores, with shape (n_features,).

    Raises
    ------
    ValueError
        Raised by the sampler when ``n_neighbors`` is larger than n_samples.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    n_samples, n_features = X.shape

    # Regression problem: discretize the target variable into classes
    if problem == "regression":
        y_bins = np.linspace(np.min(y), np.max(y), n_bins)
        y = np.digitize(y, y_bins) - 1

    # Both the np.random module and a RandomState instance expose ``choice``.
    rng = np.random if seed is None else np.random.RandomState(seed)

    relevance_scores = np.zeros(n_features)
    redundancy_scores = np.zeros(n_features)
    for i in range(n_samples):
        # Randomly select k neighbors
        neighbors = rng.choice(n_samples, n_neighbors, replace=False)
        neighbor_values = X[neighbors]
        # Relevance: differences to drawn instances with a different label.
        is_miss = (y[neighbors] != y[i])[:, np.newaxis]
        relevance_scores += (np.abs(neighbor_values - X[i]) * is_miss).sum(axis=0)
        # Redundancy: all ordered pairwise differences among the drawn instances.
        pairwise = np.abs(neighbor_values[:, np.newaxis, :] - neighbor_values[np.newaxis, :, :])
        redundancy_scores += pairwise.sum(axis=(0, 1))

    # Normalize the scores
    relevance_scores /= (n_samples * n_neighbors)
    # NOTE(review): the pairwise sum has n_neighbors * (n_neighbors - 1) off-diagonal
    # terms per sample, yet the divisor uses (n_samples - 1); kept as-is to
    # preserve existing scores - confirm the intended normalisation.
    redundancy_scores /= (n_samples * n_neighbors * (n_samples - 1))

    # Compute the feature importance by subtracting redundancy from relevance
    feature_importance = relevance_scores - redundancy_scores

    # Normalize feature importance scores by the number of instances in the dataset
    if normalized:
        feature_importance /= n_samples
    return feature_importance