Dimensionality reduction #8590
@@ -0,0 +1,201 @@
# Copyright (c) 2023 Diego Gasco ([email protected]), Diegomangasco on GitHub

"""
Requirements:
  - numpy version 1.21
  - scipy version 1.3.3
Notes:
  - Each column of the features matrix corresponds to a class item
"""

import logging

import numpy as np
from scipy.linalg import eigh

logging.basicConfig(level=logging.INFO, format='%(message)s')
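
# Layout sketch (added note, not part of the original file): per the docstring
# above, samples are stored column-wise. With 2 features and 3 samples the
# features matrix is 2 x 3, one column per sample:
#     features = np.array([[f1_s1, f1_s2, f1_s3],
#                          [f2_s1, f2_s2, f2_s3]])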


def column_reshape(input_array: np.ndarray) -> np.ndarray:
    """Function to reshape a row Numpy array into a column Numpy array
    >>> input_array = np.array([1, 2, 3])
    >>> column_reshape(input_array)
    array([[1],
           [2],
           [3]])
    """
    return input_array.reshape((input_array.size, 1))


def covariance_within_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Function to compute the covariance matrix inside each class.
    >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    >>> labels = np.array([0, 1, 0])
    >>> covariance_within_classes(features, labels, 2)
    array([[0.66666667, 0.66666667, 0.66666667],
           [0.66666667, 0.66666667, 0.66666667],
           [0.66666667, 0.66666667, 0.66666667]])
    """
    covariance_sum = np.nan
    for i in range(classes):
        data = features[:, labels == i]
        data_mean = data.mean(1)
        # Centralize the data of class i
        centered_data = data - column_reshape(data_mean)
        if i > 0:
            # Accumulate the scatter matrix of class i
            covariance_sum += np.dot(centered_data, centered_data.T)
        else:
            # First loop: replace the np.nan placeholder with the first scatter matrix
            covariance_sum = np.dot(centered_data, centered_data.T)

    return covariance_sum / features.shape[1]
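

# The helper below is an added sketch, not part of the original file: with a
# single class, covariance_within_classes reduces to the biased covariance of
# that class's samples, so it should match np.cov(..., bias=True). The helper
# name and the demo data are illustrative only.
def _demo_within_class_covariance() -> bool:
    """
    >>> _demo_within_class_covariance()
    True
    """
    demo_features = np.array([[1.0, 2.0, 4.0], [0.0, 2.0, 2.0]])
    demo_labels = np.array([0, 0, 0])
    within = covariance_within_classes(demo_features, demo_labels, 1)
    return bool(np.allclose(within, np.cov(demo_features, bias=True)))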


# Review note from the PR thread: pytest discovery was reported as not
# finding/running these doctests; see the GitHub Actions output.
def covariance_between_classes(
    features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
    """Function to compute the covariance matrix between multiple classes
    >>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]])
    >>> labels = np.array([0, 1, 0])
    >>> covariance_between_classes(features, labels, 2)
    array([[ 3.55555556,  1.77777778, -2.66666667],
           [ 1.77777778,  0.88888889, -1.33333333],
           [-2.66666667, -1.33333333,  2.        ]])
    """
    general_data_mean = features.mean(1)
    covariance_sum = np.nan
    for i in range(classes):
        data = features[:, labels == i]
        # Number of items in class i
        device_data = data.shape[1]
        data_mean = data.mean(1)
        if i > 0:
            # Accumulate the weighted scatter of the class mean around the global mean
            covariance_sum += device_data * np.dot(
                column_reshape(data_mean) - column_reshape(general_data_mean),
                (column_reshape(data_mean) - column_reshape(general_data_mean)).T,
            )
        else:
            # First loop: replace the np.nan placeholder
            covariance_sum = device_data * np.dot(
                column_reshape(data_mean) - column_reshape(general_data_mean),
                (column_reshape(data_mean) - column_reshape(general_data_mean)).T,
            )

    return covariance_sum / features.shape[1]
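

# The helper below is an added sketch, not part of the original file: for any
# labelled dataset, the total (biased) covariance decomposes exactly into the
# within-class part plus the between-class part computed above. The helper
# name and the demo data are illustrative only.
def _demo_scatter_decomposition() -> bool:
    """
    >>> _demo_scatter_decomposition()
    True
    """
    features = np.array([[9.0, 2.0, 3.0], [4.0, 3.0, 6.0], [1.0, 8.0, 9.0]])
    labels = np.array([0, 1, 0])
    total = np.cov(features, bias=True)
    decomposed = covariance_within_classes(
        features, labels, 2
    ) + covariance_between_classes(features, labels, 2)
    return bool(np.allclose(total, decomposed))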


def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray:
    """
    Principal Component Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis.
    Parameters:
    * features: the features extracted from the dataset
    * dimensions: the number of dimensions to keep in the projected data
    >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    >>> dimensions = 2
    >>> principal_component_analysis(features, dimensions)
    array([[ 6.92820323,  8.66025404, 10.39230485],
           [ 3.        ,  3.        ,  3.        ]])
    """
    # Check if the features have been loaded
    if features.any():
        data_mean = features.mean(1)
        # Center the dataset
        centered_data = features - np.reshape(data_mean, (data_mean.size, 1))
        covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
        _, eigenvectors = np.linalg.eigh(covariance_matrix)
        # Take all the columns in the reverse order (-1), and then take only the
        # first columns
        filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions]
        # Project the dataset onto the new space
        projected_data = np.dot(filtered_eigenvectors.T, features)
        logging.info("Principal Component Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True)
        logging.error("Dataset empty")
        raise AssertionError
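

# The helper below is an added usage sketch, not part of the original file:
# project a tiny dataset onto its first principal component and confirm the
# output shape is (components, samples). The helper name and demo data are
# illustrative only.
def _demo_principal_component_analysis() -> bool:
    """
    >>> _demo_principal_component_analysis()
    True
    """
    demo_features = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
    projected = principal_component_analysis(demo_features, 1)
    return projected.shape == (1, demo_features.shape[1])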


def linear_discriminant_analysis(
    features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
) -> np.ndarray:
    """
    Linear Discriminant Analysis.

    For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis.
    Parameters:
    * features: the features extracted from the dataset
    * labels: the class labels of the features
    * classes: the number of classes present in the dataset
    * dimensions: the number of dimensions to keep in the projected data
    >>> features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]])
    >>> labels = np.array([0, 2, 0, 1, 1])
    >>> classes = 3
    >>> dimensions = 2
    >>> linear_discriminant_analysis(features, labels, classes, dimensions)
    array([[0.70710678, 0.70710678, 0.70710678, 0.70710678, 0.70710678],
           [3.60806823, 5.10257902, 6.59708982, 8.09160061, 9.58611141]])
    """
    # Check if the desired dimension is less than the number of classes
    assert classes > dimensions

    # Check if the features have been already loaded
    if features.any():
        _, eigenvectors = eigh(
            covariance_between_classes(features, labels, classes),
            covariance_within_classes(features, labels, classes),
        )
        filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
        svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
        filtered_svd_matrix = svd_matrix[:, 0:dimensions]
        projected_data = np.dot(filtered_svd_matrix.T, features)
        logging.info("Linear Discriminant Analysis computed")

        return projected_data
    else:
        logging.basicConfig(level=logging.ERROR, format='%(message)s', force=True)
        logging.error("Dataset empty")
        raise AssertionError
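

# The helper below is an added sketch, not part of the original file: the core
# of linear_discriminant_analysis is the generalized eigenproblem
# S_b @ v = eigenvalue * S_w @ v, solved by scipy.linalg.eigh(S_b, S_w).
# Verify that relation for the largest eigenpair on a tiny dataset; the helper
# name and demo data are illustrative only.
def _demo_generalized_eigenproblem() -> bool:
    """
    >>> _demo_generalized_eigenproblem()
    True
    """
    demo_features = np.array([[1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 1.0, 4.0, 2.0, 3.0]])
    demo_labels = np.array([0, 0, 1, 1, 1])
    s_w = covariance_within_classes(demo_features, demo_labels, 2)
    s_b = covariance_between_classes(demo_features, demo_labels, 2)
    eigenvalues, eigenvectors = eigh(s_b, s_w)
    top_vector = eigenvectors[:, -1]
    return bool(np.allclose(s_b @ top_vector, eigenvalues[-1] * (s_w @ top_vector)))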


def test_linear_discriminant_analysis() -> None:
    # Create a dummy dataset with 2 classes and 3 features; the rows are chosen
    # non-collinear so that the within-class scatter matrix is invertible
    features = np.array([[1, 2, 3, 4, 5], [5, 1, 4, 2, 3], [2, 7, 6, 3, 8]])
    labels = np.array([0, 0, 0, 1, 1])
    classes = 2
    # dimensions must be strictly less than classes (see the assert above)
    dimensions = 1

    projected_data = linear_discriminant_analysis(
        features, labels, classes, dimensions
    )

    # Assert that the shape of the projected data is correct
    assert projected_data.shape == (dimensions, features.shape[1])

    # Assert that the projected data is a numpy array
    assert isinstance(projected_data, np.ndarray)

    # Assert that the projected data is not empty
    assert projected_data.any()

    # Assert that the function raises an AssertionError if dimensions >= classes
    try:
        projected_data = linear_discriminant_analysis(features, labels, classes, 3)
    except AssertionError:
        pass
    else:
        raise AssertionError("Did not raise AssertionError for dimensions > classes")


if __name__ == "__main__":
    import doctest

    doctest.testmod()