The package provides a suite of functions for computing various distance metrics between pairs of groups within a list of data frames. Each data frame holds the observations of one group (for example, one species), measured on several numeric variables. In addition to the Mahalanobis distance, which is a dissimilarity measure based on the covariance matrix and useful for statistical matching or data merging, the package includes:
Mahalanobis distance: A measure of the dissimilarity between two random vectors drawn from the same probability distribution, computed with respect to their common covariance matrix.
Euclidean Distance: A direct geometric measure between two points in a multidimensional space, defined as the square root of the sum of the squares of the differences between corresponding coordinates of the points.
Manhattan Distance: Also known as taxicab distance, it computes the sum of the absolute differences between the coordinates of the points, representing the path a taxi would take in a grid-like road system.
Chebyshev Distance: Defined as the maximum absolute difference between the coordinates of the points, this metric is useful when the largest single coordinate difference dominates the overall effect.
These metrics are fundamental in various fields, such as cluster analysis, classification, and other applications of machine learning and data mining, where assessing similarity or dissimilarity between data is crucial. The package is designed to be flexible and easily integrated into data analysis workflows, providing reliable tools for evaluating distances in multidimensional contexts.
library(cmahalanobis)

# Load iris dataset
data(iris)

# Split the observations by species, dropping the Species column
# (column 5) first so each part contains only the numeric measurements
species_parts <- split(iris[, -5], iris$Species)
setosa <- species_parts[["setosa"]]
versicolor <- species_parts[["versicolor"]]
virginica <- species_parts[["virginica"]]

# Create a list with the three groups of flowers
groups <- list(setosa, versicolor, virginica)
cmahalanobis(groups, plot = TRUE, p.value = TRUE)
# NOTE(review): four result sets follow; matching the metric order described in
# the package text above, they are presumably Mahalanobis, Euclidean, Manhattan,
# and Chebyshev — confirm against the package documentation.
# --- result set 1 (presumably Mahalanobis; note the matrix is asymmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.0000 335.19989 727.42056
#> [2,] 107.1736 0.00000 26.71618
#> [3,] 171.7689 16.88654 0.00000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 2.748687e-71 4.023276e-156
#> [2,] 2.915001e-22 NA 2.268568e-05
#> [3,] 4.363119e-36 2.033555e-03 NA
# --- result set 2 (presumably Euclidean; symmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.000000 3.208281 4.754507
#> [2,] 3.208281 0.000000 1.620489
#> [3,] 4.754507 1.620489 0.000000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 0.6 0
#> [2,] 0.9 NA 1
#> [3,] 0.0 1.0 NA
# --- result set 3 (presumably Manhattan; symmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.000 5.466 7.906
#> [2,] 5.466 0.000 2.848
#> [3,] 7.906 2.848 0.000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 0.0 0.0
#> [2,] 0 NA 0.8
#> [3,] 0 0.9 NA
# --- result set 4 (presumably Chebyshev; symmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.000 2.798 4.090
#> [2,] 2.798 0.000 1.292
#> [3,] 4.090 1.292 0.000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 0.98 0.42
#> [2,] 0.96 NA 1.00
#> [3,] 0.44 1.00 NA
# Split the data into 2 parts for each type of transmission
# (column 9 of mtcars is `am`: 0 = automatic, 1 = manual); the `am`
# column is removed before splitting so each part holds only the
# remaining numeric variables
transmission_parts <- split(mtcars[, -9], mtcars$am)
auto <- transmission_parts[["0"]]
manual <- transmission_parts[["1"]]

# Create a list with the two groups of cars
groups <- list(auto, manual)
cmahalanobis(groups, plot = TRUE, p.value = TRUE)
# NOTE(review): four result sets follow; matching the metric order described in
# the package text above, they are presumably Mahalanobis, Euclidean, Manhattan,
# and Chebyshev — confirm against the package documentation.
# --- result set 1 (presumably Mahalanobis; asymmetric) ---
#> $distances
#> [,1] [,2]
#> [1,] 0.0000 156.1163
#> [2,] 735.5919 0.0000
#>
#> $p_values
#> [,1] [,2]
#> [1,] NA 2.050145e-28
#> [2,] 1.429549e-151 NA
# --- result set 2 (presumably Euclidean; symmetric) ---
#> $distances
#> [,1] [,2]
#> [1,] 0.0000 150.8032
#> [2,] 150.8032 0.0000
#>
#> $p_values
#> [,1] [,2]
#> [1,] NA 1
#> [2,] 0 NA
# --- result set 3 (presumably Manhattan; symmetric) ---
#> $distances
#> [,1] [,2]
#> [1,] 0.0000 193.8557
#> [2,] 193.8557 0.0000
#>
#> $p_values
#> [,1] [,2]
#> [1,] NA 1
#> [2,] 0 NA
# --- result set 4 (presumably Chebyshev; symmetric) ---
#> $distances
#> [,1] [,2]
#> [1,] 0.0000 146.8482
#> [2,] 146.8482 0.0000
#>
#> $p_values
#> [,1] [,2]
#> [1,] NA 1
#> [2,] 0 NA
# Load cmahalanobis package
library(cmahalanobis)

# Dimensions of each simulated group
num_observations <- 100
num_variables <- 5

# Generate three groups of simulated standard-normal data.
# The seed is set once before the three draws, so the random
# numbers are identical to drawing them in three separate calls.
set.seed(123) # For the reproducibility of results
simulate_group <- function() {
  values <- rnorm(num_observations * num_variables)
  as.data.frame(matrix(values, nrow = num_observations))
}
group1 <- simulate_group()
group2 <- simulate_group()
group3 <- simulate_group()

# Create a list of three groups of data
groups <- list(group1, group2, group3)

# Calculate Mahalanobis distance with cmahalanobis function
cmahalanobis(groups, plot = TRUE, p.value = TRUE)
# NOTE(review): four result sets follow; matching the metric order described in
# the package text above, they are presumably Mahalanobis, Euclidean, Manhattan,
# and Chebyshev — confirm against the package documentation.
# --- result set 1 (presumably Mahalanobis; asymmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.000000 5.639257 5.567479
#> [2,] 4.722923 0.000000 5.029954
#> [3,] 5.329901 5.783087 0.000000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 0.3429174 0.3506032
#> [2,] 0.4506217 NA 0.4122355
#> [3,] 0.3769584 0.3279009 NA
# --- result set 2 (presumably Euclidean; symmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.0000000 0.2282174 0.156693
#> [2,] 0.2282174 0.0000000 0.302220
#> [3,] 0.1566930 0.3022200 0.000000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 0.32105263 0.65789474
#> [2,] 0.3789474 NA 0.06315789
#> [3,] 0.5736842 0.02105263 NA
# --- result set 3 (presumably Manhattan; symmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.0000000 0.4442511 0.2777603
#> [2,] 0.4442511 0.0000000 0.6671049
#> [3,] 0.2777603 0.6671049 0.0000000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 0 0.3
#> [2,] 0.0 NA 0.0
#> [3,] 0.1 0 NA
# --- result set 4 (presumably Chebyshev; symmetric) ---
#> $distances
#> [,1] [,2] [,3]
#> [1,] 0.0000000 0.1327059 0.1230044
#> [2,] 0.1327059 0.0000000 0.1622405
#> [3,] 0.1230044 0.1622405 0.0000000
#>
#> $p_values
#> [,1] [,2] [,3]
#> [1,] NA 0.94 0.88
#> [2,] 0.92 NA 0.66
#> [3,] 0.80 0.54 NA