/* kmeans/src/main.rs v1.0.0 Ver.: OK (c) 12/23 - Christos Iosifides */
/*
This program calculates k-means clustering in Rust programming language.
Name: kmeans/src/main.rs
Version: 1.0.0
Copyright: Ch Iossif @ 2023
Author: Christos Iosifidis
Date: 16/12/23 09:30
Modified: 16/12/23 23:34
Description: Calculates k-means clustering in Rust programming language.
Sample data: https://statisticsbyjim.com/basics/k-means-clustering/
Cargo.toml:
[package]
name = "kmeans"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
rand = "0.8.5"
Compile: cargo build
Run: cargo run
INPUT:
All data are in code variables.
READ THE CODE !
EXAMPLE: ( full output after code )
$ cargo run
Compiling kmeans v0.1.0 (/home/chiossif/kmeans)
Finished dev [unoptimized + debuginfo] target(s) in 0.22s
Running `target/debug/kmeans`
Iteration 1
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 1
Observation 1 is in cluster = 0
Observation 2 is in cluster = 2
Observation 3 is in cluster = 2
...
Observation 21 is in cluster = 1
Cluster 0 centroids are at [8.461538461538462, 1.4670940170940172, 2247907.991452991, 0.8418803418803419]
Cluster 1 centroids are at [105.76069518716577, 11.52994652406417, 38261114.017379686, 11.465240641711228]
Cluster 2 centroids are at [0.0, 0.0, 0.0, 0.0]
Sum of Squared Error is +1.6980898090548012e8
$
Copyright (C) Dec 2023 Ch Iossif
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
use rand::Rng;
fn main() {
const M: usize = 22; // Number of observations
const N: usize = 4; // Number of observation features
let obs: [[f64; N];M] = [ // Observations array MxN
[150.0, 15.4, 50400200.0, 18.0],
[144.0, 11.3, 42100650.0, 15.0],
[120.0, 9.9, 39440420.0, 12.0],
[110.0, 12.5, 36500520.0, 16.0],
[100.0, 9.7, 40650005.0, 10.0],
[ 99.0, 15.2, 45665230.0, 12.0],
[ 56.0, 9.2, 25978080.0, 8.0],
[120.0, 13.1, 37900800.0, 12.0],
[122.0, 12.4, 42560000.0, 13.0],
[142.0, 14.6, 48900090.0, 15.0],
[132.0, 13.4, 46500200.0, 16.0],
[ 68.0, 8.7, 26400500.0, 9.0],
[ 79.0, 12.7, 19800800.0, 7.0],
[103.0, 11.5, 32568740.0, 6.0],
[140.0, 13.9, 47635980.0, 13.0],
[130.0, 11.5, 47005600.0, 14.0],
[ 74.0, 8.6, 24652000.0, 6.0],
[ 49.0, 9.8, 14568990.0, 6.0],
[ 75.0, 11.7, 37555000.0, 8.0],
[ 46.0, 8.7, 22342600.0, 5.0],
[ 79.0, 8.9, 32465890.0, 9.0],
[ 90.0, 9.4, 34560000.0, 10.0]];
const K: usize = 3; // Number of clusters
let mut clusters:[u64;M] = [0;M]; // Clusters M
let mut cl_center:[[f64; N];K] = [[0.0;N];K]; // Cluster centroids KxN
let mut psse:f64 = std::f64::MAX; // previous Sum of Squared Error
let mut sse:f64 = 0.0; // Sum of Squared Error (absolute value instead of squared)
let mut rng = rand::thread_rng(); // cargo add rand OK !
// Initially assign randomly
for elem in clusters.iter_mut() {
*elem = rng.gen::()%(K as u64); // cargo add rand OK !
}
// Sample data result as input instead of randomly assigned
//clusters = [1, 2, 2, 2, 2, 1, 0, 2, 2, 1, 1, 0, 0, 2, 1, 1, 0, 0, 2, 0, 2, 2];
let max_iter = 128; // Maximum number of iterations
// Start looping for MaxIter
for iter in 0..max_iter {
println!("Iteration {}", iter+1);
// Calculate cluster centroids
println!("Calculating cluster centroids...");
for ci in 0..cl_center.len(){ // for each cluster
for fi in 0..cl_center[ci].len(){ // for each feature
cl_center[ci][fi] = 0.0; // reset cluster centroid
}
}
for oi in 0..obs.len(){ // for each observation
let tci = clusters[oi]; // it is assigned to tci
for fi in 0..obs[oi].len(){ // for each feature
cl_center[tci as usize][fi] = (cl_center[tci as usize][fi]*oi as f64 + obs[oi][fi]) / (oi as f64 + 1.0); // recalculate new centroid
}
}
// Calculate Sum of Squared Error
sse = 0.0;
println!("Calculating Sum of Squared Error...");
for oi in 0..obs.len(){ // for each observation
let tci = clusters[oi]; // it is assigned to tci
for fi in 0..obs[oi].len(){ // for each feature
let t = obs[oi][fi] - cl_center[tci as usize][fi]; // calculate distance
sse+=t.abs(); // add up sum
}
}
// Display current position
println!("Current clustering:");
// Display clusters
for i in 0..clusters.len() {
println!("Observation {} is in cluster = {}", i, clusters[i]);
}
// Display cluster centroids
for i in 0..cl_center.len() {
println!("Cluster {} centroids are at {:?}", i, cl_center[i]);
}
// Display Sum of Squared Error
println!("Sum of Squared Error is {:+e}", sse);
// Finishing criterion
let diff = psse-sse;
let finished = diff.abs()<1.0;
// Check if finished
if !finished {
psse = sse; // put last sse as previous
// Assign observations to clusters
println!("Assigning observations to clusters...");
for oi in 0..obs.len(){ // for each observation
let mut ts=0.0; // total sum is zero
let mut tci:u64=K as u64 + 1 ; // it is not assigned
for ci in 0..cl_center.len(){ // for each cluster
let mut s=0.0; // sum is zero
for fi in 0..obs[oi].len(){ // for each feature
let t = obs[oi][fi] - cl_center[ci][fi]; // calculate distance
s+=t.abs(); // add up sum
}
if ts>s || tci>K as u64 { // if this is smaller or the first
ts=s; // keep sum
tci=ci as u64; // keep cluster index
}
}
clusters[oi]=tci; // assign this observation to cluster tci
}
}
else{
println!();
break;
}
println!();
}
// Display final position
println!("Final clustering:");
// Display clusters
for i in 0..clusters.len() {
println!("Observation {} is in cluster = {}", i, clusters[i]);
}
// Display cluster centroids
for i in 0..cl_center.len() {
println!("Cluster {} centroids are at {:?}", i, cl_center[i]);
}
// Display Sum of Squared Error
println!("Sum of Squared Error is {:+e}", sse);
}
/*
Output :
$ cargo run
Compiling kmeans v0.1.0 (/home/chiossif/MEGA/rust/kmeans)
Finished dev [unoptimized + debuginfo] target(s) in 0.21s
Running `target/debug/kmeans`
Iteration 1
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 0
Observation 2 is in cluster = 0
Observation 3 is in cluster = 1
Observation 4 is in cluster = 0
Observation 5 is in cluster = 0
Observation 6 is in cluster = 2
Observation 7 is in cluster = 0
Observation 8 is in cluster = 0
Observation 9 is in cluster = 0
Observation 10 is in cluster = 2
Observation 11 is in cluster = 1
Observation 12 is in cluster = 1
Observation 13 is in cluster = 2
Observation 14 is in cluster = 0
Observation 15 is in cluster = 2
Observation 16 is in cluster = 1
Observation 17 is in cluster = 2
Observation 18 is in cluster = 0
Observation 19 is in cluster = 0
Observation 20 is in cluster = 2
Observation 21 is in cluster = 1
Cluster 0 centroids are at [100.704, 10.432733333333335, 35832946.40333332, 10.474]
Cluster 1 centroids are at [39.30974907445496, 4.764623611682435, 13084712.727272727, 4.937885643767996]
Cluster 2 centroids are at [126.4795973584239, 13.567174968833134, 43624358.72226886, 14.82957244947041]
Sum of Squared Error is +2.3184181646720317e8
Assigning observations to clusters...
Iteration 2
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 2
Observation 2 is in cluster = 0
Observation 3 is in cluster = 0
Observation 4 is in cluster = 2
Observation 5 is in cluster = 2
Observation 6 is in cluster = 0
Observation 7 is in cluster = 0
Observation 8 is in cluster = 2
Observation 9 is in cluster = 2
Observation 10 is in cluster = 2
Observation 11 is in cluster = 0
Observation 12 is in cluster = 1
Observation 13 is in cluster = 0
Observation 14 is in cluster = 2
Observation 15 is in cluster = 2
Observation 16 is in cluster = 0
Observation 17 is in cluster = 1
Observation 18 is in cluster = 0
Observation 19 is in cluster = 1
Observation 20 is in cluster = 0
Observation 21 is in cluster = 0
Cluster 0 centroids are at [70.1817176631418, 7.859943307466528, 24967579.76337904, 7.8094769008081695]
Cluster 1 centroids are at [10.338461538461539, 1.8287393162393166, 3252642.5918803415, 1.0497863247863246]
Cluster 2 centroids are at [131.844696969697, 13.105681818181822, 45665251.28787878, 14.520833333333332]
Sum of Squared Error is +1.4973332350234044e8
Assigning observations to clusters...
Iteration 3
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 2
Observation 2 is in cluster = 2
Observation 3 is in cluster = 2
Observation 4 is in cluster = 2
Observation 5 is in cluster = 2
Observation 6 is in cluster = 0
Observation 7 is in cluster = 2
Observation 8 is in cluster = 2
Observation 9 is in cluster = 2
Observation 10 is in cluster = 2
Observation 11 is in cluster = 0
Observation 12 is in cluster = 0
Observation 13 is in cluster = 0
Observation 14 is in cluster = 2
Observation 15 is in cluster = 2
Observation 16 is in cluster = 0
Observation 17 is in cluster = 0
Observation 18 is in cluster = 2
Observation 19 is in cluster = 0
Observation 20 is in cluster = 0
Observation 21 is in cluster = 0
Cluster 0 centroids are at [34.20815295815296, 4.706885178313751, 12501773.973922903, 3.562358276643991]
Cluster 1 centroids are at [0.0, 0.0, 0.0, 0.0]
Cluster 2 centroids are at [122.38456937799042, 12.657416267942585, 43230942.84389952, 13.537081339712921]
Sum of Squared Error is +1.7345315846109262e8
Assigning observations to clusters...
Iteration 4
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 2
Observation 2 is in cluster = 2
Observation 3 is in cluster = 2
Observation 4 is in cluster = 2
Observation 5 is in cluster = 2
Observation 6 is in cluster = 0
Observation 7 is in cluster = 2
Observation 8 is in cluster = 2
Observation 9 is in cluster = 2
Observation 10 is in cluster = 2
Observation 11 is in cluster = 0
Observation 12 is in cluster = 0
Observation 13 is in cluster = 2
Observation 14 is in cluster = 2
Observation 15 is in cluster = 2
Observation 16 is in cluster = 0
Observation 17 is in cluster = 0
Observation 18 is in cluster = 2
Observation 19 is in cluster = 0
Observation 20 is in cluster = 2
Observation 21 is in cluster = 2
Cluster 0 centroids are at [24.05662393162393, 3.7352930402930404, 8839967.534493286, 2.739255189255189]
Cluster 1 centroids are at [0.0, 0.0, 0.0, 0.0]
Cluster 2 centroids are at [117.83215528490649, 12.273314484558504, 41785617.40240321, 12.74516093953893]
Sum of Squared Error is +1.598304032416967e8
Assigning observations to clusters...
Iteration 5
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 2
Observation 2 is in cluster = 2
Observation 3 is in cluster = 2
Observation 4 is in cluster = 2
Observation 5 is in cluster = 2
Observation 6 is in cluster = 2
Observation 7 is in cluster = 2
Observation 8 is in cluster = 2
Observation 9 is in cluster = 2
Observation 10 is in cluster = 2
Observation 11 is in cluster = 2
Observation 12 is in cluster = 0
Observation 13 is in cluster = 2
Observation 14 is in cluster = 2
Observation 15 is in cluster = 2
Observation 16 is in cluster = 0
Observation 17 is in cluster = 0
Observation 18 is in cluster = 2
Observation 19 is in cluster = 0
Observation 20 is in cluster = 2
Observation 21 is in cluster = 2
Cluster 0 centroids are at [13.923290598290597, 2.231068376068376, 4473332.506410256, 1.338034188034188]
Cluster 1 centroids are at [0.0, 0.0, 0.0, 0.0]
Cluster 2 centroids are at [110.82924641148325, 11.85663875598086, 39856144.85795455, 12.119617224880384]
Sum of Squared Error is +1.7151980745729828e8
Assigning observations to clusters...
Iteration 6
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 2
Observation 2 is in cluster = 2
Observation 3 is in cluster = 2
Observation 4 is in cluster = 2
Observation 5 is in cluster = 2
Observation 6 is in cluster = 2
Observation 7 is in cluster = 2
Observation 8 is in cluster = 2
Observation 9 is in cluster = 2
Observation 10 is in cluster = 2
Observation 11 is in cluster = 2
Observation 12 is in cluster = 0
Observation 13 is in cluster = 2
Observation 14 is in cluster = 2
Observation 15 is in cluster = 2
Observation 16 is in cluster = 2
Observation 17 is in cluster = 0
Observation 18 is in cluster = 2
Observation 19 is in cluster = 2
Observation 20 is in cluster = 2
Observation 21 is in cluster = 2
Cluster 0 centroids are at [8.461538461538462, 1.4670940170940172, 2247907.991452991, 0.8418803418803419]
Cluster 1 centroids are at [0.0, 0.0, 0.0, 0.0]
Cluster 2 centroids are at [105.76069518716577, 11.52994652406417, 38261114.017379686, 11.465240641711228]
Sum of Squared Error is +1.6980898090548012e8
Assigning observations to clusters...
Iteration 7
Calculating cluster centroids...
Calculating Sum of Squared Error...
Current clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 2
Observation 2 is in cluster = 2
Observation 3 is in cluster = 2
Observation 4 is in cluster = 2
Observation 5 is in cluster = 2
Observation 6 is in cluster = 2
Observation 7 is in cluster = 2
Observation 8 is in cluster = 2
Observation 9 is in cluster = 2
Observation 10 is in cluster = 2
Observation 11 is in cluster = 2
Observation 12 is in cluster = 0
Observation 13 is in cluster = 2
Observation 14 is in cluster = 2
Observation 15 is in cluster = 2
Observation 16 is in cluster = 2
Observation 17 is in cluster = 0
Observation 18 is in cluster = 2
Observation 19 is in cluster = 2
Observation 20 is in cluster = 2
Observation 21 is in cluster = 2
Cluster 0 centroids are at [8.461538461538462, 1.4670940170940172, 2247907.991452991, 0.8418803418803419]
Cluster 1 centroids are at [0.0, 0.0, 0.0, 0.0]
Cluster 2 centroids are at [105.76069518716577, 11.52994652406417, 38261114.017379686, 11.465240641711228]
Sum of Squared Error is +1.6980898090548012e8
Final clustering:
Observation 0 is in cluster = 2
Observation 1 is in cluster = 2
Observation 2 is in cluster = 2
Observation 3 is in cluster = 2
Observation 4 is in cluster = 2
Observation 5 is in cluster = 2
Observation 6 is in cluster = 2
Observation 7 is in cluster = 2
Observation 8 is in cluster = 2
Observation 9 is in cluster = 2
Observation 10 is in cluster = 2
Observation 11 is in cluster = 2
Observation 12 is in cluster = 0
Observation 13 is in cluster = 2
Observation 14 is in cluster = 2
Observation 15 is in cluster = 2
Observation 16 is in cluster = 2
Observation 17 is in cluster = 0
Observation 18 is in cluster = 2
Observation 19 is in cluster = 2
Observation 20 is in cluster = 2
Observation 21 is in cluster = 2
Cluster 0 centroids are at [8.461538461538462, 1.4670940170940172, 2247907.991452991, 0.8418803418803419]
Cluster 1 centroids are at [0.0, 0.0, 0.0, 0.0]
Cluster 2 centroids are at [105.76069518716577, 11.52994652406417, 38261114.017379686, 11.465240641711228]
Sum of Squared Error is +1.6980898090548012e8
*/