|
|
|
import random
|
|
|
|
import sys
|
|
|
|
from typing import List
|
|
|
|
|
|
|
|
from kmeans.clustering.cluster import Cluster
|
|
|
|
from kmeans.clustering.point import Point
|
|
|
|
from kmeans.clustering.geometry import dist
|
|
|
|
|
|
|
|
|
|
|
|
def mean_movement(clusters: List[Cluster]) -> float:
    """
    Determines the maximum mean movement across a list of clusters.

    Every cluster is asked how far its mean moved (via `mean_moved()`)
    and the largest reported value is returned. Nothing is computed
    point by point here; the distance itself comes from the cluster.

    @param clusters The clusters to inspect.
    @return The largest value reported by any cluster's `mean_moved()`,
            or 0 when `clusters` is empty or no cluster reports a value
            greater than 0.
    """

    # `mean_moved()` is called exactly once per cluster. Seeding the
    # candidates with 0 keeps the result defined for an empty list and
    # preserves the original floor of 0.
    return max([0, *(cluster.mean_moved() for cluster in clusters)])
|
|
|
|
|
|
|
|
|
|
|
|
def k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
    """
    Runs Lloyd's Algorithm for k-means clustering.

    If no weights are added (that is, all point weights are 1) the mean is
    found using the arithmetic mean. If there are weights, the mean will be
    a weighted mean of the points.

    Each pass assigns every point to the cluster whose mean is nearest
    (moving it out of `point.cluster` first when that is set) and then
    recomputes each cluster's mean. The loop stops once the largest mean
    movement reported by `mean_movement` is no greater than `d`.

    @param points The list of points to cluster.
    @param k The number of clusters.
    @param d The threshold distance where we will consider the means converged.
    @return The list of clusters built from the `k` initial means.
    @raises ValueError If a point cannot be assigned to any cluster (for
                       example when `k` is 0 and `points` is not empty),
                       or when `random.sample` rejects `k`.
    """

    # The initial means are `k` entries of `points` picked at random
    # without replacement.
    means = random.sample(points, k)
    clusters = [Cluster(mean, [], None) for mean in means]

    mean_needs_moving = True

    # Refer to the mean_movement function. It simply returns the largest
    # mean movement of all clusters. Once the movement of all clusters is
    # lower than `d` we can consider the series converged and return.
    while mean_needs_moving:

        # Compare each point to every cluster's centroid and calculate the
        # distance to it. Assign to the nearest cluster.
        for point in points:
            best_dist = float("inf")
            best_cluster = None

            for cluster in clusters:
                # This must not be named `d`: doing so would overwrite the
                # convergence threshold used at the bottom of the loop.
                candidate_dist = dist(point, cluster.mean)
                if candidate_dist < best_dist:
                    best_dist = candidate_dist
                    best_cluster = cluster

            if best_cluster is None:
                raise ValueError(
                    f'Failed to assign cluster to point {point}')

            if point.cluster is not None:
                point.cluster.remove_point(point)

            best_cluster.add_point(point)

        for cluster in clusters:
            # Update the mean with the new points
            members = cluster.points
            if members is None:
                continue

            # When all weights are 1, this sum is exactly the number of
            # points in the cluster.
            total_weight = sum(p.weight for p in members)

            if total_weight == 0:
                # An empty cluster (or one whose weights sum to zero) has
                # no defined weighted mean; dividing would raise
                # ZeroDivisionError. Keep the current mean instead.
                # NOTE(review): assumes update_mean() measures movement
                # from the previous mean, so re-applying the current mean
                # counts as zero movement - confirm against Cluster.
                cluster.update_mean(cluster.mean)
                continue

            # When all weights are 1 these weighted sums divided by the
            # total weight are the standard arithmetic mean.
            new_x = sum(p.x * p.weight for p in members) / total_weight
            new_y = sum(p.y * p.weight for p in members) / total_weight

            cluster.update_mean(Point(new_x, new_y))

        mean_needs_moving = mean_movement(clusters) > d

    return clusters
|