You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
2.9 KiB
93 lines
2.9 KiB
5 years ago
|
import random
|
||
|
import sys
|
||
|
from typing import List
|
||
|
|
||
|
from kmeans.clustering.cluster import Cluster
|
||
|
from kmeans.clustering.point import Point
|
||
|
from kmeans.clustering.geometry import dist
|
||
|
|
||
|
|
||
|
def mean_movement(clusters: List[Cluster]) -> float:
    """
    Determines the largest mean movement among the given clusters.

    Every cluster is asked how far its mean moved via `mean_moved()` and
    the greatest of those values is returned. The result is 0 when
    `clusters` is empty or when no cluster reports a movement greater
    than 0.

    @param clusters The clusters whose mean movement should be inspected.
    @return The largest value reported by any cluster's `mean_moved()`,
            never less than 0.
    """
    highest_movement = 0

    for cluster in clusters:
        # Query the cluster once and reuse the value for both the
        # comparison and the assignment.
        movement = cluster.mean_moved()
        if movement > highest_movement:
            highest_movement = movement

    return highest_movement
|
||
|
|
||
|
|
||
|
def unweighted_k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
    """
    Runs Lloyd's Algorithm for k-means clustering without weights.

    Each iteration assigns every point to the cluster whose mean is
    nearest (according to `dist`) and then moves every cluster's mean to
    the average x/y of the points assigned to it. Iteration stops once the
    largest movement reported by `mean_movement` is no longer greater
    than `d`.

    @param points The list of points to cluster.
    @param k The number of clusters.
    @param d The threshold distance where we will consider the means converged.
    @return The list of clusters, one per initial mean.
    @raises ValueError If a point cannot be assigned to any cluster (for
            example when `k` is 0 and there are points to assign), or, from
            `random.sample`, when `k` is negative or larger than
            `len(points)`.
    """

    # Each cluster starts with a randomly sampled point as its mean and
    # an empty list of assigned points.
    clusters = [Cluster(mean, [], None) for mean in random.sample(points, k)]

    mean_needs_moving = True

    # Refer to the mean_movement function. It simply returns the largest
    # mean movement of all clusters. Once the movement of all clusters is
    # no longer greater than `d` we consider the series converged and return.
    while mean_needs_moving:

        # Compare each point to every cluster's mean and calculate the
        # distance to it. Assign to the nearest cluster.
        for point in points:
            best_dist = float('inf')
            best_cluster = None

            for cluster in clusters:
                # Deliberately not named `d`: rebinding `d` here would
                # overwrite the convergence threshold parameter used in
                # the check at the bottom of the while loop.
                point_dist = dist(point, cluster.mean)
                if point_dist < best_dist:
                    best_dist = point_dist
                    best_cluster = cluster

            if best_cluster is None:
                raise ValueError(
                    f'Failed to assign cluster to point {point}')

            # Detach the point from whichever cluster held it during the
            # previous iteration before attaching it to the nearest one.
            if point.cluster is not None:
                point.cluster.remove_point(point)

            best_cluster.add_point(point)

        for cluster in clusters:
            if cluster.points is None:
                continue

            xs = [p.x for p in cluster.points]
            ys = [p.y for p in cluster.points]

            if not xs:
                # A cluster can end up without points (e.g. when two
                # initial means coincide and the earlier cluster wins
                # every tie). Averaging would divide by zero, so its mean
                # is left where it is.
                # NOTE(review): confirm Cluster.mean_moved() does not keep
                # reporting a stale movement for a cluster that is skipped
                # here, otherwise convergence could be delayed.
                continue

            # Averaging the xs and ys will give us the mean point
            # of our cluster.
            new_mean = Point(sum(xs) / len(xs), sum(ys) / len(ys))

            cluster.update_mean(new_mean)

        mean_needs_moving = mean_movement(clusters) > d

    return clusters
|