You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
92 lines
2.9 KiB
92 lines
2.9 KiB
import random |
|
import sys |
|
from typing import List |
|
|
|
from kmeans.clustering.cluster import Cluster |
|
from kmeans.clustering.point import Point |
|
from kmeans.clustering.geometry import dist |
|
|
|
|
|
def mean_movement(clusters: List[Cluster]) -> float:
    """
    Returns the largest mean movement reported by any of the clusters.

    Each cluster is asked exactly once, through its `mean_moved()` method,
    how far its mean moved, and the largest of those values is returned.
    The result is never below zero: an empty list, or a list whose clusters
    all report a non-positive movement, yields 0.

    @param clusters The clusters whose mean movement should be inspected.
    @return The largest value returned by `mean_moved()`, or 0 when no
            cluster reports a positive movement.
    """

    # Query every cluster a single time so the value that wins the
    # comparison is the very value that gets returned. The leading 0 keeps
    # the original lower bound and makes an empty cluster list safe for max().
    return max([0] + [cluster.mean_moved() for cluster in clusters])
|
|
|
|
|
def unweighted_k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
    """
    Runs Lloyd's Algorithm for k-means clustering without weights.

    The initial means are `k` points drawn at random from `points`. Every
    pass assigns each point to the cluster whose mean is nearest (as
    measured by `dist`) and then moves each cluster's mean to the average
    x/y of the points assigned to it. Passes repeat until the largest mean
    movement reported by `mean_movement` is no longer greater than `d`.

    @param points The list of points to cluster.
    @param k The number of clusters.
    @param d The threshold distance where we will consider the means converged.
    @return The list of clusters once the means have converged.
    @raise ValueError If a point cannot be assigned to any cluster (for
           example when `k` is 0 and `points` is not empty), or from
           `random.sample` when `k` is negative or larger than the number
           of points.
    """

    # The starting means are randomly chosen input points; each one seeds
    # a cluster that begins with no assigned points.
    clusters = [Cluster(seed, [], None) for seed in random.sample(points, k)]

    mean_needs_moving = True

    # Refer to the mean_movement function. It simply returns the largest
    # mean movement of all clusters. Once the movement of all clusters is
    # no longer greater than `d` we consider the series converged and return.
    while mean_needs_moving:

        # Compare each point to every cluster's mean and assign the point
        # to the nearest cluster.
        for point in points:
            best_dist = float("inf")
            best_cluster = None

            for cluster in clusters:
                # Deliberately NOT named `d`: rebinding `d` here would
                # overwrite the convergence threshold used further down.
                candidate_dist = dist(point, cluster.mean)
                if candidate_dist < best_dist:
                    best_dist = candidate_dist
                    best_cluster = cluster

            if best_cluster is None:
                raise ValueError(
                    f'Failed to assign cluster to point {point}')

            # Detach the point from the cluster it currently references
            # before attaching it to the nearest one.
            if point.cluster is not None:
                point.cluster.remove_point(point)

            best_cluster.add_point(point)

        for cluster in clusters:
            members = cluster.points

            if members is None:
                # Kept from the original: a cluster without a point
                # collection is left untouched.
                continue

            if not members:
                # An empty cluster has no average (dividing by len() == 0
                # would raise ZeroDivisionError), so its mean stays put.
                # NOTE(review): re-submitting the current mean assumes
                # update_mean() is what feeds mean_moved(), so that this
                # cluster reports no movement for the pass - confirm
                # against Cluster.
                cluster.update_mean(cluster.mean)
                continue

            # Averaging the xs and ys gives the mean point of the cluster.
            xs = [p.x for p in members]
            ys = [p.y for p in members]
            new_mean = Point(sum(xs) / len(xs), sum(ys) / len(ys))

            cluster.update_mean(new_mean)

        mean_needs_moving = mean_movement(clusters) > d

    return clusters
|
|
|