kmeans/kmeans/algorithms.py

import random
import sys
from typing import List

from kmeans.clustering.cluster import Cluster
from kmeans.clustering.point import Point
from kmeans.clustering.geometry import dist


def mean_movement(clusters: List[Cluster]) -> float:
    """
    Returns the largest mean movement reported by any of the clusters.

    Every cluster is asked how far its mean moved via ``mean_moved()`` and
    the greatest of those values is returned, so a caller can compare it
    against a convergence threshold.

    @param clusters The clusters to inspect.
    @return The largest value returned by ``mean_moved()``, or 0 when
            ``clusters`` is empty or no cluster reports a movement above 0.
    """

    # Seeding the candidates with 0 keeps the result at 0 for an empty list
    # (and for movements that never exceed 0), and lets each cluster be
    # queried exactly once instead of twice.
    return max([0, *(cluster.mean_moved() for cluster in clusters)])


def k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
    """
    Runs Lloyd's Algorithm for k-means clustering.

    If no weights are added (that is, all point weights are 1) the mean is
    found using the arithmetic mean. If there are weights, the mean will be
    a weighted mean of the points.

    The algorithm alternates between assigning every point to the cluster
    with the nearest mean and recomputing each cluster's mean, until the
    largest mean movement of any cluster is no greater than `d`.

    @param points The list of points to cluster.
    @param k The number of clusters.
    @param d The threshold distance where we will consider the means converged.
    @return The list of clusters after convergence.
    @raises ValueError If `k` is larger than the number of points (raised by
            random.sample), or if a point could not be assigned to any
            cluster (for example when `k` is 0 and `points` is not empty).
    """

    # The initial means are k points sampled from the input without
    # replacement; each cluster starts with an empty list of points.
    clusters = [Cluster(seed, [], None) for seed in random.sample(points, k)]

    mean_needs_moving = True

    # Refer to the mean_movement function. It simply returns the largest
    # mean movement of all clusters. Once the movement of all clusters is
    # no greater than `d` we can consider the series converged and return.
    while mean_needs_moving:

        # Compare each point to every cluster's centroid and calculate the
        # distance to it. Assign to the nearest cluster.
        for point in points:
            best_dist = float('inf')
            best_cluster = None

            for cluster in clusters:
                # This local must not be called `d`: doing so would overwrite
                # the convergence threshold parameter used further down.
                distance = dist(point, cluster.mean)
                if distance < best_dist:
                    best_dist = distance
                    best_cluster = cluster

            if best_cluster is None:
                raise ValueError(
                    f'Failed to assign cluster to point {point}')

            # Detach the point from the cluster it belonged to in the
            # previous pass before attaching it to the nearest one.
            if point.cluster is not None:
                point.cluster.remove_point(point)

            best_cluster.add_point(point)

        for cluster in clusters:
            # Update the mean with the new points
            if cluster.points is None:
                continue

            # When all weights are 1, this total is exactly the number of
            # points in the cluster.
            total_weight = sum(p.weight for p in cluster.points)

            if total_weight == 0:
                # An empty cluster (or one whose weights sum to zero) has no
                # well-defined centroid; dividing would raise
                # ZeroDivisionError. Keep the mean where it is instead.
                # NOTE(review): this assumes update_mean with the current
                # mean records a movement of zero - confirm against Cluster.
                cluster.update_mean(cluster.mean)
                continue

            # When all weights are 1 these weighted sums divided by the
            # total weight are exactly the standard arithmetic mean.
            new_x = sum(p.x * p.weight for p in cluster.points) / total_weight
            new_y = sum(p.y * p.weight for p in cluster.points) / total_weight

            cluster.update_mean(Point(new_x, new_y))

        mean_needs_moving = mean_movement(clusters) > d

    return clusters
Repackaging k-means into it's own package. 5 years ago			`import random`
			`import sys`
			`from typing import List`

			`from kmeans.clustering.cluster import Cluster`
			`from kmeans.clustering.point import Point`
			`from kmeans.clustering.geometry import dist`


			`def mean_movement(clusters: List[Cluster]) -> float:`
			`"""`
			`Determines the maximum mean movement between two time periods.`

			`It will calculate the euclidean distance between each point and it's`
			`corresponding point in the other list (by finding the point using`
			`the point equality function built onto the Point class) and then return`
			`the distance moved.`
			`"""`

			`highest_movement = 0`

			`for cluster in clusters:`
			`if cluster.mean_moved() > highest_movement:`
			`highest_movement = cluster.mean_moved()`

			`return highest_movement`


Update 5 years ago			`def k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:`
Repackaging k-means into it's own package. 5 years ago			`"""`
Update 5 years ago			`Runs Lloyd's Algorithm for k-means clustering.`

			`If no weights are added (that is, all point weights are 1) the mean is`
			`found using the arithmetic mean. If there are weights, the mean will be`
			`a weighted mean of the points.`
Repackaging k-means into it's own package. 5 years ago
			`@param points The list of points to cluster.`
			`@param k The number of clusters.`
			`@param d The threshold distance where we will consider the means converged.`
			`"""`

			`# Our means are initialized to points in the set randomly but`
			`# are kept separately from the points themselves.`
			`means = random.sample(points, k)`

			`clusters = []`

			`for i, mean in enumerate(means):`
			`clusters.append(Cluster(mean, [], None))`

			`mean_needs_moving = True`

			`# Refer to the mean_movement function. It simply returns the largest`
			`# mean movement of all clusters. Once the movement of all clusters is`
			# lower than `d` we can consider the series converged and return.
			`while mean_needs_moving:`

			`# Compare each point to every cluster's centroid and calculate the`
			`# distance to it. Assign to the nearest cluster.`
			`for point in points:`
			`best_dist = sys.maxsize`
			`best_cluster = None`

			`for cluster in clusters:`
			`d = dist(point, cluster.mean)`
			`if d < best_dist:`
			`best_dist = d`
			`best_cluster = cluster`

			`if best_cluster is None:`
			`raise ValueError(`
			`f'Failed to assign cluster to point {point}')`

			`if point.cluster is not None:`
			`point.cluster.remove_point(point)`

			`best_cluster.add_point(point)`

			`for cluster in clusters:`
			`# Update the mean with the new points`
			`if cluster.points is not None:`
Update 5 years ago			`# When all weights are 1 the sum of these lists will be`
			`# the exact same thing as the standard arithmetic mean.`
			`xs = [p.x * p.weight for p in cluster.points]`
			`ys = [p.y * p.weight for p in cluster.points]`

			`# When all weights are 1, the sum of this list is`
			`# exactly the length of the list.`
			`weights = [p.weight for p in cluster.points]`
Repackaging k-means into it's own package. 5 years ago
			`# Averaging the xs and ys will give us the mean point`
			`# of our cluster.`
Update 5 years ago			`new_x = sum(xs) / sum(weights)`
			`new_y = sum(ys) / sum(weights)`
Repackaging k-means into it's own package. 5 years ago
			`new_mean = Point(new_x, new_y)`

			`cluster.update_mean(new_mean)`

			`mean_needs_moving = mean_movement(clusters) > d`

			`return clusters`