commit 7ff69a83af76cd4920f5ab799c05b2493e2145ab Author: Taylor Bockman Date: Wed Oct 16 13:02:19 2019 -0700 Repackaging k-means into its own package. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f717486 --- /dev/null +++ b/.gitignore @@ -0,0 +1,125 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..56140ba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# K-Means
+
+Implementations of Unweighted and Weighted K-Means.
diff --git a/kmeans/__init__.py b/kmeans/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/kmeans/algorithms.py b/kmeans/algorithms.py
new file mode 100644
index 0000000..dfe89a2
--- /dev/null
+++ b/kmeans/algorithms.py
@@ -0,0 +1,91 @@
+import random
+import sys
+from typing import List
+
+from kmeans.clustering.cluster import Cluster
+from kmeans.clustering.point import Point
+from kmeans.clustering.geometry import dist
+
+
+def mean_movement(clusters: List[Cluster]) -> float:
+    """
+    Determines the largest mean movement across all clusters.
+
+    Each cluster reports the euclidean distance between its current mean
+    and its previous mean (see Cluster.mean_moved); the largest of those
+    distances is returned.
+
+    @param clusters The clusters to inspect.
+    @return The largest distance moved, or 0 if there are no clusters.
+    """
+
+    return max((cluster.mean_moved() for cluster in clusters), default=0)
+
+
+def unweighted_k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
+    """
+    Runs Lloyd's Algorithm for k-means clustering without weights.
+
+    @param points The list of points to cluster.
+    @param k The number of clusters.
+    @param d The threshold distance where we will consider the means converged.
+    @return The clusters, each holding the points assigned to it.
+    @raises ValueError If k is negative or larger than len(points), or if a
+                       point could not be assigned to any cluster.
+    """
+
+    # Our means are initialized to randomly chosen points from the set.
+    means = random.sample(points, k)
+
+    clusters = [Cluster(mean, [], None) for mean in means]
+
+    mean_needs_moving = True
+
+    # Refer to the mean_movement function. It simply returns the largest
+    # mean movement of all clusters. Once the movement of all clusters is
+    # lower than `d` we can consider the series converged and return.
+    while mean_needs_moving:
+
+        # Compare each point to every cluster's centroid and calculate the
+        # distance to it. Assign to the nearest cluster.
+        for point in points:
+            best_dist = sys.maxsize
+            best_cluster = None
+
+            for cluster in clusters:
+                # Deliberately not named `d`: that would overwrite the
+                # convergence threshold passed in by the caller.
+                candidate_dist = dist(point, cluster.mean)
+                if candidate_dist < best_dist:
+                    best_dist = candidate_dist
+                    best_cluster = cluster
+
+            if best_cluster is None:
+                raise ValueError(
+                    f'Failed to assign cluster to point {point}')
+
+            if point.cluster is not None:
+                point.cluster.remove_point(point)
+
+            best_cluster.add_point(point)
+
+        for cluster in clusters:
+            if not cluster.points:
+                # An empty cluster has no points to average. Keep its mean
+                # in place (recording zero movement) rather than dividing
+                # by zero or reporting a stale movement forever.
+                cluster.update_mean(cluster.mean)
+                continue
+
+            # Averaging the xs and ys will give us the mean point
+            # of our cluster.
+            xs = [p.x for p in cluster.points]
+            ys = [p.y for p in cluster.points]
+
+            new_mean = Point(sum(xs) / len(xs), sum(ys) / len(ys))
+
+            cluster.update_mean(new_mean)
+
+        mean_needs_moving = mean_movement(clusters) > d
+
+    return clusters
diff --git a/kmeans/clustering/__init__.py b/kmeans/clustering/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/kmeans/clustering/cluster.py b/kmeans/clustering/cluster.py
new file mode 100644
index 0000000..7a725f8
--- /dev/null
+++ b/kmeans/clustering/cluster.py
@@ -0,0 +1,105 @@
+from typing import List
+
+import numpy as np
+
+from .geometry import dist
+from .point import Point
+
+
+class Cluster:
+    """
+    Represents a cluster of points.
+
+    A cluster tracks its member points, its current mean and the mean it
+    had before the most recent update.
+    """
+
+    def __init__(self, mean: Point, points: List[Point], color: List[float]):
+        self._points = points
+        self._children = None
+        self._color = color
+        self._mean = mean
+        self._last_mean = None
+
+    @property
+    def points(self):
+        return self._points
+
+    @property
+    def color(self):
+        return self._color
+
+    @color.setter
+    def color(self, color: List[float]):
+        self._color = color
+
+    @property
+    def mean(self):
+        return self._mean
+
+    @property
+    def last_mean(self):
+        return self._last_mean
+
+    def update_mean(self, mean: Point):
+        """
+        Replaces the mean, remembering the previous one as `last_mean`.
+        """
+        self._last_mean = self._mean
+        self._mean = mean
+
+    def mean_moved(self) -> float:
+        """
+        Returns the distance between the current mean and the previous one.
+
+        A mean that has never been updated has not moved, so 0.0 is
+        returned instead of failing on the missing previous mean.
+        """
+        if self._last_mean is None:
+            return 0.0
+
+        return dist(self._mean, self._last_mean)
+
+    def add_point(self, point: Point):
+        """
+        Adds `point` to the cluster and marks this cluster as its owner.
+        """
+        self._points.append(point)
+        point.cluster = self
+
+    def remove_point(self, point: Point):
+        """
+        Removes the first point equal to `point` from the cluster.
+
+        @raises ValueError If no such point is in the cluster.
+        """
+        self._points.remove(point)
+
+    def __repr__(self):
+        point_pairs = [(p.x, p.y) for p in self._points]
+        string = f'POINTS: {point_pairs}\n'
+
+        return string
+
+    def __eq__(self, other):
+        """
+        Two clusters are equal when they hold the same points, in any order.
+        """
+        if not isinstance(other, Cluster):
+            # Let Python fall back to the other operand / identity instead
+            # of raising, so `==`, `!=` and `in` work against any object.
+            return NotImplemented
+
+        if len(self._points) != len(other.points):
+            return False
+
+        # Points are not hashable, so match them off one by one. This also
+        # keeps duplicates honest and makes the comparison symmetric.
+        unmatched = list(other.points)
+
+        for point in self._points:
+            if point not in unmatched:
+                return False
+            unmatched.remove(point)
+
+        return True
diff --git a/kmeans/clustering/geometry.py b/kmeans/clustering/geometry.py
new file mode 100644
index 0000000..70d4e48
--- /dev/null
+++ b/kmeans/clustering/geometry.py
@@ -0,0 +1,11 @@
+import numpy as np
+
+from .point import Point
+
+
+def dist(p: Point, q: Point) -> float:
+    """
+    Calculates the Euclidean distance between two points.
+    """
+    return np.sqrt(pow(q.x - p.x, 2) +
+                   pow(q.y - p.y, 2))
diff --git a/kmeans/clustering/point.py b/kmeans/clustering/point.py
new file mode 100644
index 0000000..347120e
--- /dev/null
+++ b/kmeans/clustering/point.py
@@ -0,0 +1,64 @@
+class Point:
+    """
+    A weighted 2D point that remembers which cluster it belongs to.
+    """
+
+    def __init__(self, x: float, y: float, weight: float = 1.0):
+        self._x = x
+        self._y = y
+        self._cluster = None
+        self._weight = weight
+
+    @property
+    def cluster(self):
+        return self._cluster
+
+    @cluster.setter
+    def cluster(self, cluster):
+        self._cluster = cluster
+
+    @property
+    def x(self):
+        return self._x
+
+    @x.setter
+    def x(self, x):
+        self._x = x
+
+    @property
+    def y(self):
+        return self._y
+
+    @y.setter
+    def y(self, y):
+        self._y = y
+
+    @property
+    def weight(self):
+        return self._weight
+
+    @weight.setter
+    def weight(self, weight):
+        if not isinstance(weight, float):
+            raise ValueError('Weight must be a float')
+
+        self._weight = weight
+
+    def array(self):
+        """
+        Returns array representation for use in convex hull.
+        """
+        return [self._x, self._y]
+
+    def __eq__(self, other):
+        """
+        Two points are equal when their coordinates match.
+
+        The weight and the owning cluster are not part of the comparison.
+        """
+        if not isinstance(other, Point):
+            # Let Python fall back to the other operand / identity instead
+            # of raising, so `==`, `!=` and `in` work against any object.
+            return NotImplemented
+
+        return self._x == other._x and self._y == other._y
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1f25718
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup, find_packages
+
+with open('README.md', 'r', encoding='utf-8') as fh:
+    long_description = fh.read()
+
+setup(
+    name='kmeans',
+    version='1.0.0',
+    author='Taylor Bockman',
+    author_email='tbockman@xchg.sh',
+    description='Unweighted and Weighted K-Means implementation.',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    url='https://git.xchg.sh/angrygoats/kmeans',
+    classifiers=[
+    ],
+    python_requires='>=3.6',
+    packages=find_packages(),
+    # The clustering package imports numpy at runtime.
+    install_requires=['numpy'],
+)