Repackaging k-means into its own package.

6 years ago · 7ff69a83af
9 changed files with 378 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,125 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
--- a/README.md
+++ b/README.md
@ -0,0 +1,3 @@
+# K-Means
+
+Implementations of Unweighted and Weighted K-Means.
--- a/kmeans/__init__.py
+++ b/kmeans/__init__.py
--- a/kmeans/algorithms.py
+++ b/kmeans/algorithms.py
@ -0,0 +1,92 @@
+import random
+import sys
+from typing import List
+
+from kmeans.clustering.cluster import Cluster
+from kmeans.clustering.point import Point
+from kmeans.clustering.geometry import dist
+
+
+def mean_movement(clusters: List[Cluster]) -> float:
+    """
+    Determines the largest mean movement among the given clusters.
+
+    Each cluster reports, via `mean_moved()`, the distance between its
+    current mean and its previous mean; the greatest of those distances
+    is returned.
+
+    @param clusters The clusters whose most recent mean movement is compared.
+    @return The largest movement reported, or 0 when `clusters` is empty or
+            no cluster reports a movement greater than 0.
+    """
+
+    # Seeding the candidates with 0 keeps the result defined for an empty
+    # list and floors it at 0. `mean_moved()` is evaluated exactly once per
+    # cluster.
+    return max([0] + [cluster.mean_moved() for cluster in clusters])
+
+
+def unweighted_k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
+    """
+    Runs Lloyd's Algorithm for k-means clustering without weights.
+
+    @param points The list of points to cluster.
+    @param k The number of clusters.
+    @param d The threshold distance where we will consider the means converged.
+    @return The clusters once no mean moved more than `d` in an iteration.
+    @raises ValueError If `random.sample` rejects `k` (for example when it
+                       exceeds the number of points) or if a point cannot be
+                       assigned to any cluster.
+    """
+
+    # The initial means are k entries sampled from the input without
+    # replacement. Every cluster starts with an empty point list.
+    clusters = [Cluster(mean, [], None) for mean in random.sample(points, k)]
+
+    mean_needs_moving = True
+
+    # Refer to the mean_movement function. It simply returns the largest
+    # mean movement of all clusters. Once the movement of all clusters is
+    # lower than `d` we can consider the series converged and return.
+    while mean_needs_moving:
+
+        # Compare each point to every cluster's centroid and calculate the
+        # distance to it. Assign to the nearest cluster.
+        for point in points:
+            best_dist = float('inf')
+            best_cluster = None
+
+            for cluster in clusters:
+                # Deliberately not named `d`: that would overwrite the
+                # convergence threshold passed in by the caller.
+                distance = dist(point, cluster.mean)
+                if distance < best_dist:
+                    best_dist = distance
+                    best_cluster = cluster
+
+            if best_cluster is None:
+                raise ValueError(
+                    f'Failed to assign cluster to point {point}')
+
+            # Detach the point from the cluster it was in during the
+            # previous iteration before attaching it to the nearest one.
+            if point.cluster is not None:
+                point.cluster.remove_point(point)
+
+            best_cluster.add_point(point)
+
+        for cluster in clusters:
+            if not cluster.points:
+                # A cluster that attracted no points has nothing to average.
+                # Re-submit its current mean so it records zero movement
+                # instead of dividing by zero.
+                cluster.update_mean(cluster.mean)
+                continue
+
+            # Update the mean with the new points
+            xs = [p.x for p in cluster.points]
+            ys = [p.y for p in cluster.points]
+
+            # Averaging the xs and ys will give us the mean point
+            # of our cluster.
+            new_x = sum(xs) / len(xs)
+            new_y = sum(ys) / len(ys)
+
+            cluster.update_mean(Point(new_x, new_y))
+
+        mean_needs_moving = mean_movement(clusters) > d
+
+    return clusters
--- a/kmeans/clustering/__init__.py
+++ b/kmeans/clustering/__init__.py
--- a/kmeans/clustering/cluster.py
+++ b/kmeans/clustering/cluster.py
@ -0,0 +1,72 @@
+from typing import List
+
+import numpy as np
+
+from .geometry import dist
+from .point import Point
+
+
+class Cluster:
+    """
+    Represents a cluster of points together with its current and previous
+    mean.
+    """
+
+    def __init__(self, mean: Point, points: List[Point], color: List[float]):
+        """
+        @param mean The initial mean of the cluster.
+        @param points The list holding the member points. It is stored
+                      as-is (not copied) and mutated by `add_point` and
+                      `remove_point`.
+        @param color A color value stored for callers; not interpreted here.
+        """
+        self._points = points
+        self._children = None
+        self._color = color
+        self._mean = mean
+        # No previous mean exists until `update_mean` is called.
+        self._last_mean = None
+
+    @property
+    def points(self):
+        return self._points
+
+    @property
+    def color(self):
+        return self._color
+
+    @color.setter
+    def color(self, color: List[float]):
+        self._color = color
+
+    @property
+    def mean(self):
+        return self._mean
+
+    @property
+    def last_mean(self):
+        return self._last_mean
+
+    def update_mean(self, mean: Point):
+        """
+        Installs `mean` as the current mean and remembers the one it
+        replaces as the last mean.
+        """
+        self._last_mean = self._mean
+        self._mean = mean
+
+    def mean_moved(self) -> float:
+        """
+        Returns the distance between the current mean and the last mean.
+
+        Requires `update_mean` to have been called at least once; before
+        that the last mean is None and `dist` fails on it.
+        """
+        return dist(self._mean, self._last_mean)
+
+    def add_point(self, point: Point):
+        """
+        Appends `point` to this cluster and marks this cluster as its owner.
+        """
+        self._points.append(point)
+        point.cluster = self
+
+    def remove_point(self, point: Point):
+        """
+        Removes the first member that compares equal to `point`.
+
+        @raises ValueError If no member compares equal to `point`.
+        """
+        self._points.remove(point)
+
+    def __repr__(self):
+        point_pairs = [(p.x, p.y) for p in self._points]
+        string = f'POINTS: {point_pairs}\n'
+
+        return string
+
+    def __eq__(self, other):
+        """
+        Two clusters are equal when each one's points all appear in the
+        other. Means and colors are not compared.
+        """
+        if not isinstance(other, Cluster):
+            # Let Python fall back to its default handling so that
+            # `cluster == something_else` is False rather than an error.
+            return NotImplemented
+
+        theirs = other.points
+
+        # Check both directions so that a == b implies b == a; a one-way
+        # check would make an empty cluster equal to every cluster.
+        return (all(point in theirs for point in self._points) and
+                all(point in self._points for point in theirs))
+
+    # Clusters are mutable and compare by content, so they stay unhashable
+    # (this is also what Python does implicitly when only __eq__ is defined).
+    __hash__ = None
--- a/kmeans/clustering/geometry.py
+++ b/kmeans/clustering/geometry.py
@ -0,0 +1,11 @@
+import numpy as np
+
+from .point import Point
+
+
+def dist(p: Point, q: Point) -> float:
+    """
+    Calculates the Euclidean distance between the points `p` and `q`.
+
+    @param p The first point.
+    @param q The second point.
+    @return The straight-line distance between `p` and `q`.
+    """
+    delta_x = q.x - p.x
+    delta_y = q.y - p.y
+
+    squared_length = delta_x ** 2 + delta_y ** 2
+
+    return np.sqrt(squared_length)
--- a/kmeans/clustering/point.py
+++ b/kmeans/clustering/point.py
@ -0,0 +1,56 @@
+class Point:
+    """
+    A two-dimensional point with a weight and an optional owning cluster.
+    """
+
+    def __init__(self, x: float, y: float, weight: float = 1.0):
+        """
+        @param x The x coordinate.
+        @param y The y coordinate.
+        @param weight The weight of the point. Unlike the `weight` setter,
+                      the constructor stores it without a type check.
+        """
+        self._x = x
+        self._y = y
+        # Set by whoever assigns the point to a cluster; None until then.
+        self._cluster = None
+        self._weight = weight
+
+    @property
+    def cluster(self):
+        return self._cluster
+
+    @cluster.setter
+    def cluster(self, cluster):
+        self._cluster = cluster
+
+    @property
+    def x(self):
+        return self._x
+
+    @x.setter
+    def x(self, x):
+        self._x = x
+
+    @property
+    def y(self):
+        return self._y
+
+    @y.setter
+    def y(self, y):
+        # The setter must accept the assigned value; without the parameter
+        # every `point.y = value` fails with a TypeError.
+        self._y = y
+
+    @property
+    def weight(self):
+        return self._weight
+
+    @weight.setter
+    def weight(self, weight):
+        """
+        @raises ValueError If `weight` is not a float.
+        """
+        if not isinstance(weight, float):
+            raise ValueError('Weight must be a float')
+
+        self._weight = weight
+
+    def array(self):
+        """
+        Returns array representation for use in convex hull.
+        """
+        return [self._x, self._y]
+
+    def __eq__(self, other):
+        """
+        Points are equal when their coordinates match. Weight and cluster
+        are not compared.
+        """
+        if not isinstance(other, Point):
+            # Let Python fall back to its default handling so that
+            # `point == something_else` is False rather than an error.
+            return NotImplemented
+
+        return self._x == other._x and self._y == other._y
+
+    # Points are mutable and compare by value, so they stay unhashable
+    # (this is also what Python does implicitly when only __eq__ is defined).
+    __hash__ = None
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,19 @@
+from setuptools import setup, find_packages
+
+# The README is passed to setup() as the long description. An explicit
+# encoding keeps the read independent of the platform's default encoding.
+with open('README.md', 'r', encoding='utf-8') as fh:
+    long_description = fh.read()
+
+setup(
+    name='kmeans',
+    version='1.0.0',
+    author='Taylor Bockman',
+    author_email='tbockman@xchg.sh',
+    description='Unweighted and Weighted K-Means implementation.',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    url='https://git.xchg.sh/angrygoats/kmeans',
+    classifiers=[
+    ],
+    python_requires='>=3.6',
+    # Picks up every directory that contains an __init__.py.
+    packages=find_packages(),
+)