Browse Source

Repackaging k-means into its own package.

pull/1/head
Taylor Bockman 5 years ago
commit
7ff69a83af
  1. 125
      .gitignore
  2. 3
      README.md
  3. 0
      kmeans/__init__.py
  4. 92
      kmeans/algorithms.py
  5. 0
      kmeans/clustering/__init__.py
  6. 72
      kmeans/clustering/cluster.py
  7. 11
      kmeans/clustering/geometry.py
  8. 56
      kmeans/clustering/point.py
  9. 19
      setup.py

125
.gitignore vendored

@ -0,0 +1,125 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

3
README.md

@ -0,0 +1,3 @@
# K-Means
Implementations of Unweighted and Weighted K-Means.

0
kmeans/__init__.py

92
kmeans/algorithms.py

@ -0,0 +1,92 @@
import random
import sys
from typing import List
from kmeans.clustering.cluster import Cluster
from kmeans.clustering.point import Point
from kmeans.clustering.geometry import dist
def mean_movement(clusters: List[Cluster]) -> float:
    """
    Returns the largest distance any cluster's mean moved in its last update.

    Every cluster is asked once, through `mean_moved()`, how far its mean
    travelled; the biggest of those values is returned so the caller can
    compare it against a convergence threshold.

    @param clusters The clusters to inspect.
    @return The largest reported movement, floored at 0. An empty list
            therefore yields 0.
    """
    # The leading 0 is the floor the result never drops below, and it also
    # keeps `max` well-defined when `clusters` is empty. Each cluster is
    # queried exactly once, so the distance is not computed twice.
    return max([0, *(cluster.mean_moved() for cluster in clusters)])
def unweighted_k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
    """
    Runs Lloyd's Algorithm for k-means clustering without weights.

    @param points The list of points to cluster.
    @param k The number of clusters.
    @param d The threshold distance where we will consider the means converged.
    @return The list of clusters, each holding the points assigned to it.
    @raises ValueError If a point cannot be assigned to any cluster (for
            example when `k` is 0 and `points` is not empty). `random.sample`
            also raises ValueError when `k` is negative or larger than the
            number of points.
    """
    # The initial means are points drawn at random from the input. Later
    # means are brand-new Point objects, so the input points themselves are
    # never modified when a mean moves.
    clusters = [Cluster(mean, [], None) for mean in random.sample(points, k)]

    mean_needs_moving = True

    # mean_movement returns the largest mean movement of all clusters. Once
    # that movement is no larger than `d` the series is considered converged.
    while mean_needs_moving:
        # Assignment step: attach every point to the cluster whose mean is
        # nearest to it.
        for point in points:
            best_dist = float('inf')
            best_cluster = None

            for cluster in clusters:
                # Deliberately NOT named `d`: reusing that name would
                # overwrite the convergence threshold parameter.
                distance = dist(point, cluster.mean)

                if distance < best_dist:
                    best_dist = distance
                    best_cluster = cluster

            if best_cluster is None:
                raise ValueError(
                    f'Failed to assign cluster to point {point}')

            # Detach the point from whichever cluster held it before.
            if point.cluster is not None:
                point.cluster.remove_point(point)

            best_cluster.add_point(point)

        # Update step: move each mean to the average of its members.
        for cluster in clusters:
            if cluster.points:
                xs = [p.x for p in cluster.points]
                ys = [p.y for p in cluster.points]

                # Averaging the xs and ys gives the mean point of the cluster.
                new_x = sum(xs) / len(xs)
                new_y = sum(ys) / len(ys)

                cluster.update_mean(Point(new_x, new_y))
            else:
                # An empty cluster has nothing to average. Re-record its
                # current mean so it reports zero movement instead of
                # dividing by zero.
                cluster.update_mean(cluster.mean)

        mean_needs_moving = mean_movement(clusters) > d

    return clusters

0
kmeans/clustering/__init__.py

72
kmeans/clustering/cluster.py

@ -0,0 +1,72 @@
from typing import List
import numpy as np
from .geometry import dist
from .point import Point
class Cluster:
    """
    Represents a cluster of points.

    A cluster keeps its member points, its current mean, the mean it had
    before the most recent `update_mean` call, and a colour value.
    Because `__eq__` is defined without `__hash__`, instances are unhashable.
    """

    def __init__(self, mean: Point, points: List[Point], color: List[float]):
        """
        @param mean The initial mean of the cluster.
        @param points The list holding the cluster's members. It is stored
                      by reference (not copied) and mutated by `add_point`
                      and `remove_point`.
        @param color Stored and returned unchanged.
                     NOTE(review): presumably a colour used when plotting;
                     confirm with callers.
        """
        self._points = points
        # Not read anywhere in this class.
        self._children = None
        self._color = color
        self._mean = mean
        # Stays None until update_mean has been called at least once.
        self._last_mean = None

    @property
    def points(self):
        """The list of member points (the live list, not a copy)."""
        return self._points

    @property
    def color(self):
        """The colour value given at construction or set afterwards."""
        return self._color

    @color.setter
    def color(self, color: List[float]):
        self._color = color

    @property
    def mean(self):
        """The current mean of the cluster."""
        return self._mean

    @property
    def last_mean(self):
        """The mean before the latest update, or None if never updated."""
        return self._last_mean

    def update_mean(self, mean: Point):
        """
        Replaces the current mean, remembering the previous one.

        @param mean The new mean.
        """
        self._last_mean = self._mean
        self._mean = mean

    def mean_moved(self) -> float:
        """
        Returns the distance between the current mean and the previous one.

        @return The distance moved by the latest `update_mean` call, or 0.0
                when the mean has never been updated.
        """
        # Without a previous mean there is no movement to measure; passing
        # None on to `dist` would fail on the attribute access.
        if self._last_mean is None:
            return 0.0

        return dist(self._mean, self._last_mean)

    def add_point(self, point: Point):
        """
        Appends `point` to the cluster and points its `cluster` attribute
        back at this cluster.
        """
        self._points.append(point)
        point.cluster = self

    def remove_point(self, point: Point):
        """
        Removes the first member that compares equal to `point`.

        The point's `cluster` attribute is left untouched.

        @raises ValueError If no member compares equal to `point`.
        """
        self._points.remove(point)

    def __repr__(self):
        point_pairs = [(p.x, p.y) for p in self._points]
        return f'POINTS: {point_pairs}\n'

    def __eq__(self, other):
        """
        Two clusters are equal when each one's points all appear in the
        other. Means and colours are not compared.
        """
        # Returning NotImplemented (rather than raising) lets Python fall
        # back to its default handling, so `cluster == 5` is simply False.
        if not isinstance(other, Cluster):
            return NotImplemented

        # Containment is checked in both directions so that `a == b` and
        # `b == a` always agree; a one-way check would make an empty
        # cluster equal to every other cluster.
        return (all(point in other.points for point in self._points) and
                all(point in self._points for point in other.points))

11
kmeans/clustering/geometry.py

@ -0,0 +1,11 @@
import numpy as np
from .point import Point
def dist(p: Point, q: Point) -> float:
    """
    Computes the straight-line (Euclidean) distance between two points.

    @param p The first point; only its `x` and `y` are read.
    @param q The second point; only its `x` and `y` are read.
    @return The square root of the summed squared coordinate differences.
    """
    delta_x = q.x - p.x
    delta_y = q.y - p.y

    return np.sqrt(delta_x ** 2 + delta_y ** 2)

56
kmeans/clustering/point.py

@ -0,0 +1,56 @@
class Point:
    """
    A two-dimensional point with a weight and a reference to the cluster it
    is currently assigned to.

    Equality compares coordinates only; weight and cluster are ignored.
    Because `__eq__` is defined without `__hash__`, instances are unhashable.
    """

    def __init__(self, x: float, y: float, weight: float = 1.0):
        """
        @param x The x coordinate.
        @param y The y coordinate.
        @param weight The weight of the point. It is stored as given; only
                      the `weight` setter validates its type.
        """
        self._x = x
        self._y = y
        # No cluster until one is assigned through the `cluster` setter.
        self._cluster = None
        self._weight = weight

    @property
    def cluster(self):
        """The cluster this point is assigned to, or None."""
        return self._cluster

    @cluster.setter
    def cluster(self, cluster):
        self._cluster = cluster

    @property
    def x(self):
        """The x coordinate."""
        return self._x

    @x.setter
    def x(self, x):
        self._x = x

    @property
    def y(self):
        """The y coordinate."""
        return self._y

    @y.setter
    def y(self, y):
        # The setter must accept the assigned value; without the parameter
        # every `point.y = value` assignment raised TypeError.
        self._y = y

    @property
    def weight(self):
        """The weight of the point."""
        return self._weight

    @weight.setter
    def weight(self, weight):
        if not isinstance(weight, float):
            raise ValueError('Weight must be a float')

        self._weight = weight

    def array(self):
        """
        Returns array representation for use in convex hull.

        @return A new two-element list `[x, y]`.
        """
        return [self._x, self._y]

    def __eq__(self, other):
        # Returning NotImplemented (rather than raising) lets Python fall
        # back to its default handling, so `point == None` is simply False.
        if not isinstance(other, Point):
            return NotImplemented

        return self._x == other._x and self._y == other._y

19
setup.py

@ -0,0 +1,19 @@
from setuptools import setup, find_packages
# The README is used as the long description shown on the package index.
# An explicit encoding keeps the read independent of the platform's default
# locale encoding.
with open('README.md', 'r', encoding='utf-8') as fh:
    long_description = fh.read()

setup(
    name='kmeans',
    version='1.0.0',
    author='Taylor Bockman',
    author_email='tbockman@xchg.sh',
    description='Unweighted and Weighted K-Means implementation.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://git.xchg.sh/angrygoats/kmeans',
    classifiers=[
    ],
    python_requires='>=3.6',
    packages=find_packages(),
    # kmeans.clustering imports numpy, so it must be declared for the
    # package to be importable after installation.
    install_requires=[
        'numpy',
    ],
)
Loading…
Cancel
Save