Taylor Bockman
5 years ago
commit
7ff69a83af
9 changed files with 378 additions and 0 deletions
@ -0,0 +1,125 @@
|
||||
# Byte-compiled / optimized / DLL files |
||||
__pycache__/ |
||||
*.py[cod] |
||||
*$py.class |
||||
|
||||
# C extensions |
||||
*.so |
||||
|
||||
# Distribution / packaging |
||||
.Python |
||||
build/ |
||||
develop-eggs/ |
||||
dist/ |
||||
downloads/ |
||||
eggs/ |
||||
.eggs/ |
||||
lib/ |
||||
lib64/ |
||||
parts/ |
||||
sdist/ |
||||
var/ |
||||
wheels/ |
||||
pip-wheel-metadata/ |
||||
share/python-wheels/ |
||||
*.egg-info/ |
||||
.installed.cfg |
||||
*.egg |
||||
MANIFEST |
||||
|
||||
# PyInstaller |
||||
# Usually these files are written by a python script from a template |
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||
*.manifest |
||||
*.spec |
||||
|
||||
# Installer logs |
||||
pip-log.txt |
||||
pip-delete-this-directory.txt |
||||
|
||||
# Unit test / coverage reports |
||||
htmlcov/ |
||||
.tox/ |
||||
.nox/ |
||||
.coverage |
||||
.coverage.* |
||||
.cache |
||||
nosetests.xml |
||||
coverage.xml |
||||
*.cover |
||||
*.py,cover |
||||
.hypothesis/ |
||||
.pytest_cache/ |
||||
|
||||
# Translations |
||||
*.mo |
||||
*.pot |
||||
|
||||
# Django stuff: |
||||
*.log |
||||
local_settings.py |
||||
db.sqlite3 |
||||
db.sqlite3-journal |
||||
|
||||
# Flask stuff: |
||||
instance/ |
||||
.webassets-cache |
||||
|
||||
# Scrapy stuff: |
||||
.scrapy |
||||
|
||||
# Sphinx documentation |
||||
docs/_build/ |
||||
|
||||
# PyBuilder |
||||
target/ |
||||
|
||||
# Jupyter Notebook |
||||
.ipynb_checkpoints |
||||
|
||||
# IPython |
||||
profile_default/ |
||||
ipython_config.py |
||||
|
||||
# pyenv |
||||
.python-version |
||||
|
||||
# pipenv |
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||
# install all needed dependencies. |
||||
#Pipfile.lock |
||||
|
||||
# celery beat schedule file |
||||
celerybeat-schedule |
||||
|
||||
# SageMath parsed files |
||||
*.sage.py |
||||
|
||||
# Environments |
||||
.env |
||||
.venv |
||||
env/ |
||||
venv/ |
||||
ENV/ |
||||
env.bak/ |
||||
venv.bak/ |
||||
|
||||
# Spyder project settings |
||||
.spyderproject |
||||
.spyproject |
||||
|
||||
# Rope project settings |
||||
.ropeproject |
||||
|
||||
# mkdocs documentation |
||||
/site |
||||
|
||||
# mypy |
||||
.mypy_cache/ |
||||
.dmypy.json |
||||
dmypy.json |
||||
|
||||
# Pyre type checker |
||||
.pyre/ |
@ -0,0 +1,3 @@
|
||||
# K-Means |
||||
|
||||
Implementations of Unweighted and Weighted K-Means. |
@ -0,0 +1,92 @@
|
||||
import random |
||||
import sys |
||||
from typing import List |
||||
|
||||
from kmeans.clustering.cluster import Cluster |
||||
from kmeans.clustering.point import Point |
||||
from kmeans.clustering.geometry import dist |
||||
|
||||
|
||||
def mean_movement(clusters: List[Cluster]) -> float:
    """
    Reports the largest mean (centroid) shift among the given clusters.

    Each cluster is asked, through its `mean_moved` method, how far its
    mean travelled, and the greatest of those values is returned.

    @param clusters The clusters to inspect.
    @return The largest value reported by any cluster, or 0 when
            `clusters` is empty or no cluster reports a movement
            greater than zero.
    """

    # Seeding the candidates with 0 keeps zero as the floor of the result,
    # so an empty cluster list still yields a usable number.
    movements = [0]

    for cluster in clusters:
        movements.append(cluster.mean_moved())

    return max(movements)
||||
|
||||
|
||||
def unweighted_k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
    """
    Runs Lloyd's Algorithm for k-means clustering without weights.

    Note that a point which already references a cluster (through its
    `cluster` attribute) is removed from that cluster before it is
    assigned to one of the clusters built here.

    @param points The list of points to cluster.
    @param k The number of clusters.
    @param d The threshold distance where we will consider the means converged.
    @return The list of `k` clusters once no mean moved further than `d`
            during an iteration.
    @raises ValueError If `random.sample` rejects `k` for the given points,
                       or if a point cannot be assigned to any cluster
                       (for example when `k` is 0).
    """

    # The initial means are k distinct points picked at random from the
    # input. A mean is never mutated afterwards; updating a cluster
    # replaces its mean with a freshly built Point.
    means = random.sample(points, k)

    clusters = [Cluster(mean, [], None) for mean in means]

    mean_needs_moving = True

    # Refer to the mean_movement function. It simply returns the largest
    # mean movement of all clusters. Once the movement of all clusters is
    # lower than `d` we can consider the series converged and return.
    while mean_needs_moving:

        # Compare each point to every cluster's centroid and calculate the
        # distance to it. Assign to the nearest cluster.
        for point in points:
            best_dist = float('inf')
            best_cluster = None

            for cluster in clusters:
                # Deliberately NOT named `d`: that would overwrite the
                # convergence threshold parameter used further below.
                candidate_dist = dist(point, cluster.mean)
                if candidate_dist < best_dist:
                    best_dist = candidate_dist
                    best_cluster = cluster

            if best_cluster is None:
                raise ValueError(
                    f'Failed to assign cluster to point {point}')

            if point.cluster is not None:
                point.cluster.remove_point(point)

            best_cluster.add_point(point)

        for cluster in clusters:
            # Update the mean with the new points
            if cluster.points:
                xs = [p.x for p in cluster.points]
                ys = [p.y for p in cluster.points]

                # Averaging the xs and ys will give us the mean point
                # of our cluster.
                new_x = sum(xs) / len(xs)
                new_y = sum(ys) / len(ys)

                cluster.update_mean(Point(new_x, new_y))
            else:
                # A cluster that attracted no points has nothing to
                # average (that would divide by zero). Re-register its
                # current mean so it reports zero movement this round.
                cluster.update_mean(cluster.mean)

        mean_needs_moving = mean_movement(clusters) > d

    return clusters
@ -0,0 +1,72 @@
|
||||
from typing import List |
||||
|
||||
import numpy as np |
||||
|
||||
from .geometry import dist |
||||
from .point import Point |
||||
|
||||
|
||||
class Cluster:
    """
    Represents a cluster of points.

    A cluster keeps its member points, its current mean, the mean it had
    before the most recent `update_mean` call, and a color value.
    """

    # Clusters are mutable and compare by content, so they must not be
    # hashable. Defining __eq__ already implies this; it is spelled out
    # here to make the intent explicit.
    __hash__ = None

    def __init__(self, mean: Point, points: List[Point], color: List[float]):
        """
        @param mean The initial mean of the cluster.
        @param points The list holding the cluster's members. The list is
                      stored as-is (not copied), so the caller's list is
                      mutated by `add_point` and `remove_point`.
        @param color A color value stored for the cluster; it is not
                     interpreted by this class.
        """
        self._points = points
        self._children = None
        self._color = color
        self._mean = mean
        # No previous mean exists until update_mean is called.
        self._last_mean = None

    @property
    def points(self):
        return self._points

    @property
    def color(self):
        return self._color

    @color.setter
    def color(self, color: List[float]):
        self._color = color

    @property
    def mean(self):
        return self._mean

    @property
    def last_mean(self):
        return self._last_mean

    def update_mean(self, mean: Point):
        """
        Replaces the current mean, remembering the old one as `last_mean`.
        """
        self._last_mean = self._mean
        self._mean = mean

    def mean_moved(self) -> float:
        """
        Returns the distance between the current mean and the previous one.

        When the mean has never been updated there is no previous mean to
        measure against; infinity is returned so that callers comparing
        the movement to a convergence threshold see "not converged"
        instead of failing on the missing point.
        """
        if self._last_mean is None:
            return float('inf')

        return dist(self._mean, self._last_mean)

    def add_point(self, point: Point):
        """
        Appends the point to this cluster and points it back at the cluster.
        """
        self._points.append(point)
        point.cluster = self

    def remove_point(self, point: Point):
        """
        Removes the point from this cluster.

        @raises ValueError If the point is not a member of this cluster.
        """
        self._points.remove(point)

        # Drop the back-reference so the point does not keep claiming
        # membership of a cluster it no longer belongs to.
        if point.cluster is self:
            point.cluster = None

    def __repr__(self):
        point_pairs = [(p.x, p.y) for p in self._points]
        string = f'POINTS: {point_pairs}\n'

        return string

    def __eq__(self, other: object):
        """
        Two clusters are equal when they hold the same points, regardless
        of order. Only the points are compared; mean and color are not.
        """
        if not isinstance(other, Cluster):
            # Let Python try the reflected comparison and fall back to
            # "not equal" instead of blowing up on foreign types.
            return NotImplemented

        # Points are matched one-to-one (they are not hashable, so a list
        # is used) to keep the comparison symmetric: a cluster is not
        # equal to another one that merely contains all of its points.
        unmatched = list(other.points)

        for point in self._points:
            try:
                unmatched.remove(point)
            except ValueError:
                return False

        return not unmatched
@ -0,0 +1,11 @@
|
||||
import numpy as np |
||||
|
||||
from .point import Point |
||||
|
||||
|
||||
def dist(p: Point, q: Point) -> float:
    """
    Computes the straight-line (Euclidean) distance between two points.

    @param p The first point.
    @param q The second point.
    @return The square root of the summed squared x and y differences.
    """
    delta_x = q.x - p.x
    delta_y = q.y - p.y

    squared_length = delta_x ** 2 + delta_y ** 2

    return np.sqrt(squared_length)
@ -0,0 +1,56 @@
|
||||
class Point:
    """
    A 2D point with an optional weight and a reference to the cluster it
    is currently assigned to.
    """

    # Points are mutable and compare by coordinates, so they must not be
    # hashable. Defining __eq__ already implies this; it is spelled out
    # here to make the intent explicit.
    __hash__ = None

    def __init__(self, x: float, y: float, weight: float = 1.0):
        """
        @param x The x coordinate.
        @param y The y coordinate.
        @param weight The weight of the point. Unlike the `weight` setter,
                      the constructor stores the value without validation.
        """
        self._x = x
        self._y = y
        # A new point is not assigned to any cluster.
        self._cluster = None
        self._weight = weight

    @property
    def cluster(self):
        return self._cluster

    @cluster.setter
    def cluster(self, cluster):
        self._cluster = cluster

    @property
    def x(self):
        return self._x

    @x.setter
    def x(self, x):
        self._x = x

    @property
    def y(self):
        return self._y

    @y.setter
    def y(self, y):
        # The setter must accept the assigned value; without the `y`
        # parameter every `point.y = value` fails with a TypeError.
        self._y = y

    @property
    def weight(self):
        return self._weight

    @weight.setter
    def weight(self, weight):
        """
        @raises ValueError If the weight is not a float.
        """
        if not isinstance(weight, float):
            raise ValueError('Weight must be a float')

        self._weight = weight

    def array(self):
        """
        Returns array representation for use in convex hull.
        """
        return [self._x, self._y]

    def __eq__(self, other: object):
        """
        Two points are equal when their x and y coordinates are equal.
        Weight and cluster assignment are not compared.
        """
        if not isinstance(other, Point):
            # Let Python try the reflected comparison and fall back to
            # "not equal" instead of blowing up on foreign types.
            return NotImplemented

        return self._x == other._x and self._y == other._y
@ -0,0 +1,19 @@
|
||||
from setuptools import setup, find_packages |
||||
|
||||
# The README doubles as the long description passed to setup() below.
# The encoding is given explicitly so that reading it does not depend on
# the default encoding of the machine running the build.
with open('README.md', 'r', encoding='utf-8') as fh:
    long_description = fh.read()

setup(
    name='kmeans',
    version='1.0.0',
    author='Taylor Bockman',
    author_email='tbockman@xchg.sh',
    description='Unweighted and Weighted K-Means implementation.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://git.xchg.sh/angrygoats/kmeans',
    classifiers=[
    ],
    python_requires='>=3.6',
    # Pick up every package found under the project root.
    packages=find_packages(),
)
Loading…
Reference in new issue