Taylor Bockman
5 years ago
commit
7ff69a83af
9 changed files with 378 additions and 0 deletions
@ -0,0 +1,125 @@ |
|||||||
|
# Byte-compiled / optimized / DLL files |
||||||
|
__pycache__/ |
||||||
|
*.py[cod] |
||||||
|
*$py.class |
||||||
|
|
||||||
|
# C extensions |
||||||
|
*.so |
||||||
|
|
||||||
|
# Distribution / packaging |
||||||
|
.Python |
||||||
|
build/ |
||||||
|
develop-eggs/ |
||||||
|
dist/ |
||||||
|
downloads/ |
||||||
|
eggs/ |
||||||
|
.eggs/ |
||||||
|
lib/ |
||||||
|
lib64/ |
||||||
|
parts/ |
||||||
|
sdist/ |
||||||
|
var/ |
||||||
|
wheels/ |
||||||
|
pip-wheel-metadata/ |
||||||
|
share/python-wheels/ |
||||||
|
*.egg-info/ |
||||||
|
.installed.cfg |
||||||
|
*.egg |
||||||
|
MANIFEST |
||||||
|
|
||||||
|
# PyInstaller |
||||||
|
# Usually these files are written by a python script from a template |
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||||
|
*.manifest |
||||||
|
*.spec |
||||||
|
|
||||||
|
# Installer logs |
||||||
|
pip-log.txt |
||||||
|
pip-delete-this-directory.txt |
||||||
|
|
||||||
|
# Unit test / coverage reports |
||||||
|
htmlcov/ |
||||||
|
.tox/ |
||||||
|
.nox/ |
||||||
|
.coverage |
||||||
|
.coverage.* |
||||||
|
.cache |
||||||
|
nosetests.xml |
||||||
|
coverage.xml |
||||||
|
*.cover |
||||||
|
*.py,cover |
||||||
|
.hypothesis/ |
||||||
|
.pytest_cache/ |
||||||
|
|
||||||
|
# Translations |
||||||
|
*.mo |
||||||
|
*.pot |
||||||
|
|
||||||
|
# Django stuff: |
||||||
|
*.log |
||||||
|
local_settings.py |
||||||
|
db.sqlite3 |
||||||
|
db.sqlite3-journal |
||||||
|
|
||||||
|
# Flask stuff: |
||||||
|
instance/ |
||||||
|
.webassets-cache |
||||||
|
|
||||||
|
# Scrapy stuff: |
||||||
|
.scrapy |
||||||
|
|
||||||
|
# Sphinx documentation |
||||||
|
docs/_build/ |
||||||
|
|
||||||
|
# PyBuilder |
||||||
|
target/ |
||||||
|
|
||||||
|
# Jupyter Notebook |
||||||
|
.ipynb_checkpoints |
||||||
|
|
||||||
|
# IPython |
||||||
|
profile_default/ |
||||||
|
ipython_config.py |
||||||
|
|
||||||
|
# pyenv |
||||||
|
.python-version |
||||||
|
|
||||||
|
# pipenv |
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||||
|
# install all needed dependencies. |
||||||
|
#Pipfile.lock |
||||||
|
|
||||||
|
# celery beat schedule file |
||||||
|
celerybeat-schedule |
||||||
|
|
||||||
|
# SageMath parsed files |
||||||
|
*.sage.py |
||||||
|
|
||||||
|
# Environments |
||||||
|
.env |
||||||
|
.venv |
||||||
|
env/ |
||||||
|
venv/ |
||||||
|
ENV/ |
||||||
|
env.bak/ |
||||||
|
venv.bak/ |
||||||
|
|
||||||
|
# Spyder project settings |
||||||
|
.spyderproject |
||||||
|
.spyproject |
||||||
|
|
||||||
|
# Rope project settings |
||||||
|
.ropeproject |
||||||
|
|
||||||
|
# mkdocs documentation |
||||||
|
/site |
||||||
|
|
||||||
|
# mypy |
||||||
|
.mypy_cache/ |
||||||
|
.dmypy.json |
||||||
|
dmypy.json |
||||||
|
|
||||||
|
# Pyre type checker |
||||||
|
.pyre/ |
@ -0,0 +1,3 @@ |
|||||||
|
# K-Means |
||||||
|
|
||||||
|
Implementations of Unweighted and Weighted K-Means. |
@ -0,0 +1,92 @@ |
|||||||
|
import random |
||||||
|
import sys |
||||||
|
from typing import List |
||||||
|
|
||||||
|
from kmeans.clustering.cluster import Cluster |
||||||
|
from kmeans.clustering.point import Point |
||||||
|
from kmeans.clustering.geometry import dist |
||||||
|
|
||||||
|
|
||||||
|
def mean_movement(clusters: List[Cluster]) -> float:
    """
    Returns the largest movement reported by any of the given clusters.

    Every cluster is asked for its own movement through `mean_moved()`;
    this function only keeps the maximum of those values so that a caller
    can compare a single number against a convergence threshold.

    @param clusters The clusters to inspect.
    @return The largest value returned by `mean_moved()`, or 0.0 when
            `clusters` is empty or no cluster reports a positive movement.
    """

    highest_movement = 0.0

    for cluster in clusters:
        # mean_moved() is called once per cluster; max() keeps the current
        # value unless the new one is strictly greater.
        highest_movement = max(highest_movement, cluster.mean_moved())

    return highest_movement
||||||
|
|
||||||
|
|
||||||
|
def unweighted_k_means(points: List[Point], k: int, d: float = 0.001) -> List[Cluster]:
    """
    Runs Lloyd's Algorithm for k-means clustering without weights.

    The points are modified in place: each point's `cluster` attribute is
    set to the cluster it ends up in.

    @param points The list of points to cluster.
    @param k The number of clusters.
    @param d The threshold distance where we will consider the means converged.
    @return The list of `k` clusters, each holding its assigned points.
    @raise ValueError If a point cannot be assigned to any cluster, or if
                      `random.sample` rejects `k` for the given points.
    """

    # The initial means are chosen at random from the input points. Later
    # updates replace a cluster's mean with a new Point instead of mutating
    # it, so the sampled input points are never altered through the mean.
    means = random.sample(points, k)

    clusters = [Cluster(mean, [], None) for mean in means]

    mean_needs_moving = True

    # Refer to the mean_movement function. It simply returns the largest
    # mean movement of all clusters. Once the movement of all clusters is
    # lower than `d` we can consider the series converged and return.
    while mean_needs_moving:

        # Assign every point to the cluster with the nearest mean.
        for point in points:
            best_cluster = _nearest_cluster(point, clusters)

            if point.cluster is not None:
                point.cluster.remove_point(point)

            best_cluster.add_point(point)

        for cluster in clusters:
            if cluster.points:
                xs = [p.x for p in cluster.points]
                ys = [p.y for p in cluster.points]

                # Averaging the xs and ys will give us the mean point
                # of our cluster.
                new_mean = Point(sum(xs) / len(xs), sum(ys) / len(ys))
            else:
                # An empty cluster has nothing to average. Re-recording the
                # current mean avoids a division by zero and makes the
                # cluster report zero movement for this iteration.
                new_mean = cluster.mean

            cluster.update_mean(new_mean)

        # `d` is still the caller's threshold here: the per-point distances
        # are kept in their own local inside _nearest_cluster.
        mean_needs_moving = mean_movement(clusters) > d

    return clusters


def _nearest_cluster(point: Point, clusters: List[Cluster]) -> Cluster:
    """
    Finds the cluster whose mean is closest to the given point.

    @param point The point to place.
    @param clusters The candidate clusters.
    @return The cluster with the smallest distance between its mean and
            `point`; the first such cluster wins a tie.
    @raise ValueError If no cluster is at a finite distance from the point,
                      for example when `clusters` is empty.
    """

    best_dist = float('inf')
    best_cluster = None

    for cluster in clusters:
        candidate_dist = dist(point, cluster.mean)

        if candidate_dist < best_dist:
            best_dist = candidate_dist
            best_cluster = cluster

    if best_cluster is None:
        raise ValueError(
            f'Failed to assign cluster to point {point}')

    return best_cluster
@ -0,0 +1,72 @@ |
|||||||
|
from typing import List |
||||||
|
|
||||||
|
import numpy as np |
||||||
|
|
||||||
|
from .geometry import dist |
||||||
|
from .point import Point |
||||||
|
|
||||||
|
|
||||||
|
class Cluster:
    """
    Represents a cluster of points.

    A cluster keeps its member points, its current mean, the mean it had
    before the most recent call to `update_mean`, and a color value.
    """

    # NOTE: __eq__ is defined without __hash__, so instances are unhashable.

    def __init__(self, mean: Point, points: List[Point], color: List[float]):
        # The list is stored as given (not copied), so add_point and
        # remove_point mutate the list object the caller passed in.
        self._points = points
        self._children = None
        self._color = color
        self._mean = mean
        # There is no previous mean until update_mean is called.
        self._last_mean = None

    @property
    def points(self):
        """The list of points currently held by the cluster."""
        return self._points

    @property
    def color(self):
        """The color value stored for the cluster."""
        return self._color

    @color.setter
    def color(self, color: List[float]):
        self._color = color

    @property
    def mean(self):
        """The current mean of the cluster."""
        return self._mean

    @property
    def last_mean(self):
        """The mean before the latest update_mean call, or None."""
        return self._last_mean

    def update_mean(self, mean: Point):
        """
        Replaces the current mean, remembering the old one as `last_mean`.

        @param mean The new mean of the cluster.
        """
        self._last_mean = self._mean
        self._mean = mean

    def mean_moved(self) -> float:
        """
        Returns the distance between the current mean and the previous one.

        `last_mean` is None until `update_mean` has been called at least
        once, and that None is passed to `dist` unchanged.
        """
        return dist(self._mean, self._last_mean)

    def add_point(self, point: Point):
        """
        Appends the point to the cluster and marks the cluster as its owner.

        @param point The point to add; its `cluster` attribute is set.
        """
        self._points.append(point)
        point.cluster = self

    def remove_point(self, point: Point):
        """
        Removes the first point in the cluster that compares equal to `point`.

        The point's `cluster` attribute is left untouched.

        @param point The point to remove.
        @raise ValueError If no equal point is in the cluster.
        """
        self._points.remove(point)

    def __repr__(self):
        point_pairs = [(p.x, p.y) for p in self._points]
        string = f'POINTS: {point_pairs}\n'

        return string

    def __eq__(self, other):
        """
        Two clusters are equal when each one's points all appear in the other.

        Only the points are compared; the means and colors are ignored.

        @raise NotImplementedError If `other` is not a Cluster.
        """
        if not isinstance(other, Cluster):
            raise NotImplementedError('equality between clusters is only ' +
                                      'defined for clusters ' +
                                      f'(other={type(other)})')

        # Containment is checked in both directions. A one-way check would
        # make an empty cluster equal to every cluster and would let
        # `a == b` and `b == a` disagree.
        return (all(point in other.points for point in self._points) and
                all(point in self._points for point in other.points))
@ -0,0 +1,11 @@ |
|||||||
|
import numpy as np |
||||||
|
|
||||||
|
from .point import Point |
||||||
|
|
||||||
|
|
||||||
|
def dist(p: Point, q: Point) -> float:
    """
    Computes the Euclidean distance between two points.

    @param p The first point.
    @param q The second point.
    @return The square root of the summed squared differences of the
            points' `x` and `y` coordinates.
    """
    delta_x = q.x - p.x
    delta_y = q.y - p.y

    squared_length = delta_x ** 2 + delta_y ** 2

    return np.sqrt(squared_length)
@ -0,0 +1,56 @@ |
|||||||
|
class Point:
    """
    A point with `x` and `y` coordinates, a weight and an owning cluster.

    The cluster starts as None and is assigned through the `cluster`
    property.
    """

    # NOTE: __eq__ is defined without __hash__, so instances are unhashable.

    def __init__(self, x: float, y: float, weight: float = 1.0):
        self._x = x
        self._y = y
        self._cluster = None
        # The constructor stores the weight as given; only the `weight`
        # setter validates its type.
        self._weight = weight

    @property
    def cluster(self):
        """The cluster assigned to this point, or None."""
        return self._cluster

    @cluster.setter
    def cluster(self, cluster):
        self._cluster = cluster

    @property
    def x(self):
        """The x coordinate."""
        return self._x

    @x.setter
    def x(self, x):
        self._x = x

    @property
    def y(self):
        """The y coordinate."""
        return self._y

    @y.setter
    def y(self, y):
        # The setter must accept the assigned value; without the parameter
        # every `point.y = value` assignment raises TypeError.
        self._y = y

    @property
    def weight(self):
        """The weight of the point."""
        return self._weight

    @weight.setter
    def weight(self, weight):
        """
        Sets the weight.

        @raise ValueError If `weight` is not a float instance.
        """
        if not isinstance(weight, float):
            raise ValueError('Weight must be a float')

        self._weight = weight

    def array(self):
        """
        Returns array representation for use in convex hull.
        """
        return [self._x, self._y]

    def __eq__(self, other):
        """
        Two points are equal when their x and y coordinates are equal.

        The weight and the cluster are not compared.

        @raise NotImplementedError If `other` is not a Point.
        """
        if not isinstance(other, Point):
            raise NotImplementedError('equality between points is only ' +
                                      'defined for points ' +
                                      f'(other={type(other)})')

        return self._x == other._x and self._y == other._y
@ -0,0 +1,19 @@ |
|||||||
|
from setuptools import setup, find_packages |
||||||
|
|
||||||
|
# The README doubles as the long description passed to setup(). The
# encoding is given explicitly so that reading it does not depend on the
# platform's default locale encoding.
with open('README.md', 'r', encoding='utf-8') as fh:
    long_description = fh.read()

setup(
    name='kmeans',
    version='1.0.0',
    author='Taylor Bockman',
    author_email='tbockman@xchg.sh',
    description='Unweighted and Weighted K-Means implementation.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://git.xchg.sh/angrygoats/kmeans',
    classifiers=[
    ],
    python_requires='>=3.6',
    # Every package discovered by find_packages() is included.
    packages=find_packages(),
)
Loading…
Reference in new issue