-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdbscan.py
More file actions
110 lines (89 loc) · 3.34 KB
/
dbscan.py
File metadata and controls
110 lines (89 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
# A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise
# Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu
# dbscan: density based spatial clustering of applications with noise
import numpy as np
import math
import csv
import sys
# Sentinel labels stored in the per-point `classifications` list.
# NOTE(review): UNCLASSIFIED is False and NOISE is None; the `==` tests
# elsewhere rely on real cluster ids starting at 1 (0 == False is True,
# so a cluster id of 0 would be misread as UNCLASSIFIED).
UNCLASSIFIED = False
NOISE = None
# Module-level copy of the labels produced by dbscan(); dbscan() appends
# to it on every call without clearing, so it accumulates across runs.
cids = []
def _dist(p,q):
return math.sqrt(np.power(p-q,2).sum())
def _eps_neighborhood(p, q, eps):
    """Return True when p and q lie strictly closer than eps to each other."""
    return eps > _dist(p, q)
def _region_query(m, point_id, eps):
    """Return indices of all OTHER columns of m within eps of column point_id.

    Note the point itself is excluded from its own neighborhood, so the
    returned list has at most m.shape[1] - 1 entries.
    """
    center = m[:, point_id]
    return [idx for idx in range(m.shape[1])
            if idx != point_id and _eps_neighborhood(center, m[:, idx], eps)]
def _expand_cluster(m, classifications, point_id, cluster_id, eps, min_points):
    """Try to grow cluster `cluster_id` outward from the seed point `point_id`.

    Mutates `classifications` in place.  Returns False (marking the point
    as NOISE) when the point is not a core point; otherwise labels every
    density-reachable point with `cluster_id` and returns True.
    """
    neighborhood = _region_query(m, point_id, eps)
    if len(neighborhood) < min_points:
        classifications[point_id] = NOISE
        return False

    # The seed and its whole eps-neighborhood join the cluster immediately.
    classifications[point_id] = cluster_id
    for neighbor in neighborhood:
        classifications[neighbor] = cluster_id

    # Breadth-first expansion.  An index cursor walks the growing worklist,
    # visiting points in exactly the order the original repeated
    # `seeds = seeds[1:]` slicing would.
    cursor = 0
    while cursor < len(neighborhood):
        candidates = _region_query(m, neighborhood[cursor], eps)
        if len(candidates) >= min_points:  # only core points spread the cluster
            for candidate in candidates:
                label = classifications[candidate]
                if label == UNCLASSIFIED or label == NOISE:
                    # Noise points are absorbed into the cluster but do not
                    # themselves seed further expansion.
                    if label == UNCLASSIFIED:
                        neighborhood.append(candidate)
                    classifications[candidate] = cluster_id
        cursor += 1
    return True
def dbscan(m, eps, min_points):
    """Implementation of Density Based Spatial Clustering of Applications with Noise.

    See https://en.wikipedia.org/wiki/DBSCAN
    scikit-learn probably has a better implementation.
    Uses Euclidean Distance as the measure.

    Inputs:
        m - A matrix whose columns are feature vectors
        eps - Maximum distance two points can be to be regionally related
        min_points - The minimum number of points to make a cluster

    Outputs:
        An array with either a cluster id number or dbscan.NOISE (None) for each
        column vector in m.

    Side effect: the final labels are also appended to the module-level
    `cids` list (consumed by test_dbscan); `cids` is never cleared, so it
    accumulates across calls.
    """
    cluster_id = 1
    n_points = m.shape[1]
    classifications = [UNCLASSIFIED] * n_points
    for point_id in range(n_points):
        # Only untouched points can start a new cluster; points already
        # labelled (including NOISE) are skipped as seeds.
        if classifications[point_id] == UNCLASSIFIED:
            if _expand_cluster(m, classifications, point_id, cluster_id,
                               eps, min_points):
                cluster_id += 1
    # extend() replaces the original misspelt per-item append loop
    # (`for classifcation in classifications: cids.append(classifcation)`).
    cids.extend(classifications)
    return classifications
def test_dbscan():
    """Cluster the NOAA hail coordinate file and write labelled rows to CSV.

    Reads 'noaa-hail-cleaned-100-coords-dbscan.csv' (one point per row),
    runs dbscan on the transposed matrix (points as columns), then writes
    each original row with its cluster id (or "None" for noise) appended
    to 'noaa-hail-dbscan-classified.csv'.
    """
    # loadtxt yields an (n_points, n_features) array; asmatrix keeps the
    # 2-D column-vector slicing behaviour the helper functions expect.
    m = np.asmatrix(np.loadtxt('noaa-hail-cleaned-100-coords-dbscan.csv',
                               delimiter=","))
    print(m.shape)  # was Python 2 `print m.shape` — a syntax error on Python 3
    eps = 0.99
    min_points = 4
    # dbscan expects feature vectors as columns, so transpose the row-wise data.
    q = np.transpose(m)
    print(dbscan(q, eps, min_points))
    coords = np.array(m).tolist()
    # Append each point's cluster id (stringified; noise becomes "None").
    for i, coord in enumerate(coords):
        coord.append(str(cids[i]))
    # Python 3: the csv module needs text mode with newline='' (the original
    # opened in 'wb', which only worked on Python 2); `with` also guarantees
    # the handle is closed, which the original never did.
    with open("noaa-hail-dbscan-classified.csv", 'w', newline='') as out_file:
        result_writer = csv.writer(out_file, delimiter=",")
        result_writer.writerows(coords)


test_dbscan()