forked from sophiaxc/clustexer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
209 lines (167 loc) · 7.66 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
"""
Data should come in the formats:
lat, lng, cluster_index
OR
lat, lng
>> In this case, specify the k for clustering with --num-clusters
Example usage to get convex hull polygons to stdout from clustered input:
python parser.py clustered_data/marin_pts.csv
>> Convex Hull Format: neighborhood_index,lat1 lng1;lat2 lng2;lat3 lng3
Example usage to write labeled convex hulls from clustered input to marin.csv:
python parser.py clustered_data/marin_pts.csv --cluster-prefix marin
>> Convex Hull Format: label,neighborhood_index,lat1 lng1;lat2 lng2;...
Example usage to get convex hull polygons to stdout from unclustered data:
python parser.py raw_data/marin_trip_pts.csv --num-clusters 15
Example usage to output convex hulls to html from clustered data:
python parser.py clustered_data/marin_pts.csv --html marin.html
Example usage to get convex hull points from clustered data:
python parser.py clustered_data/marin_pts.csv --cluster-prefix marin
>> Outputs to a file called marin.csv
Example usage to get convex hulls from unclustered data to html
python parser.py raw_data/marin_trip_pts.csv --html marin.html --num-clusters 15
Example usage to get convex hulls from unclustered data to html and save shapes to file
python parser.py raw_data/marin_trip_pts.csv --html marin.html
--num-clusters 15 --cluster-prefix marin
>> Outputs convex hulls to a file called marin.csv
"""
import argparse
import numpy
from numpy import array
from scipy.cluster.vq import kmeans,vq
from collections import defaultdict
from pyhull.convex_hull import ConvexHull
from jinja2 import Environment, FileSystemLoader
def parse_file(filename, output_html=None, cluster_prefix=None,
               num_clusters=None):
    """Parse a point file, build one convex hull per cluster and emit them.

    The input is loaded with read_clustered_data() when ``num_clusters`` is
    None (rows of ``lat, lng, cluster_id``) and with read_unclustered_data()
    otherwise (rows of ``lat, lng`` grouped into ``num_clusters`` clusters).

    Args:
        filename: path of the CSV input file.
        output_html: if not None, path of the file that receives the
            rendered ``_base.html`` template (loaded from ``./``).
        cluster_prefix: forwarded to output_formatted_polygon() for every
            hull.
        num_clusters: number of clusters to compute, or None when the input
            already carries a cluster id per row.
    """
    if num_clusters is None:
        cluster_points = read_clustered_data(filename)
    else:
        cluster_points = read_unclustered_data(filename, num_clusters)

    # Generate the convex hulls mapped to their cluster id.
    # .items() is used instead of the Python-2-only .iteritems() so the
    # function runs unchanged on Python 2 and Python 3.
    convex_hulls = dict(
        (cluster_id, get_convex_hull_polygon(points))
        for cluster_id, points in cluster_points.items())

    if output_html is not None:
        env = Environment(loader=FileSystemLoader('./'))
        template = env.get_template('_base.html')
        polygon_centers = dict(
            (cluster_id, get_polygon_center(polygon))
            for cluster_id, polygon in convex_hulls.items())
        # Render before opening the output file so that a template error
        # does not leave a truncated/empty HTML file behind.
        # N.B. for very large numbers of polygons, we'd want to
        # use template.generate()
        rendered = template.render(
            convex_hulls=convex_hulls,
            polygon_centers=polygon_centers,
            map_bounds=get_map_bounds(convex_hulls))
        with open(output_html, 'w') as outf:
            outf.write(rendered)

    # The hull rows are emitted whether or not HTML was requested.
    for cluster_id, polygon in convex_hulls.items():
        output_formatted_polygon(cluster_id, polygon, cluster_prefix)
def get_map_bounds(convex_hulls):
    """Return dictionary of map bounding box coordinates.

    Using the points of all the convex hulls, calculates the bounding box
    for the map to display the convex hulls.

    Args:
        convex_hulls: dictionary mapping a cluster id to a sequence of
            points; item 0 of each point is read as the latitude and item 1
            as the longitude.

    Returns:
        A dictionary ``{"SW": (south, west), "NE": (north, east)}``.

    Raises:
        ValueError: if the hulls contain no points at all.
    """
    all_points = []
    # Only the polygons are needed; .values() also works on both Python 2
    # and Python 3, unlike .iteritems().
    for polygon in convex_hulls.values():
        all_points.extend(polygon)
    if not all_points:
        raise ValueError("cannot compute map bounds without any points")

    lats = [point[0] for point in all_points]
    lngs = [point[1] for point in all_points]
    south, north = min(lats), max(lats)
    west, east = min(lngs), max(lngs)

    # Flip east/west if the bounding doesn't contain polygons.
    # Check if each point is within the west/east bounds.
    # NOTE(review): west/east are the min/max of lngs, so for ordinary
    # numeric longitudes this condition cannot hold and the swap looks
    # unreachable; presumably meant for antimeridian-crossing boxes -
    # confirm the intended behavior before relying on it.
    if not all(west <= lng <= east for lng in lngs):
        east, west = west, east
    return {"SW": (south, west), "NE": (north, east)}
def read_unclustered_data(filename, num_clusters):
    """Return dictionary of cluster id to array of points.

    Given a filename in the format of lat, lng (after one header row),
    generate k clusters based on arguments. Outputs a dictionary with
    the cluster id as the key mapped to a list of lat, lng pts.

    Args:
        filename: path of the CSV file; the first line is skipped as a
            header and blank lines are ignored.
        num_clusters: value converted with int() and passed to kmeans()
            as the number of clusters.

    Returns:
        A defaultdict(list) keyed by the code that vq() assigned to each
        point, holding that cluster's (lat, lng) tuples in input order.

    Raises:
        ValueError: if the file has no data rows, or a row does not consist
            of exactly two comma-separated numbers.
    """
    request_points = []
    # Text mode keeps each line a str on both Python 2 and Python 3; in
    # binary mode Python 3 yields bytes, which cannot be split on ','.
    with open(filename, 'r') as input_file:
        next(input_file, None)  # Skip the header row (if there is one)
        for line in input_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            lat, lng = line.split(',')
            request_points.append((float(lat), float(lng)))
    if not request_points:
        raise ValueError("no data rows found in %s" % (filename,))
    request_points = array(request_points)

    # computing K-Means with K = num_clusters
    centroids, _ = kmeans(request_points, int(num_clusters))
    # assign each sample to a cluster
    idx, _ = vq(request_points, centroids)

    # map cluster lat, lng to cluster index
    cluster_points = defaultdict(list)
    for (lat, lng), cluster_index in zip(request_points, idx):
        cluster_points[cluster_index].append((lat, lng))
    return cluster_points
def read_clustered_data(filename):
    """Return dictionary of cluster id to array of points.

    Given a filename in the format of lat, lng, cluster_id (after one
    header row), outputs a dictionary with the cluster id as the key mapped
    to a list of lat, lng pts.

    Args:
        filename: path of the CSV file; the first line is skipped as a
            header and blank lines are ignored.

    Returns:
        A defaultdict(list) mapping int cluster ids to lists of
        (float lat, float lng) tuples in input order.

    Raises:
        ValueError: if a row does not consist of exactly three
            comma-separated values of the form number, number, integer.
    """
    cluster_points = defaultdict(list)
    # Text mode keeps each line a str on both Python 2 and Python 3; in
    # binary mode Python 3 yields bytes, which cannot be split on ','.
    with open(filename, 'r') as input_file:
        next(input_file, None)  # Skip the header row (if there is one)
        for line in input_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            lat, lng, cluster_id = line.split(',')
            cluster_points[int(cluster_id)].append((float(lat), float(lng)))
    return cluster_points
def output_formatted_polygon(id, polygon_points, prefix=None):
    """Writes out formatted polygons to stdout or file.

    Without a prefix the row ``neighborhood_<id>,x1 y1;x2 y2;x3 y3`` is
    printed to stdout. If prefix is defined, the row
    ``prefix,neighborhood_<id>,x1 y1;x2 y2;x3 y3`` is appended to the file
    ``<prefix>.csv`` instead.

    Args:
        id: cluster identifier interpolated into ``neighborhood_<id>``.
        polygon_points: iterable of two-item points (tuples, lists or any
            other iterable of exactly two values).
        prefix: optional label; a truthy value selects file output.
    """
    # tuple(pt) lets list/array points through as well: with a bare
    # non-tuple on the right of '%', only one of the two fields is filled
    # and formatting raises TypeError.
    formatted_points = ";".join(
        "%s %s" % tuple(pt) for pt in polygon_points)
    output = ["neighborhood_%s" % id, formatted_points]
    if prefix:
        output.insert(0, prefix)
        with open('%s.csv' % (prefix,), 'a') as output_file:
            output_file.write(','.join(output) + '\n')
    else:
        # Parenthesised single argument: valid as a Python 2 print
        # statement and as a Python 3 print() call, with the same output.
        print(','.join(output))
def get_polygon_center(points):
    """Return a 2-tuple approximating the center of a polygon.

    The "center" is the arithmetic mean of the first coordinates paired
    with the arithmetic mean of the second coordinates of ``points``.
    """
    def _axis_mean(axis):
        # Average one coordinate (0 or 1) over every point.
        return numpy.mean([vertex[axis] for vertex in points])

    return (_axis_mean(0), _axis_mean(1))
def get_convex_hull_polygon(points):
    """Return the convex hull of ``points`` as an ordered list of points.

    A ConvexHull is built from the points; its ``vertices`` entries are
    read as (vertex, neighbor) index pairs, walked in order with
    follow_vertices() starting from the first pair, and the resulting
    indices are looked up in ``hull.points``.
    """
    hull = ConvexHull(points)
    edges = hull.vertices

    # Map every vertex index to the index of its neighbor vertex.
    neighbor_of = {}
    for edge in edges:
        neighbor_of[edge[0]] = edge[1]

    ring = follow_vertices(edges[0][0], neighbor_of)
    return [hull.points[position] for position in ring]
def follow_vertices(start_vertex, vertices_dict):
    """Return the vertices visited when walking the neighbor map.

    Beginning at ``start_vertex``, repeatedly step to the neighbor stored
    in ``vertices_dict`` (vertex -> neighbor vertex) until the walk arrives
    back at ``start_vertex``. The returned list starts and ends with
    ``start_vertex``.
    """
    path = [start_vertex]
    while True:
        successor = vertices_dict[path[-1]]
        path.append(successor)
        if successor == start_vertex:
            # Back at the beginning: the ring is closed.
            return path
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to
    # parse_file() in positional order.
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    parser.add_argument(
        '--html',
        help="If specified, outputs html representation of clusters to filename.")
    # The original help text here was a truncated sentence; it now states
    # what a prefix does to the output.
    parser.add_argument(
        '--cluster-prefix',
        help="If specified, polygons are appended to <prefix>.csv, with the "
             "prefix as the first column, instead of being printed to stdout.")
    parser.add_argument(
        '--num-clusters', type=int,
        help="If specified, k means clusters are computed from the input file.")
    args = parser.parse_args()
    parse_file(args.filename, args.html, args.cluster_prefix, args.num_clusters)