
Commit

before going the R stats route
seekshreyas committed Dec 9, 2013
1 parent 3f0b5d9 commit a18217e
Showing 1 changed file with 98 additions and 12 deletions.
110 changes: 98 additions & 12 deletions clustering.py
@@ -12,6 +12,17 @@
from classifier import getAnalysisData
from nltk import cluster
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pylab as pl
from sklearn import metrics
from time import time
from scipy.cluster import vq
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from scipy.spatial.distance import pdist, squareform
import scipy.cluster.hierarchy as hy



def getUserInput():
@@ -38,19 +49,28 @@ def createCluster(data, cltype):
    # pprint(data)

    vectors = []
    labels = []

    # Derive a boolean label for each row: True for 'fair', False for 'unfair'.
    for row in data:
        rowval = []
        labels = []
        labelval = True
        if row[1] == 'unfair':
            labelval = False
        labels.append(labelval)

        # Coerce every feature value to a 0/1 integer so each vector is numeric.
        for k, v in row[0].iteritems():

            if k == 'hasPrivacy' or k == 'hasDeveloperEmail' or k == 'hasDeveloperWebsite':
                v = bool(v)

            v = int(bool(v))
            rowval.append(v)

        vectors.append(np.array(rowval))

    pprint(vectors)
    # vectors = np.asarray(vectors)
    # Stack the per-row vectors into a single (n_samples, n_features) array.
    data = np.vstack(vectors)

    # Initial means for the EM clusterer, taken from two of the feature vectors.
    means = [vectors[20].tolist(), vectors[21].tolist()]

@@ -59,16 +79,82 @@ def createCluster(data, cltype):
        clusters = clusterer.cluster(vectors, True)
        clusterer.dendrogram().show()
    elif cltype == 'kmeans':
        clusterer = cluster.EMClusterer(initial_means=means)
        clusters = clusterer.cluster(vectors, assign_clusters=True, trace=False)

        for c in range(2):
            print 'Clustered:', vectors
            print 'As:', clusters
            print 'Cluster:', c
            print 'Prior: ', clusterer._priors[c]
            print 'Mean: ', clusterer._means[c]
            print 'Covar: ', clusterer._covariance_matrices[c]
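        # k-means with scipy.cluster.vq: compute 3 centroids, then assign each feature vector to its nearest centroid.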
        centroids, variance = vq.kmeans(data, 3)
        identified, distance = vq.vq(data, centroids)

        print identified
        print centroids

        print variance
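        # A possible next step (not in this commit): score the assignment with the sklearn metrics import, e.g.
        # print metrics.silhouette_score(data, identified)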


    elif cltype == 'hy':
        # Helper that generates a random "cluster of clusters" of 3-D points.
        def clusters(number=20, cnumber=5, csize=10):
            # Cluster centres are placed with Gaussian randomness.
            rnum = np.random.rand(cnumber, 2)
            rn = rnum[:, 0] * number
            rn = rn.astype(int)
            rn[np.where(rn < 5)] = 5
            rn[np.where(rn > number / 2.)] = round(number / 2., 0)
            ra = rnum[:, 1] * 2.9
            ra[np.where(ra < 1.5)] = 1.5

            cls = np.random.randn(number, 3) * csize

            # Random multipliers for the central point of each cluster
            rxyz = np.random.randn(cnumber - 1, 3)
            for i in xrange(cnumber - 1):
                tmp = np.random.randn(rn[i + 1], 3)
                x = tmp[:, 0] + (rxyz[i, 0] * csize)
                y = tmp[:, 1] + (rxyz[i, 1] * csize)
                z = tmp[:, 2] + (rxyz[i, 2] * csize)
                tmp = np.column_stack([x, y, z])
                cls = np.vstack([cls, tmp])
            return cls

        # Generate a cluster of clusters and its pairwise distance matrix.
        cls = clusters()
        D = pdist(cls[:, 0:2])
        D = squareform(D)

        # Compute and plot first dendrogram.
        fig = plt.figure(figsize=(8, 8))
        ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
        Y1 = hy.linkage(D, method='complete')
        cutoff = 0.3 * np.max(Y1[:, 2])
        Z1 = hy.dendrogram(Y1, orientation='right', color_threshold=cutoff)
        ax1.xaxis.set_visible(False)
        ax1.yaxis.set_visible(False)

        # Compute and plot second dendrogram.
        ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
        Y2 = hy.linkage(D, method='average')
        cutoff = 0.3 * np.max(Y2[:, 2])
        Z2 = hy.dendrogram(Y2, color_threshold=cutoff)
        ax2.xaxis.set_visible(False)
        ax2.yaxis.set_visible(False)

        # Plot the distance matrix, reordered by the two dendrograms.
        ax3 = fig.add_axes([0.3, 0.1, 0.6, 0.6])
        idx1 = Z1['leaves']
        idx2 = Z2['leaves']
        D = D[idx1, :]
        D = D[:, idx2]
        ax3.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
        ax3.xaxis.set_visible(False)
        ax3.yaxis.set_visible(False)

        # Save the figure.
        fig.savefig('scipy_352_ex1.pdf', bbox_inches='tight')
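
The scikit-learn KMeans, PCA, metrics and pylab imports added at the top of clustering.py are not used in the hunks shown here. A minimal sketch of how they might be wired in, assuming `data` is the stacked 0/1 feature matrix built in createCluster; the helper name sklearnCluster and the n_clusters=3 choice (mirroring the vq.kmeans call above) are illustrative, not part of the commit:

    def sklearnCluster(data, n_clusters=3):
        # Fit scikit-learn's KMeans on the raw feature matrix.
        km = KMeans(n_clusters=n_clusters)
        km.fit(data)

        # Project to 2-D with PCA, purely for visualisation.
        reduced = PCA(n_components=2).fit_transform(data)
        pl.scatter(reduced[:, 0], reduced[:, 1], c=km.labels_)
        pl.show()

        # Internal quality measure for the resulting assignment.
        print metrics.silhouette_score(data, km.labels_)
        return km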










