Skip to content

Commit

Permalink
Integrate fixed version of exploreEM into module
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakob Bauer committed Sep 2, 2014
1 parent 9c73c51 commit 27832cf
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 8 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,8 @@ $(gold_qid_eid): $(GOLD) $(baseline0) venv | $(datdir)
# baseline clustering
.PHONY: baseline
baseline: $(baseline0) $(baseline1) $(baseline2) $(baseline3) \
$(baseline4) $(baseline6)
#$(baseline4) $(baseline5) $(baseline6) $(baseline7)
$(baseline4) $(baseline5) $(baseline6) $(baseline7)
#$(baseline4) $(baseline6)

# string only
$(baseline0): $(qid_did_string_eid) venv | $(outdir)
Expand Down
33 changes: 33 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
################################################################################
# README NILCLUSTERING MODULE TAC/KBP 2014 #
################################################################################

Configuration
-------------

Before installing and running the code, it is necessary to specify the paths
to the external data (e.g., ProPPR output). This has to be done in the first
section of the Makefile. The necessary files are:

[DESCRIPTION OF FILES]

[also: formatting]

Installation
------------

Run "make" inside the main directory. This will set up a virtual environment
with the necessary python dependencies and then run the different clustering
scripts. If you only want to install the virtual environment without running
the clustering scripts, use "make venv" instead.

Running the code
----------------

[all, raw, baseline, explore targets]
[Describe different stages of clustering]
[Describe output]

Cleaning up
-----------

23 changes: 17 additions & 6 deletions resources/ExploreEM_package_v2/ABIC_ExplEM_KM.m
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
BaselineLL = 0;
ExploreLL = 0;
BaselineNumClusters = numClasses;
numClasses
%numClasses
%iter
% E step : Estimate P(cluster_j | X_i) based on cosine similarity
S=sprintf('-------- KM Iteration : %d : E step started -------------', iter);
Expand Down Expand Up @@ -147,7 +147,7 @@
if (hard == 1)
P_Cj_Xi(i, :) = zeros(1, numClasses);
end

P_Cj_Xi(i, cID) = maxWt;
NewAssgn(i) = cID;
ExploreLL = ExploreLL + log(maxWt);
Expand All @@ -166,7 +166,8 @@
% Temp: recompute centroids
TcentroidsTemp = P_Cj_XiNorm' * XNorm;
TnumClasses = size(TcentroidsTemp,1);


%clusterSizes
% Keep only those clusters which have at least 1 point assigned to it
Tcentroids = TcentroidsTemp(1:numSeedClasses, :);
for c = numSeedClasses+1 : numClasses
Expand Down Expand Up @@ -215,6 +216,7 @@
explore = 0;
NewAssgn = BaselineAssn;
P_Cj_XiNorm = normrow(Baseline_P_Cj_Xi);
%size(Baseline_P_Cj_Xi)
newLL = BaselineLL;
end
end
Expand All @@ -227,9 +229,18 @@
centroidsTemp = P_Cj_XiNorm' * XNorm;
numClasses = size(centroidsTemp,1);

% Keep only those clusters which have at least 1 point assigned to it
centroids = centroidsTemp(1:numSeedClasses, :);
for c = numSeedClasses+1 : numClasses
% Keep only those clusters which have more than 1 datapoint assigned to them.
% Exception: when #seed classes = 0, we keep at least 1 class to carry forward.
if numSeedClasses == 0
numSeedClassesTemp = 1;
else
numSeedClassesTemp = numSeedClasses;
end
centroids = centroidsTemp(1:numSeedClassesTemp, :);
%numSeedClasses
%sum(P_Cj_XiNorm(:,1))
%clusterSizes
for c = numSeedClassesTemp+1 : numClasses
if (sum(P_Cj_XiNorm(:,c)) > 0 && clusterSizes(c) > 1)
centroids = [centroids ; centroidsTemp(c, :)];
end
Expand Down

0 comments on commit 27832cf

Please sign in to comment.