Integrate fixed version of exploreEM into module

krivard · Sep 2, 2014 · 27832cf · 27832cf
1 parent 9c73c51
commit 27832cf
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 8 deletions.
diff --git a/Makefile b/Makefile
@@ -231,8 +231,8 @@ $(gold_qid_eid): $(GOLD) $(baseline0) venv | $(datdir)
 # baseline clustering
 .PHONY: baseline
 baseline: $(baseline0) $(baseline1) $(baseline2) $(baseline3) \
-	$(baseline4) $(baseline6)
-	#$(baseline4) $(baseline5) $(baseline6) $(baseline7)
+	$(baseline4) $(baseline5) $(baseline6) $(baseline7)
+	#$(baseline4) $(baseline6)
 
 # string only
 $(baseline0): $(qid_did_string_eid) venv | $(outdir)

diff --git a/README b/README
@@ -0,0 +1,33 @@
+################################################################################
+#                   README NILCLUSTERING MODULE TAC/KBP 2014                   #
+################################################################################
+
+Configuration
+-------------
+
+Before installing and running the code, it is necessary to specify the paths to
+the external data (e.g., ProPPR output). This has to be done in the first
+section of the Makefile. The necessary files are:
+
+[DESCRIPTION OF FILES]
+
+[also: formatting]
+
+Installation
+------------
+
+Run "make" inside the main directory. This will set up a virtual environment
+with the necessary python dependencies and then run the different clustering
+scripts. If you only want to install the virtual environment without running 
+the clustering scripts, use "make venv" instead. 
+
+Running the code
+----------------
+
+[all, raw, baseline, explore targets]
+[Describe different stages of clustering]
+[Describe output]
+
+Cleaning up
+-----------
+
diff --git a/resources/ExploreEM_package_v2/ABIC_ExplEM_KM.m b/resources/ExploreEM_package_v2/ABIC_ExplEM_KM.m
@@ -58,7 +58,7 @@
     BaselineLL = 0;
     ExploreLL = 0;
     BaselineNumClusters = numClasses;
-    numClasses
+    %numClasses
     %iter
     % E step : Estimate P(cluster_j | X_i) based on cosine similarity
     S=sprintf('-------- KM Iteration : %d : E step started -------------', iter);
@@ -147,7 +147,7 @@
         if (hard == 1)
             P_Cj_Xi(i, :) = zeros(1, numClasses);
         end
-
+        
         P_Cj_Xi(i, cID) = maxWt;
         NewAssgn(i) = cID;
         ExploreLL = ExploreLL + log(maxWt);
@@ -166,7 +166,8 @@
         % Temporarily recompute centroids
         TcentroidsTemp = P_Cj_XiNorm' * XNorm;
         TnumClasses = size(TcentroidsTemp,1);
-
+
+        %clusterSizes 
         % Keep only those clusters which have at least 1 point assigned to it
         Tcentroids = TcentroidsTemp(1:numSeedClasses, :);
         for c = numSeedClasses+1 : numClasses
@@ -215,6 +216,7 @@
                 explore = 0;
                 NewAssgn = BaselineAssn;
                 P_Cj_XiNorm = normrow(Baseline_P_Cj_Xi);
+                %size(Baseline_P_Cj_Xi)
                 newLL = BaselineLL;
             end
         end
@@ -227,9 +229,18 @@
     centroidsTemp = P_Cj_XiNorm' * XNorm;
     numClasses = size(centroidsTemp,1);
 
-    % Keep only those clusters which have at least 1 point assigned to it
-    centroids = centroidsTemp(1:numSeedClasses, :);
-    for c = numSeedClasses+1 : numClasses
+    % Keep only those clusters which have more than 1 data point assigned to them,
+    % except when #seed classes = 0, in which case we keep at least 1 class to carry forward
+    if numSeedClasses == 0
+	numSeedClassesTemp = 1;
+    else
+	numSeedClassesTemp = numSeedClasses;
+    end
+    centroids = centroidsTemp(1:numSeedClassesTemp, :);
+    %numSeedClasses
+    %sum(P_Cj_XiNorm(:,1))
+    %clusterSizes
+    for c = numSeedClassesTemp+1 : numClasses
         if (sum(P_Cj_XiNorm(:,c)) > 0  && clusterSizes(c) > 1)
             centroids = [centroids ; centroidsTemp(c, :)];
         end