
Commit 09be945
Add padding parameter
Jakob Bauer committed Sep 1, 2014
1 parent d23471e commit 09be945
Showing 9 changed files with 59 additions and 46 deletions.
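
Note: the commit threads a --padding option (default 4, set once in the Makefile via FORMATTING_FLAGS := --padding=4) through every script that generates nil-IDs, replacing the previously hard-coded zfill(3). A minimal Python sketch of the effect, using a made-up counter value of 7:

    # Effect of the new padding parameter on generated nil-IDs
    padding = 4                              # from --padding=4
    before = 'nil' + str(7).zfill(3)         # 'nil007'  (old hard-coded width)
    after = 'nil' + str(7).zfill(padding)    # 'nil0007' (new configurable width)
    print(before, after)
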
44 changes: 28 additions & 16 deletions Makefile
@@ -16,8 +16,10 @@ INSENT := /remote/curtis/krivard/2014/kbp.dataset.2014-0.4/kbp.cfacts/inSentence
TACPR := /remote/curtis/bbd/KBP_2014/alignKBs/e54_v11.docid_wp14_enType_score_begin_end_mention.TAC_id_name_type.genericType.txt
GOLD := /remote/curtis/krivard/2014/e54_v11.tac_2014_kbp_english_EDL_training_KB_links.tab

# parameters for baseline clustering using global or local context
# (NB these lines may be changed)
# formatting parameters
FORMATTING_FLAGS := --padding=4

# baseline clustering parameters
GLOBAL_BASELINE_CLUSTERING_FLAGS := --threshold=0.5
LOCAL_BASELINE_CLUSTERING_FLAGS := --threshold=0.7

@@ -233,33 +235,37 @@ baseline: $(baseline0) $(baseline1) $(baseline2) $(baseline3) \

# string only
$(baseline0): $(qid_did_string_eid) venv | $(outdir)
$(PYTHON) $(srcdir)/baseline0.py $(qid_did_string_eid) > $@
$(PYTHON) $(srcdir)/baseline0.py $(FORMATTING_FLAGS) \
$(qid_did_string_eid) > $@

# string and did
$(baseline1): $(qid_did_string_eid) venv | $(outdir)
$(PYTHON) $(srcdir)/baseline1.py $(qid_did_string_eid) > $@
$(PYTHON) $(srcdir)/baseline1.py $(FORMATTING_FLAGS) \
$(qid_did_string_eid) > $@

# string and document distance (agglomerative)
$(baseline2): $(qid_did_string_eid) $(did_tok) venv | $(outdir)
$(PYTHON) $(srcdir)/baseline2.py $(GLOBAL_BASELINE_CLUSTERING_FLAGS) \
$(PYTHON) $(srcdir)/baseline2.py $(FORMATTING_FLAGS) \
$(GLOBAL_BASELINE_CLUSTERING_FLAGS) \
$(qid_did_string_eid) $(did_tok) > $@

# string and document distance (exploratory)
$(baseline3): $(qid_did_string_eid) $(did_tok) venv | $(outdir) $(iptdir)
rm -rf $(iptdir)/*
$(PYTHON) $(srcdir)/baseline3.py \
$(PYTHON) $(srcdir)/baseline3.py $(FORMATTING_FLAGS) \
$(qid_did_string_eid) $(did_tok) $(expdir) > $@

# string and sentence distance (agglomerative)
$(baseline4): $(qid_sid_string_eid) $(sid_tok) venv | $(outdir)
$(PYTHON) $(srcdir)/baseline4.py $(LOCAL_BASELINE_CLUSTERING_FLAGS) \
$(PYTHON) $(srcdir)/baseline4.py $(FORMATTING_FLAGS) \
$(LOCAL_BASELINE_CLUSTERING_FLAGS) \
$(qid_sid_string_eid) $(sid_tok) > $@

# string and sentence distance (exploratory)
# TODO PROBLEM HERE WHEN USING TRAIN + TEST DATA ###
$(baseline5): $(qid_sid_string_eid) $(sid_tok) venv | $(outdir) $(iptdir)
rm -rf $(iptdir)/*
$(PYTHON) $(srcdir)/baseline5.py \
$(PYTHON) $(srcdir)/baseline5.py $(FORMATTING_FLAGS) \
$(qid_sid_string_eid) $(sid_tok) $(expdir) > $@

# string and sentence distance (agglomerative) where available;
Expand All @@ -269,7 +275,7 @@ $(baseline6): $(qid_did_string_eid) $(did_tok) $(baseline4) venv | $(outdir)
$(PYTHON) $(srcdir)/generate_qid_did_string_eid_local.py \
$(qid_did_string_eid) $(baseline4) > $(qid_did_string_eid_agglomerative)
# step 2: perform document distance clustering on remaining nils
$(PYTHON) $(srcdir)/baseline2.py --existing \
$(PYTHON) $(srcdir)/baseline2.py --existing $(FORMATTING_FLAGS) \
$(qid_did_string_eid_agglomerative) $(did_tok) > $@
# TODO CHECK RESULT

Expand All @@ -282,7 +288,7 @@ $(baseline7): $(qid_did_string_eid) $(did_tok) $(baseline5) venv | $(outdir) \
$(qid_did_string_eid) $(baseline5) > $(qid_did_string_eid_exploratory)
# step 2: perform document distance clustering on remaining nils
rm -rf $(iptdir)/*
$(PYTHON) $(srcdir)/baseline3.py --existing \
$(PYTHON) $(srcdir)/baseline3.py --existing $(FORMATTING_FLAGS) \
$(qid_did_string_eid_exploratory) $(did_tok) $(expdir) > $@
# TODO CHECK RESULT

Expand All @@ -301,7 +307,8 @@ $(unsupervised0): $(rid_fid_weight_global) $(qid_rid) $(qid_eid) venv | \
# TODO WORKAROUND: SEED FILE WITH ONLY ONE DATAPOINT
echo "1\t1" > $(data_Y)
cd $(expdir); matlab $(M_FLAGS) $(EM_MAIN)
$(PYTHON) $(srcdir)/exploratory.py $(assgn_suffix) $(qid_rid) $(qid_eid) > $@
$(PYTHON) $(srcdir)/exploratory.py $(FORMATTING_FLAGS) \
$(assgn_suffix) $(qid_rid) $(qid_eid) > $@

# unsupervised with local context only
$(unsupervised1): $(rid_fid_weight_local) $(qid_rid) $(qid_eid) venv | \
Expand All @@ -311,7 +318,8 @@ $(unsupervised1): $(rid_fid_weight_local) $(qid_rid) $(qid_eid) venv | \
# TODO WORKAROUND: SEED FILE WITH ONLY ONE DATAPOINT
echo "1\t1" > $(data_Y)
cd $(expdir); matlab $(M_FLAGS) $(EM_MAIN)
$(PYTHON) $(srcdir)/exploratory.py $(assgn_suffix) $(qid_rid) $(qid_eid) > $@
$(PYTHON) $(srcdir)/exploratory.py $(FORMATTING_FLAGS) \
$(assgn_suffix) $(qid_rid) $(qid_eid) > $@

# unsupervised with global and local context
$(unsupervised2): $(rid_fid_weight) $(qid_rid) $(qid_eid) venv | \
Expand All @@ -321,7 +329,8 @@ $(unsupervised2): $(rid_fid_weight) $(qid_rid) $(qid_eid) venv | \
# TODO WORKAROUND: SEED FILE WITH ONLY ONE DATAPOINT
echo "1\t1" > $(data_Y)
cd $(expdir); matlab $(M_FLAGS) $(EM_MAIN)
$(PYTHON) $(srcdir)/exploratory.py $(assgn_suffix) $(qid_rid) $(qid_eid) > $@
$(PYTHON) $(srcdir)/exploratory.py $(FORMATTING_FLAGS) \
$(assgn_suffix) $(qid_rid) $(qid_eid) > $@

# semi-supervised with global context only
$(semi_supervised0): $(rid_fid_weight_global) $(rid_lid_score) $(qid_rid) \
Expand All @@ -331,7 +340,8 @@ $(semi_supervised0): $(rid_fid_weight_global) $(rid_lid_score) $(qid_rid) \
cp $(rid_lid_score) $(seeds_Y)
cp $(rid_lid_score) $(data_Y)
cd $(expdir); matlab $(M_FLAGS) $(EM_MAIN)
$(PYTHON) $(srcdir)/exploratory.py $(assgn_suffix) $(qid_rid) $(qid_eid) > $@
$(PYTHON) $(srcdir)/exploratory.py $(FORMATTING_FLAGS) \
$(assgn_suffix) $(qid_rid) $(qid_eid) > $@

# semi-supervised with local context only
$(semi_supervised1): $(rid_fid_weight_local) $(rid_lid_score) $(qid_rid) \
Expand All @@ -341,7 +351,8 @@ $(semi_supervised1): $(rid_fid_weight_local) $(rid_lid_score) $(qid_rid) \
cp $(rid_lid_score) $(seeds_Y)
cp $(rid_lid_score) $(data_Y)
cd $(expdir); matlab $(M_FLAGS) $(EM_MAIN)
$(PYTHON) $(srcdir)/exploratory.py $(assgn_suffix) $(qid_rid) $(qid_eid) > $@
$(PYTHON) $(srcdir)/exploratory.py $(FORMATTING_FLAGS) \
$(assgn_suffix) $(qid_rid) $(qid_eid) > $@

# semi-supervised with global and local context
$(semi_supervised2): $(rid_fid_weight) $(rid_lid_score) $(qid_rid) \
Expand All @@ -351,7 +362,8 @@ $(semi_supervised2): $(rid_fid_weight) $(rid_lid_score) $(qid_rid) \
cp $(rid_lid_score) $(seeds_Y)
cp $(rid_lid_score) $(data_Y)
cd $(expdir); matlab $(M_FLAGS) $(EM_MAIN)
$(PYTHON) $(srcdir)/exploratory.py $(assgn_suffix) $(qid_rid) $(qid_eid) > $@
$(PYTHON) $(srcdir)/exploratory.py $(FORMATTING_FLAGS) \
$(assgn_suffix) $(qid_rid) $(qid_eid) > $@

# ==============================================================================

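Note: because FORMATTING_FLAGS is an ordinary := assignment, the width can presumably also be changed per run without editing the Makefile, e.g. make baseline FORMATTING_FLAGS=--padding=6 (assuming GNU make, where command-line variable assignments override those in the makefile).
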
24 changes: 0 additions & 24 deletions resources/ExploreEM_package_v2/ABIC_ExplEM_KM.m
@@ -92,21 +92,10 @@
BaselineAssn(i) = cID;
P_Cj_Xi(i, :) = zeros(1, numClasses);
else
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEBUG
S=sprintf('iteration: %d', i);
disp(S);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Find out the entities which did not
% belong to any of the existing
% clusters, and put them in new clusters
if (numClasses == 0)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEBUG
S=sprintf('numClasses: %d', numClasses);
disp(S);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
decision =1;
maxWt = 0;
maxI = 0;
@@ -152,26 +141,13 @@
P_Cj_XiNorm(i+1:numDocs, :) = normrow(P_Cj_Xi(i+1:numDocs, :));
end
else
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEBUG
S=sprintf('cID : %d, maxI : %d', cID, maxI);
disp(S);
% cID is zero!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
cID = maxI;
end
end
if (hard == 1)
P_Cj_Xi(i, :) = zeros(1, numClasses);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEBUG
S=sprintf('i : %d, cID : %d, maxI : %d', i, cID, maxI);
disp(S);
% cID is zero!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

P_Cj_Xi(i, cID) = maxWt;
NewAssgn(i) = cID;
ExploreLL = ExploreLL + log(maxWt);
6 changes: 5 additions & 1 deletion src/baseline0.py
@@ -8,8 +8,12 @@
# Parse path to input files
parser = argparse.ArgumentParser()
parser.add_argument('QID_DID_STRING_EID')
parser.add_argument('--padding', help='amount of padding in nid',
type=int, default=4)

args = parser.parse_args()
QID_DID_STRING_EID = args.QID_DID_STRING_EID
padding = args.padding

# Load 'qid did string eid' into dataframe
df1 = pd.read_table(QID_DID_STRING_EID, header=None,
Expand All @@ -23,7 +27,7 @@

# Assign nil-ID
c = count(start=1)
assign = lambda x: 'nil' + str(c.next()).zfill(3)
assign = lambda x: 'nil' + str(c.next()).zfill(padding)
df2['eid'] = df2.groupby('string')['eid'].transform(assign)

# Merge nil-IDs back into original dataframe
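
For reference, a self-contained sketch of the baseline0-style nil assignment with the configurable width applied. The sample frame and values are invented, and next(c) stands in for the script's Python 2 c.next():

    # Sketch: one padded nil-ID per distinct mention string (baseline0 pattern)
    from itertools import count
    import pandas as pd

    padding = 4
    df = pd.DataFrame({'qid': ['q1', 'q2', 'q3'],
                       'string': ['Smith', 'Smith', 'Jones'],
                       'eid': [None, None, None]})
    c = count(start=1)
    assign = lambda x: 'nil' + str(next(c)).zfill(padding)
    df['eid'] = df.groupby('string')['eid'].transform(assign)
    print(df)  # rows with the same string share one nil-ID, padded to 4 digits
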
6 changes: 5 additions & 1 deletion src/baseline1.py
@@ -8,8 +8,12 @@
# Parse path to input files
parser = argparse.ArgumentParser()
parser.add_argument('QID_DID_STRING_EID')
parser.add_argument('--padding', help='amount of padding in nid',
type=int, default=4)

args = parser.parse_args()
QID_DID_STRING_EID = args.QID_DID_STRING_EID
padding = args.padding

# Load 'qid did string eid' into dataframe
df1 = pd.read_table(QID_DID_STRING_EID, header=None,
Expand All @@ -23,7 +27,7 @@

# Assign nil-ID
c = count(start=1)
assign = lambda x: 'nil' + str(c.next()).zfill(3)
assign = lambda x: 'nil' + str(c.next()).zfill(padding)
df2['eid'] = df2.groupby(['string', 'did'])['eid'].transform(assign)

# Merge nil-IDs back into original dataframe
6 changes: 5 additions & 1 deletion src/baseline2.py
@@ -18,6 +18,9 @@
default='euclidean')
parser.add_argument('--method', help='clustering method', default='single')
parser.add_argument('--threshold', help='clustering threshold', default='0.5')
parser.add_argument('--padding', help='amount of padding in nid',
type=int, default=4)

args = parser.parse_args()
QID_DID_STRING_EID = args.QID_DID_STRING_EID
DID_TOK = args.DID_TOK
Expand All @@ -26,6 +29,7 @@
metric_cluster= args.cluster
method = args.method
threshold = float(args.threshold)
padding = args.padding

# Load 'qid did string eid' into dataframe
df1 = pd.read_table(QID_DID_STRING_EID, header=None,
@@ -79,7 +83,7 @@
for i, did in enumerate(docs):
nid[(string, did)] = ids[i]

assignID = lambda x: 'nil' + str(nid[tuple(x)]).zfill(3)
assignID = lambda x: 'nil' + str(nid[tuple(x)]).zfill(padding)
df2['eid'] = df2[['string', 'did']].apply(assignID, axis=1)

df5 = df2.combine_first(df1)
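
A rough, self-contained sketch of how the padded IDs combine with the agglomerative clustering step in baseline2.py. The random vectors stand in for whatever document representation the script builds from DID_TOK; method, metric, and threshold mirror the argparse defaults above:

    # Sketch: single-linkage clustering cut at a distance threshold,
    # then one padded nil-ID per resulting cluster (baseline2 pattern)
    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster

    padding = 4
    threshold = 0.5
    X = np.random.rand(6, 10)  # placeholder document vectors
    Z = linkage(X, method='single', metric='euclidean')
    cluster_ids = fcluster(Z, t=threshold, criterion='distance')
    nil_ids = ['nil' + str(cid).zfill(padding) for cid in cluster_ids]
    print(nil_ids)  # e.g. ['nil0001', 'nil0002', ...]
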
3 changes: 3 additions & 0 deletions src/baseline3.py
@@ -15,12 +15,15 @@
parser.add_argument('EXPLORE_EM')
parser.add_argument('--existing', help='pre-existing nill ids',
action='store_true')
parser.add_argument('--padding', help='amount of padding in nid',
type=int, default=4)

args = parser.parse_args()
QID_DID_STRING_EID = args.QID_DID_STRING_EID
DID_TOK = args.DID_TOK
EXPLORE_EM = args.EXPLORE_EM
EXISTING = args.existing
padding = args.padding

# Load 'qid did string eid' into dataframe
df1 = pd.read_table(QID_DID_STRING_EID, header=None,
6 changes: 5 additions & 1 deletion src/baseline4.py
@@ -16,13 +16,17 @@
default='euclidean')
parser.add_argument('--method', help='clustering method', default='single')
parser.add_argument('--threshold', help='clustering threshold', default='0.5')
parser.add_argument('--padding', help='amount of padding in nid',
type=int, default=4)

args = parser.parse_args()
QID_SID_STRING_EID = args.QID_SID_STRING_EID
SID_TOK = args.SID_TOK
metric_pairwise = args.pairwise
metric_cluster= args.cluster
method = args.method
threshold = float(args.threshold)
padding = args.padding

# Load 'qid sid string eid' into dataframe
df1 = pd.read_table(QID_SID_STRING_EID, header=None,
@@ -69,7 +73,7 @@
for i, sid in enumerate(docs):
nid[(string, sid)] = ids[i]

assignID = lambda x: 'nil' + str(nid[tuple(x)]).zfill(3)
assignID = lambda x: 'nil' + str(nid[tuple(x)]).zfill(padding)
df2['eid'] = df2[['string', 'sid']].apply(assignID, axis=1)

df5 = df2.combine_first(df1)
5 changes: 4 additions & 1 deletion src/baseline5.py
@@ -13,11 +13,14 @@
parser.add_argument('QID_SID_STRING_EID')
parser.add_argument('SID_TOK')
parser.add_argument('EXPLORE_EM')
parser.add_argument('--padding', help='amount of padding in nid',
type=int, default=4)

args = parser.parse_args()
QID_SID_STRING_EID = args.QID_SID_STRING_EID
SID_TOK = args.SID_TOK
EXPLORE_EM = args.EXPLORE_EM
padding = args.padding

# Load 'qid sid string eid' into dataframe
df1 = pd.read_table(QID_SID_STRING_EID, header=None,
@@ -91,7 +94,7 @@
for i, sid in enumerate(docs):
nid[(string, sid)] = ids[i]

assignID = lambda x: 'nil' + str(nid[tuple(x)]).zfill(3)
assignID = lambda x: 'nil' + str(nid[tuple(x)]).zfill(padding)
df2['eid'] = df2[['string', 'sid']].apply(assignID, axis=1)

df5 = df2.combine_first(df1)
5 changes: 4 additions & 1 deletion src/exploratory.py
@@ -10,11 +10,14 @@
parser.add_argument('ASSGN')
parser.add_argument('QID_RID')
parser.add_argument('QID_EID')
parser.add_argument('--padding', help='amount of padding in nid',
type=int, default=4)

args = parser.parse_args()
ASSGN = args.ASSGN
QID_RID = args.QID_RID
QID_EID = args.QID_EID
padding = args.padding

# Load ExploreEM cluster assignments
assignments = np.loadtxt(open(ASSGN))
Expand All @@ -32,7 +35,7 @@
df2 = pd.read_table(QID_EID, header=None, names=['qid', 'eid'])

df3 = pd.merge(df2, df1, on='qid')
df3['eid'] = df3.cid.map(lambda x: 'nil' + str(int(x)).zfill(3))
df3['eid'] = df3.cid.map(lambda x: 'nil' + str(int(x)).zfill(padding))

df2 = df2.set_index('qid')
df3 = df3[['qid', 'eid']].set_index('qid')
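
Finally, the exploratory.py pattern in isolation: numeric cluster ids (floats, as np.loadtxt returns them) are mapped onto padded nil-IDs. The sample values below are made up:

    # Sketch: map cluster ids to padded nil-IDs (exploratory.py pattern)
    import pandas as pd

    padding = 4
    df3 = pd.DataFrame({'qid': ['q1', 'q2', 'q3'], 'cid': [2.0, 2.0, 5.0]})
    df3['eid'] = df3.cid.map(lambda x: 'nil' + str(int(x)).zfill(padding))
    print(df3)  # q1 and q2 share nil0002; q3 gets nil0005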
