forked from BIDData/BIDMach
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetrcv1.ssc
75 lines (68 loc) · 2.52 KB
/
getrcv1.ssc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/**
 * Script helpers that convert the tokenized RCV1 files found under a directory
 * into matrix files: remapped token streams, sparse document matrices and
 * category matrices.
 *
 * Every path is built by plain string concatenation onto `dict`, so `dict` must
 * already end with a path separator.
 */
object RCV1 {

  /**
   * Runs all stages in order: preprocessing, then the sparse matrix and the
   * category matrix for the unprefixed files, then the same two stages for the
   * "test"-prefixed files.
   *
   * @param dict directory prefix prepended to every file name read or written
   */
  def prepare(dict: String): Unit = {
    println("Preprocessing")
    preprocess(dict)
    println("Making Sparse Data Matrix")
    mksparse(dict, "")
    println("Making Category Matrix")
    mkcats(dict, "")
    println("Making Sparse Test Data Matrix")
    mksparse(dict, "test")
    println("Making Test Category Matrix")
    mkcats(dict, "test")
  }

  /**
   * Loads the dictionary, its word counts and the five token files, reorders the
   * dictionary according to `sortdown2` applied to the counts, rewrites every
   * positive token id to its position in the reordered dictionary, and saves the
   * token streams, the reordered dictionary and the reordered counts.
   *
   * The four `lyrl2004_tokens_test_pt*` files are stacked and written to
   * "tokens.imat.lz4", while `lyrl2004_tokens_train` is written to
   * "testtokens.imat.lz4".
   * NOTE(review): presumably an intentional swap of the two splits -- confirm.
   *
   * @param dict directory prefix prepended to every file name read or written
   */
  def preprocess(dict: String): Unit = {
    val wordList = CSMat(loadSBMat(dict+"dict.sbmat.gz"))
    val wordCounts = loadIMat(dict+"dict.imat.gz")
    val part0 = loadIMat(dict+"lyrl2004_tokens_test_pt0.dat.imat.gz")
    val part1 = loadIMat(dict+"lyrl2004_tokens_test_pt1.dat.imat.gz")
    val part2 = loadIMat(dict+"lyrl2004_tokens_test_pt2.dat.imat.gz")
    val part3 = loadIMat(dict+"lyrl2004_tokens_test_pt3.dat.imat.gz")
    val trainPart = loadIMat(dict+"lyrl2004_tokens_train.dat.imat.gz")

    // Stack the four parts vertically into a single token column.
    val stacked = (part0 on part1) on (part2 on part3)

    val (sortedCounts, perm) = sortdown2(wordCounts)
    val sortedWords = SBMat(wordList(perm))

    // Scatter 0..nwords-1 into the positions listed in perm, so that
    // invPerm(perm(k)) = k, i.e. old dictionary index -> new dictionary index.
    val nwords = perm.length
    val invPerm = izeros(nwords, 1)
    invPerm(perm) = icol(0->nwords)

    // Only strictly positive entries are remapped; the "-1" suggests ids in the
    // input files are 1-based -- TODO confirm.
    val stackedPos = find(stacked > 0)
    stacked(stackedPos,0) = invPerm(stacked(stackedPos,0)-1)
    val trainPos = find(trainPart > 0)
    trainPart(trainPos,0) = invPerm(trainPart(trainPos,0)-1)

    saveIMat(dict+"tokens.imat.lz4", stacked)
    saveIMat(dict+"testtokens.imat.lz4", trainPart)
    saveSBMat(dict+"../sdict.sbmat.lz4", sortedWords)
    saveIMat(dict+"../swcount.imat.lz4", sortedCounts)
  }

  /**
   * Turns the token stream `dict+prefix+"tokens.imat.lz4"` into a sparse matrix
   * with one column per ".i" marker found in the stream and `swc.length` rows,
   * and saves it together with the vector of values that follow each ".i" marker.
   *
   * @param dict   directory prefix prepended to every file name read or written
   * @param prefix file-name prefix selecting the token file ("" or "test" in this script)
   */
  def mksparse(dict: String, prefix: String): Unit = {
    val tokenStream = loadIMat(dict+prefix+"tokens.imat.lz4")
    val dictionary = Dict(loadSBMat(dict+"../sdict.sbmat.lz4"))
    val sortedCounts = loadIMat(dict+"../swcount.imat.lz4")

    // Two-column work table: column 0 becomes the document index, column 1 the token id.
    val table = izeros(tokenStream.nrows,2)
    table(?,1) = tokenStream

    // Positions of the ".i" and ".w" marker tokens; both are flagged with -1 so
    // they are dropped by the ">= 0" filter below.
    val docMarks = find(tokenStream == dictionary(".i"))
    val wordMarks = find(tokenStream == dictionary(".w"))
    table(docMarks,1) = -1
    table(wordMarks,1) = -1

    // The entry right after each ".i" marker is kept as the lookup value for that
    // document. NOTE(review): clearbit presumably clears a flag bit in place --
    // confirm against BIDMach.Experiments.
    val lkup = tokenStream(docMarks+1)
    BIDMach.Experiments.clearbit(lkup)

    // A running sum over the ".i" flags numbers the documents; row 0 is reset to 0
    // before summing so numbering starts at 0 when the stream begins with ".i".
    table(docMarks,0) = 1
    table(0,0) = 0
    table(?,0) = cumsum(table(?,0))

    val keptRows = find(table(?,1) >= 0)
    val kept = table(keptRows,?)
    val docMatrix = sparse(kept(?,1), kept(?,0), ones(kept.nrows,1), sortedCounts.length, docMarks.length)

    saveSMat(dict+"../"+prefix+"docs.smat.lz4", docMatrix)
    saveIMat(dict+"../"+prefix+"lkup.imat.lz4", lkup)
  }

  /**
   * Builds a 0/1 category-by-document matrix from the `catname` / `docid` pair
   * files, selects the columns listed in the saved lookup vector, and saves the
   * result as a float matrix.
   *
   * @param dict   directory prefix prepended to every file name read or written
   * @param prefix file-name prefix selecting the lookup file and naming the output
   */
  def mkcats(dict: String, prefix: String): Unit = {
    val lkup = loadIMat(dict+"../"+prefix+"lkup.imat.lz4")
    val catIds = loadIMat(dict+"../catname.imat")
    val docIds = loadIMat(dict+"../docid.imat")

    // Enough columns for the largest id seen in either the lookup or the pair file.
    val ndocs = math.max(maxi(lkup).v,maxi(docIds).v)+1
    val ncats = maxi(catIds).v
    val indicator = izeros(ncats,ndocs)

    // Linear index of entry (category-1, doc) in an ncats-row matrix
    // (presumably column-major storage -- confirm); category ids look 1-based here.
    val linearIndex = catIds - 1 + ncats*docIds
    indicator(linearIndex) = 1

    val selected = FMat(indicator(?,lkup))
    saveFMat(dict+"../"+prefix+"cats.fmat.lz4", selected)
  }
}
// Script entry point: run the whole RCV1 pipeline on the tokenized data
// directory, then terminate the interpreter session.
RCV1.prepare("data/rcv1/tokenized/")
sys.exit()