forked from BIDData/BIDMach
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuildcriteo.ssc
executable file
·80 lines (76 loc) · 2.13 KB
/
buildcriteo.ssc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
// Pass 1: scan every data part and accumulate, for each of the 26 rows
// 13..38 of the data matrices, the set of distinct values seen so far.
val dir = "d:/criteo/parts/"
val ivals = new Array[IMat](26)
val mvals = new Array[IMat](26)
println("Building feature lookup tables")
for (part <- 0 to 87) {
  // Parts 0..76 come from the train files; parts 77..87 come from the
  // test files, which are numbered starting again from 0.
  val fname = if (part <= 76) {
    "traindata%02d.imat.lz4".format(part)
  } else {
    "testdata%02d.imat.lz4".format(part - 77)
  }
  val data = loadIMat(dir + fname)
  for (f <- 0 until 26) {
    val column = data(13+f,?).t
    // The first part seeds the table; later parts are merged with what
    // has already been collected and de-duplicated again.
    ivals(f) = if (part == 0) {
      unique(column)
    } else {
      unique(column on ivals(f))
    }
  }
  print(".")
}
println("")
// Build one dictionary per lookup table and the cumulative row offsets of
// each feature's block in the output matrix. A feature whose table has
// 20000000 or more entries gets a zero-width block (offsets(f+1) == offsets(f)).
println("Building feature matrices")
val dicts = ivals.map(table => IDict(table))
val offsets = izeros(27,1)
for (f <- 0 until 26) {
  val tableSize = ivals(f).length
  val width = if (tableSize < 20000000) tableSize else 0
  offsets(f+1) = offsets(f) + width
}
// Pass 2: for every data part, load the value matrix and its companion
// "nz" matrix, turn rows 13->39 into a sparse block of ones positioned by
// the lookup tables, stack it under the first 13 rows, and save the result.
for (part <- 0 to 87) {
  // Parts 0..76 are train files; parts 77..87 are test files numbered from 0.
  val (dataName, maskName, outName) =
    if (part <= 76) {
      ("traindata%02d.imat.lz4".format(part),
       "trainnz%02d.imat.lz4".format(part),
       "train%02d.smat.lz4".format(part))
    } else {
      ("testdata%02d.imat.lz4".format(part - 77),
       "testnz%02d.imat.lz4".format(part - 77),
       "test%02d.smat.lz4".format(part - 77))
    }
  val data = loadIMat(dir + dataName)
  val mask = loadIMat(dir + maskName)

  // Split both matrices into the leading 13 rows and the remaining rows.
  val headVals = data(0->13,?)
  val headMask = mask(0->13,?)
  val tailVals = data(13->39,?)
  val tailMask = mask(13->39,?)

  // Upper bound on the number of entries that can be emitted below.
  val maxnz = sum(sum(tailMask)).v
  val rows = IMat(maxnz,1)
  val cols = IMat(maxnz,1)
  val onesCol = ones(maxnz,1)

  // Map every value in this part to an index relative to the global
  // dictionary of its feature (presumably its position there -- confirm
  // against IDict's `-->`).
  for (f <- 0 until 26) {
    mvals(f) = IDict(tailVals(f, ?).t) --> dicts(f)
  }

  val ioff = Mat.ioneBased
  val ncols = data.ncols
  val nfeats = tailVals.nrows
  var count = 0
  var c = 0
  while (c < ncols) {
    var f = 0
    while (f < nfeats) {
      // Emit an entry only when the mask is set and the feature has a
      // non-empty block in the output (zero-width features are dropped).
      if (tailMask(f,c) > 0 && offsets(f) < offsets(f+1)) {
        rows(count) = offsets(f) + mvals(f)(c) + ioff
        cols(count) = c
        count += 1
      }
      f += 1
    }
    c += 1
  }

  // Entries were generated column by column, so the column indices can be
  // compressed directly into column pointers.
  val colPtrs = BIDMat.SparseMat.compressInds(cols.data, ncols, new Array[Int](ncols+1), count)
  val tailBlock = SMat(offsets(26), ncols, count, rows.data, colPtrs, onesCol.data)
  // Final layout, top to bottom: a row of ones, the leading mask rows, the
  // leading value rows, then the sparse block built above.
  val out = sparse(iones(1, headMask.ncols) on headMask on headVals) on tailBlock
  out.check
  saveSMat(dir + outName, out)
  print(".")
}