-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_datasets.py
executable file
·58 lines (42 loc) · 1.71 KB
/
generate_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import random
import csv
# Divides users.dat into a training set and a test set. P is the number of articles per user placed in the training set (the remaining articles go to the test set). P=1 is the sparse setting, P=10 is the dense setting. The default is the dense setting.
def create_training_test_sets(P=10, n=5551,
                              users_path="data/citeulike-a/raw_inputs/users.dat"):
    """Randomly split each user's article list into a training and a test row.

    Every line of the users file is read as ``<count> <article> <article> ...``
    (whitespace separated).  For each user, ``P`` article positions are drawn
    with ``random.sample``; those articles go to the training row and all
    remaining ones go to the test row.  Each row starts with the number of
    articles it holds, followed by the article ids (as ints) in file order.

    Args:
        P: number of articles per user placed in the training row
            (1 = sparse setting, 10 = dense setting).  If a user has fewer
            than ``P`` articles, all of them go to the training row and the
            row header reflects the actual count.
        n: number of rows allocated in each returned list.  Rows beyond the
            number of lines in the file stay empty.
        users_path: path of the users file to read.

    Returns:
        A ``(train, test)`` tuple of two lists with ``n`` rows each.

    Raises:
        IndexError: if the file contains more than ``n`` lines.
    """
    train = [[] for _ in range(n)]
    test = [[] for _ in range(n)]

    # Read everything up front and close the file right away.
    with open(users_path) as fp:
        users = fp.readlines()

    for user_id, line in enumerate(users):
        # split() without an argument tolerates repeated spaces and the
        # trailing newline, which split(" ") turns into stray tokens.
        fields = line.split()
        num_articles = int(fields[0])
        articles = fields[1:]

        # Never ask random.sample for more items than the user has.
        num_train = min(P, num_articles)
        # A set keeps the membership test below O(1) per article.
        train_idx = set(random.sample(range(num_articles), num_train))

        train_row = train[user_id]
        test_row = test[user_id]
        train_row.append(num_train)
        test_row.append(num_articles - num_train)
        for i in range(num_articles):
            target = train_row if i in train_idx else test_row
            target.append(int(articles[i]))

    return train, test
def _open_csv_output(path):
    """Open *path* for writing in the mode ``csv.writer`` needs on this Python."""
    import sys  # local import: sys is not among the file's top-level imports

    if sys.version_info[0] >= 3:
        # Python 3: csv writes text; newline="" leaves line endings to csv.
        return open(path, "w", newline="")
    # Python 2: csv writes bytes, so the file must be opened in binary mode.
    return open(path, "wb")


def generate_datasets(num_samples, num_sets=1, setting="dense"):
    """Write ``num_sets`` random train/test splits as space-delimited files.

    For each split ``k`` in ``1..num_sets`` the rows returned by
    ``create_training_test_sets`` are written to
    ``data/citeulike-a/train_P<P>_<k>.dat`` and
    ``data/citeulike-a/test_P<P>_<k>.dat``.  The random generator is seeded
    once with 1234 before the first split, so repeated runs draw the same
    sequence of splits.  The output directory is not created here.

    Args:
        num_samples: forwarded to ``create_training_test_sets`` as its ``n``
            argument.
        num_sets: number of independent splits to generate.
        setting: ``"dense"`` selects P=10; any other value selects P=1.
    """
    # manually change settings for citeulike-t dataset
    P = 10 if setting == "dense" else 1

    random.seed(1234)
    for set_number in range(1, num_sets + 1):
        train, test = create_training_test_sets(P, num_samples)
        # Train file is written before the test file, one row per user.
        for prefix, rows in (("train", train), ("test", test)):
            out_path = ("data/citeulike-a/" + prefix + "_P" + str(P)
                        + "_" + str(set_number) + ".dat")
            with _open_csv_output(out_path) as f:
                csv.writer(f, delimiter=' ').writerows(rows)
# Script entry point: generate three splits in the sparse setting, with the
# row lists sized for 5551 users.
if "__main__" == __name__:
    generate_datasets(5551, num_sets=3, setting="sparse")