This repository was archived by the owner on Apr 30, 2020. It is now read-only.
forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathDocid2Siteflags.cpp
153 lines (121 loc) · 4.71 KB
/
Docid2Siteflags.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#include "Docid2Siteflags.h"
#include "Log.h"
#include "Conf.h"
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <errno.h>
#include <string.h>
#include <algorithm>
//format of docid->siteid file:
// docid2siteid_file ::= { entry }
// entry ::= flags26 | docid38 | siteid32
// docid38::= 38-bit (actual) docid
// flags26::= 26 bit flags for boolean lists
// siteid32::= 32-bit hash of site (as per SiteGetter)
//Note: entries that have no bits set in flags26 are not written to output file.
//
// Important: the flags+docid fields are 64 bit in total. docid is in the high bits. this makes sorting+binary search easy.
struct Docid2FlagsAndSiteMapEntry {
uint64_t flags : 26;
uint64_t docid : 38;
uint32_t sitehash32 : 32;
} __attribute__((packed, aligned(4)));
Docid2FlagsAndSiteMap g_d2fasm;
static const char filename[] = "docid2flagsandsitemap.dat";
static bool cmp(const Docid2FlagsAndSiteMapEntry &e1, const Docid2FlagsAndSiteMapEntry &e2) {
//Normally this would be:
// return e1.docid < e2.docid;
//However, we do a dirty trick here: we just treat docid+flags as a uint64_t.
//This works fine because we will not have duplicated docids in the table and we don't care about the flags.
//This generates more efficient code than what gcc does with packed structs and bitfields.
return *(const uint64_t*)&e1 < *(const uint64_t*)&e2;
}
bool Docid2FlagsAndSiteMap::load()
{
log(LOG_DEBUG, "Loading %s", filename);
struct stat st;
if(stat(filename,&st)!=0) {
log(LOG_WARN,"stat(%s) failed with errno=%d (%s)", filename, errno, strerror(errno));
return false;
}
unsigned new_active_index = 1-active_index;
if(!mmf[new_active_index].open(filename)) {
if(errno==ENOENT)
log(LOG_INFO,"Couldn't open %s, errno=%d (%s)", filename, errno, strerror(errno));
else
log(LOG_WARN,"Couldn't open %s, errno=%d (%s)", filename, errno, strerror(errno));
return false;
}
//load the entries in one go
if(mmf[new_active_index].size()%sizeof(Docid2FlagsAndSiteMapEntry)) {
log(LOG_WARN,"%s size is not a multiple of %zu", filename, sizeof(Docid2FlagsAndSiteMapEntry));
return false;
}
//ok, is the file sorted as we expect it to be?
if(mmf[new_active_index].size() >= sizeof(Docid2FlagsAndSiteMapEntry)*5) {
//just probe 5 elements
size_t c = mmf[new_active_index].size() / sizeof(Docid2FlagsAndSiteMapEntry);
auto e = reinterpret_cast<const Docid2FlagsAndSiteMapEntry*>(mmf[new_active_index].start());
if(e[c/5*0].docid<e[c/5*1].docid &&
e[c/5*1].docid<e[c/5*2].docid &&
e[c/5*2].docid<e[c/5*3].docid &&
e[c/5*3].docid<e[c-1 ].docid)
; //excellent
else {
log(LOG_WARN,"%s is not sorted. Either regenerate it or use 'sort_docid2flagsandsitemap'", filename);
return false;
}
}
//swap in and done.
active_index.store(new_active_index,std::memory_order_release);
timestamp = st.st_mtime;
log(LOG_DEBUG, "Loaded %s (%lu entries)", filename, (unsigned long)mmf[new_active_index].size()/sizeof(Docid2FlagsAndSiteMapEntry));
return true;
}
void Docid2FlagsAndSiteMap::reload_if_needed() {
struct stat st;
if(stat(filename,&st)!=0)
return; //probably not found
if(timestamp==-1 || timestamp!=st.st_mtime)
load();
}
void Docid2FlagsAndSiteMap::unload() {
mmf[0].close();
mmf[1].close();
}
bool Docid2FlagsAndSiteMap::lookupSiteHash(uint64_t docid, uint32_t *sitehash32) {
Docid2FlagsAndSiteMapEntry tmp;
tmp.docid = docid;
tmp.flags = 0;
auto ai = active_index.load(std::memory_order_consume);
auto start = reinterpret_cast<const Docid2FlagsAndSiteMapEntry *>(mmf[ai].start());
auto count = mmf[ai].size()/sizeof(Docid2FlagsAndSiteMapEntry);
auto end = start+count;
auto pos = std::lower_bound(start, end, tmp, cmp);
if(pos!=end && pos->docid == docid) {
*sitehash32 = pos->sitehash32;
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Found record sitehash32=%u for docid=%lu", *sitehash32, docid);
return true;
}
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Record not found for docid=%lu", docid);
return false;
}
bool Docid2FlagsAndSiteMap::lookupFlags(uint64_t docid, unsigned *flags) {
Docid2FlagsAndSiteMapEntry tmp;
tmp.docid = docid;
tmp.flags = 0;
auto ai = active_index.load(std::memory_order_consume);
auto start = reinterpret_cast<const Docid2FlagsAndSiteMapEntry *>(mmf[ai].start());
auto count = mmf[ai].size()/sizeof(Docid2FlagsAndSiteMapEntry);
auto end = start+count;
auto pos = std::lower_bound(start, end, tmp, cmp);
if(pos!=end && pos->docid == docid) {
*flags = pos->flags;
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Found record flags=%u for docid=%lu", *flags, docid);
return true;
}
logTrace(g_conf.m_logTraceDocid2FlagsAndSiteMap, "Record not found for docid=%lu", docid);
return false;
}