-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlhash_kv.c
3076 lines (3067 loc) · 84.3 KB
/
lhash_kv.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Symisc unQLite: An Embeddable NoSQL (Post Modern) Database Engine.
* Copyright (C) 2012-2013, Symisc Systems http://unqlite.org/
* Version 1.1.6
* For information on licensing, redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES
* please contact Symisc Systems via:
* or visit:
* http://unqlite.org/licensing.html
*/
/* $SymiscID: lhash_kv.c v1.7 Solaris 2013-01-14 12:56 stable <[email protected]> $ */
#ifndef UNQLITE_AMALGAMATION
#include "unqliteInt.h"
#endif
/*
* This file implements disk based hashtable using the linear hashing algorithm.
* This implementation is the one described in the paper:
* LINEAR HASHING : A NEW TOOL FOR FILE AND TABLE ADDRESSING. Witold Litwin. I. N. Ft. I. A.. 78 150 Le Chesnay, France.
* Plus a smart extension called Virtual Bucket Table. (contact [email protected] for additional information).
*/
/* Magic number identifying a valid storage image */
#define L_HASH_MAGIC 0xFA782DCB
/*
 * Magic word to hash to identify a valid hash function.
 */
#define L_HASH_WORD "chm@symisc"
/*
 * Cell size on disk.
 */
#define L_HASH_CELL_SZ (4/*Hash*/+4/*Key*/+8/*Data*/+2/* Offset of the next cell */+8/*Overflow*/)
/*
 * Primary page (not overflow pages) header size on disk.
 */
#define L_HASH_PAGE_HDR_SZ (2/* Cell offset*/+2/* Free block offset*/+8/*Slave page number*/)
/*
 * The maximum amount of payload (in bytes) that can be stored locally for
 * a database entry. If the entry contains more data than this, the
 * extra goes onto overflow pages.
 * The argument is parenthesized so that any expression (conditional,
 * bitwise, ...) can safely be passed as the page size.
 */
#define L_HASH_MX_PAYLOAD(PageSize) ((PageSize)-(L_HASH_PAGE_HDR_SZ+L_HASH_CELL_SZ))
/*
 * Maximum free space on a single page.
 */
#define L_HASH_MX_FREE_SPACE(PageSize) ((PageSize) - (L_HASH_PAGE_HDR_SZ))
/*
 * The maximum number of bytes of payload allowed on a single overflow page
 * (the page size minus the 8-byte link to the next overflow page).
 */
#define L_HASH_OVERFLOW_SIZE(PageSize) ((PageSize)-8)
/*
 * Forward declarations: both types are referenced through pointers by the
 * structures below before their full definitions appear.
 */
typedef struct lhash_kv_engine lhash_kv_engine;
typedef struct lhpage lhpage;
/*
 * Each record in the database is identified either in-memory or on
 * disk by an instance of the following structure.
 * The first five fields mirror the on-disk cell header (L_HASH_CELL_SZ
 * bytes, unpacked by lhParseOneCell()); the remaining fields exist only
 * in memory.
 */
typedef struct lhcell lhcell;
struct lhcell
{
/* Disk-data (Big-Endian) */
sxu32 nHash; /* Hash of the key: 4 bytes */
sxu32 nKey; /* Key length: 4 bytes */
sxu64 nData; /* Data length: 8 bytes */
sxu16 iNext; /* Offset of the next cell: 2 bytes (0 ends the chain) */
pgno iOvfl; /* Overflow page number if any: 8 bytes (0 means the payload is stored locally) */
/* In-memory data only */
lhpage *pPage; /* Page this cell belongs to */
sxu16 iStart; /* Offset of this cell inside its page */
pgno iDataPage; /* Data page number when overflow (read from the first overflow page) */
sxu16 iDataOfft; /* Offset of the data in iDataPage */
SyBlob sKey; /* Record key for fast lookup (kept in memory unless the key is larger than 256 KB) */
lhcell *pNext,*pPrev; /* Linked list of the loaded memory cells */
lhcell *pNextCol,*pPrevCol; /* Collision chain */
};
/*
 * Each database page has a header that is an instance of this
 * structure. On disk the header occupies L_HASH_PAGE_HDR_SZ bytes and is
 * decoded by lhParsePageHeader().
 */
typedef struct lhphdr lhphdr;
struct lhphdr
{
sxu16 iOfft; /* Offset of the first cell (0 when the page holds no cell) */
sxu16 iFree; /* Offset of the first free block (0 when the page is full) */
pgno iSlave; /* Slave page number (0 when there is no slave page) */
};
/*
 * Each loaded primary disk page is represented in-memory using
 * an instance of the following structure.
 * Cells are always indexed in the table of the master page, including
 * cells that live on one of its slave pages.
 */
struct lhpage
{
lhash_kv_engine *pHash; /* KV Storage engine that owns this page */
unqlite_page *pRaw; /* Raw page contents */
lhphdr sHdr; /* Processed page header */
lhcell **apCell; /* Cell buckets (hash table indexed by nHash & (nCellSize - 1)) */
lhcell *pList,*pFirst; /* Linked list of cells: list head and first installed cell */
sxu32 nCell; /* Total number of cells */
sxu32 nCellSize; /* apCell[] size */
lhpage *pMaster; /* Master page in case we are dealing with a slave page (a master points to itself) */
lhpage *pSlave; /* List of slave pages */
lhpage *pNextSlave; /* Next slave page on the list */
sxi32 iSlave; /* Total number of slave pages */
sxu16 nFree; /* Amount of free space available in the page (computed by lhPageFreeSpace()) */
};
/*
 * A bucket map record, which maps a logical bucket number to a real
 * bucket (page) number, is represented by an instance of the following
 * structure. Records are kept both in a hash table (collision links) and
 * in a global list (pNext/pPrev).
 */
typedef struct lhash_bmap_rec lhash_bmap_rec;
struct lhash_bmap_rec
{
pgno iLogic; /* Logical bucket number */
pgno iReal; /* Real bucket number */
lhash_bmap_rec *pNext,*pPrev; /* Link to other bucket map */
lhash_bmap_rec *pNextCol,*pPrevCol; /* Collision links */
};
/*
 * Cursor over the on-disk bucket map: identifies the map page currently
 * being read or written and the position reached inside it.
 */
typedef struct lhash_bmap_page lhash_bmap_page;
struct lhash_bmap_page
{
pgno iNum; /* Page number where this entry is stored */
sxu16 iPtr; /* Offset to start reading/writing from (0: the map page header has not been read yet) */
sxu32 nRec; /* Total number of records in this page */
pgno iNext; /* Next map page (0 when this is the last one) */
};
/*
 * An in-memory linear hash implementation is represented by an instance
 * of the following structure.
 */
struct lhash_kv_engine
{
const unqlite_kv_io *pIo; /* IO methods: Must be first */
/* Private fields */
SyMemBackend sAllocator; /* Private memory backend */
ProcHash xHash; /* Default hash function */
ProcCmp xCmp; /* Default comparison function */
unqlite_page *pHeader; /* Page one to identify a valid implementation */
lhash_bmap_rec **apMap; /* Buckets map records (hash table indexed by iLogic & (nBuckSize - 1)) */
sxu32 nBuckRec; /* Total number of bucket map records */
sxu32 nBuckSize; /* apMap[] size */
lhash_bmap_rec *pList; /* List of bucket map records */
lhash_bmap_rec *pFirst; /* First record*/
lhash_bmap_page sPageMap; /* Primary bucket map */
int iPageSize; /* Page size */
pgno nFreeList; /* List of free pages */
pgno split_bucket; /* Current split bucket: MUST BE A POWER OF TWO */
pgno max_split_bucket; /* Maximum split bucket: MUST BE A POWER OF TWO */
pgno nmax_split_nucket; /* Next maximum split bucket (1 << nMsb): In-memory only, set to max_split_bucket << 1 when the header is read */
sxu32 nMagic; /* Magic number to identify a valid linear hash disk database */
};
/*
 * Look up the bucket map record registered for a logical bucket number.
 * Returns the matching record, or 0 when the map is empty or holds no
 * record for iLogic.
 */
static lhash_bmap_rec * lhMapFindBucket(lhash_kv_engine *pEngine,pgno iLogic)
{
    lhash_bmap_rec *pCur;
    if( pEngine->nBuckRec < 1 ){
        /* Empty map: nothing to search */
        return 0;
    }
    /* The table size is used as a mask to select the collision chain */
    for( pCur = pEngine->apMap[iLogic & (pEngine->nBuckSize - 1)] ; pCur != 0 ; pCur = pCur->pNextCol ){
        if( pCur->iLogic == iLogic ){
            return pCur;
        }
    }
    /* No such record */
    return 0;
}
/*
 * Register a new (logical bucket -> real bucket) record in the in-memory
 * bucket map.
 * The record is pushed at the head of its collision chain and linked in
 * the global record list. Once the table holds three times as many records
 * as it has slots (and fewer than 100000 records), the table is doubled
 * and every record is rehashed; a failed allocation of the larger table is
 * silently ignored and the current table is kept.
 * Returns UNQLITE_OK on success, UNQLITE_NOMEM if the record itself cannot
 * be allocated.
 */
static int lhMapInstallBucket(lhash_kv_engine *pEngine,pgno iLogic,pgno iReal)
{
    lhash_bmap_rec **apGrown;
    lhash_bmap_rec *pNew,*pWalk;
    sxu32 iSlot,nGrown,nSeen;
    pNew = (lhash_bmap_rec *)SyMemBackendPoolAlloc(&pEngine->sAllocator,sizeof(lhash_bmap_rec));
    if( pNew == 0 ){
        return UNQLITE_NOMEM;
    }
    SyZero(pNew,sizeof(lhash_bmap_rec));
    pNew->iLogic = iLogic;
    pNew->iReal = iReal;
    /* Push the record at the head of its collision chain */
    iSlot = iLogic & (pEngine->nBuckSize - 1);
    pNew->pNextCol = pEngine->apMap[iSlot];
    if( pNew->pNextCol ){
        pNew->pNextCol->pPrevCol = pNew;
    }
    pEngine->apMap[iSlot] = pNew;
    /* Link the record in the global list */
    if( pEngine->pFirst ){
        MACRO_LD_PUSH(pEngine->pList,pNew);
    }else{
        pEngine->pFirst = pEngine->pList = pNew;
    }
    pEngine->nBuckRec++;
    if( pEngine->nBuckRec < pEngine->nBuckSize * 3 || pEngine->nBuckRec >= 100000 ){
        /* Load factor still acceptable (or table frozen): no growth */
        return UNQLITE_OK;
    }
    /* Try to double the table */
    nGrown = pEngine->nBuckSize << 1;
    apGrown = (lhash_bmap_rec **)SyMemBackendAlloc(&pEngine->sAllocator, nGrown * sizeof(lhash_bmap_rec *));
    if( apGrown == 0 ){
        /* Not fatal: keep working with the current table */
        return UNQLITE_OK;
    }
    SyZero((void *)apGrown, nGrown * sizeof(lhash_bmap_rec *));
    /* Rehash every record by walking the global list */
    pWalk = pEngine->pList;
    for( nSeen = 0 ; nSeen < pEngine->nBuckRec ; nSeen++ ){
        iSlot = pWalk->iLogic & (nGrown - 1);
        pWalk->pPrevCol = 0;
        pWalk->pNextCol = apGrown[iSlot];
        if( apGrown[iSlot] ){
            apGrown[iSlot]->pPrevCol = pWalk;
        }
        apGrown[iSlot] = pWalk;
        pWalk = pWalk->pNext;
    }
    /* Swap the tables */
    SyMemBackendFree(&pEngine->sAllocator,(void *)pEngine->apMap);
    pEngine->apMap = apGrown;
    pEngine->nBuckSize = nGrown;
    return UNQLITE_OK;
}
/*
 * Process a raw bucket map page and install every record it holds.
 *
 * pEngine  Engine receiving the records (its iPageSize bounds the page).
 * pMap     Map cursor. When pMap->iPtr is 0 the 12-byte map header
 *          (8-byte next page number, 4-byte record count) is decoded from
 *          the start of the page; otherwise records are read starting at
 *          offset pMap->iPtr. On return pMap->iPtr holds the offset
 *          reached.
 * zRaw     Start of the raw page.
 *
 * Each record is 16 bytes (8-byte logical + 8-byte real bucket number,
 * big-endian). Parsing stops as soon as a complete record no longer fits
 * in the page, so a record count larger than what the page can hold never
 * causes a read past the end of the page.
 * Returns UNQLITE_OK, or the error reported by lhMapInstallBucket().
 */
static int lhMapLoadPage(lhash_kv_engine *pEngine,lhash_bmap_page *pMap,const unsigned char *zRaw)
{
    const unsigned char *zEnd = &zRaw[pEngine->iPageSize];
    const unsigned char *zPtr = zRaw;
    pgno iLogic,iReal;
    sxu32 n;
    int rc;
    if( pMap->iPtr == 0 ){
        /* Read the map header */
        SyBigEndianUnpack64(zRaw,&pMap->iNext);
        zRaw += 8;
        SyBigEndianUnpack32(zRaw,&pMap->nRec);
        zRaw += 4;
    }else{
        /* Mostly page one of the database */
        zRaw += pMap->iPtr;
    }
    /* Start processing */
    for( n = 0; n < pMap->nRec ; ++n ){
        if( zRaw >= zEnd || (sxu32)(zEnd - zRaw) < 8/* Logical */ + 8/* Real */ ){
            /* No room left for a complete record */
            break;
        }
        /* Extract the logical and real bucket number */
        SyBigEndianUnpack64(zRaw,&iLogic);
        zRaw += 8;
        SyBigEndianUnpack64(zRaw,&iReal);
        zRaw += 8;
        /* Install the record in the map */
        rc = lhMapInstallBucket(pEngine,iLogic,iReal);
        if( rc != UNQLITE_OK ){
            return rc;
        }
    }
    /* Remember how far we went */
    pMap->iPtr = (sxu16)(zRaw-zPtr);
    /* All done */
    return UNQLITE_OK;
}
/*
 * Allocate a zeroed cell attached to pPage, with its key blob bound to
 * the engine allocator. Returns 0 when the allocation fails.
 */
static lhcell * lhNewCell(lhash_kv_engine *pEngine,lhpage *pPage)
{
    lhcell *pNew = (lhcell *)SyMemBackendPoolAlloc(&pEngine->sAllocator,sizeof(lhcell));
    if( pNew ){
        SyZero(pNew,sizeof(lhcell));
        SyBlobInit(&pNew->sKey,&pEngine->sAllocator);
        pNew->pPage = pPage;
    }
    return pNew;
}
/*
 * Remove a cell from the in-memory table of its master page and release
 * it: the cell is unlinked from its collision chain and from the page
 * cell list, its key blob is released and the cell goes back to the pool.
 */
static void lhCellDiscard(lhcell *pCell)
{
    lhpage *pOwner = pCell->pPage->pMaster;
    lhcell *pBefore = pCell->pPrevCol;
    lhcell *pAfter = pCell->pNextCol;
    /* Unlink from the collision chain */
    if( pBefore == 0 ){
        /* Head of the chain: the bucket now starts at the next cell */
        pOwner->apCell[pCell->nHash & (pOwner->nCellSize - 1)] = pAfter;
    }else{
        pBefore->pNextCol = pAfter;
    }
    if( pAfter ){
        pAfter->pPrevCol = pBefore;
    }
    /* Unlink from the list of loaded cells */
    MACRO_LD_REMOVE(pOwner->pList,pCell);
    if( pOwner->pFirst == pCell ){
        pOwner->pFirst = pCell->pPrev;
    }
    pOwner->nCell--;
    /* Release the cell */
    SyBlobRelease(&pCell->sKey);
    SyMemBackendPoolFree(&pOwner->pHash->sAllocator,pCell);
}
/*
 * Install a cell in the in-memory table of its master page.
 * A 32-slot table is allocated when the page currently holds no cell.
 * The cell is pushed at the head of its collision chain and linked in the
 * page cell list. Once the table holds three times as many cells as it has
 * slots (and fewer than 100000 cells), the table is doubled and every cell
 * is rehashed; a failed allocation of the larger table is silently ignored.
 * Returns UNQLITE_OK on success, UNQLITE_NOMEM if the initial table cannot
 * be allocated.
 */
static int lhInstallCell(lhcell *pCell)
{
    lhpage *pOwner = pCell->pPage->pMaster;
    lhcell **apGrown;
    lhcell *pWalk;
    sxu32 iSlot,nGrown,nSeen;
    if( pOwner->nCell < 1 ){
        /* No cell yet: start with a fresh table (size must be a power of two) */
        sxu32 nInitial = 32;
        lhcell **apFresh = (lhcell **)SyMemBackendAlloc(&pOwner->pHash->sAllocator, nInitial * sizeof(lhcell *));
        if( apFresh == 0 ){
            return UNQLITE_NOMEM;
        }
        SyZero((void *)apFresh, nInitial * sizeof(lhcell *));
        pOwner->apCell = apFresh;
        pOwner->nCellSize = nInitial;
    }
    /* Push the cell at the head of its collision chain */
    iSlot = pCell->nHash & (pOwner->nCellSize - 1);
    pCell->pNextCol = pOwner->apCell[iSlot];
    if( pCell->pNextCol ){
        pCell->pNextCol->pPrevCol = pCell;
    }
    pOwner->apCell[iSlot] = pCell;
    /* Link the cell in the page list */
    if( pOwner->pFirst ){
        MACRO_LD_PUSH(pOwner->pList,pCell);
    }else{
        pOwner->pFirst = pOwner->pList = pCell;
    }
    pOwner->nCell++;
    if( pOwner->nCell < pOwner->nCellSize * 3 || pOwner->nCell >= 100000 ){
        /* Load factor still acceptable (or table frozen): no growth */
        return UNQLITE_OK;
    }
    /* Try to double the table */
    nGrown = pOwner->nCellSize << 1;
    apGrown = (lhcell **)SyMemBackendAlloc(&pOwner->pHash->sAllocator, nGrown * sizeof(lhcell *));
    if( apGrown == 0 ){
        /* Not fatal: keep working with the current table */
        return UNQLITE_OK;
    }
    SyZero((void *)apGrown, nGrown * sizeof(lhcell *));
    /* Rehash every cell by walking the page list */
    pWalk = pOwner->pList;
    for( nSeen = 0 ; nSeen < pOwner->nCell ; nSeen++ ){
        iSlot = pWalk->nHash & (nGrown - 1);
        pWalk->pPrevCol = 0;
        pWalk->pNextCol = apGrown[iSlot];
        if( apGrown[iSlot] ){
            apGrown[iSlot]->pPrevCol = pWalk;
        }
        apGrown[iSlot] = pWalk;
        pWalk = pWalk->pNext;
    }
    /* Swap the tables */
    SyMemBackendFree(&pOwner->pHash->sAllocator,(void *)pOwner->apCell);
    pOwner->apCell = apGrown;
    pOwner->nCellSize = nGrown;
    return UNQLITE_OK;
}
/*
 * Private data of lhKeyCmp(): a cursor over the lookup key that is
 * compared chunk by chunk against a key streamed from disk.
 */
struct lhash_key_cmp
{
const char *zIn; /* Start of the stream (advanced after each compared chunk) */
const char *zEnd; /* End of the stream */
ProcCmp xCmp; /* Comparison function */
};
/*
 * Comparison callback for large keys (> 256 KB) that are streamed from
 * disk instead of being kept in memory.
 * pData/nLen is the next chunk of the stored key and pUserData points to
 * a struct lhash_key_cmp cursor over the lookup key.
 * Returns UNQLITE_OK when the chunk matches (the cursor is then advanced
 * by nLen bytes) and UNQLITE_ABORT on mismatch or when a non-empty chunk
 * arrives after the lookup key has been exhausted.
 */
static int lhKeyCmp(const void *pData,sxu32 nLen,void *pUserData)
{
    struct lhash_key_cmp *pState = (struct lhash_key_cmp *)pUserData;
    if( pState->zIn < pState->zEnd ){
        /* Compare this chunk against the current position of the lookup key */
        if( pState->xCmp((const void *)pState->zIn,pData,nLen) != 0 ){
            return UNQLITE_ABORT;
        }
        pState->zIn += nLen;
        return UNQLITE_OK;
    }
    /* Lookup key exhausted: only an empty chunk can still match */
    return nLen > 0 ? UNQLITE_ABORT : UNQLITE_OK;
}
/* Forward declaration: lhFindCell() and lhParseOneCell() call this function before its definition */
static int lhConsumeCellkey(lhcell *pCell,int (*xConsumer)(const void *,unsigned int,void *),void *pUserData,int offt_only);
/*
 * Given a key, return the cell associated with it on success, 0 otherwise.
 * A candidate must have the same hash and the same key length. Its key is
 * then compared either against the in-memory copy or, when no copy is kept
 * (large keys > 256 KB), by streaming the key from disk through lhKeyCmp().
 */
static lhcell * lhFindCell(
    lhpage *pPage,      /* Target page */
    const void *pKey,   /* Lookup key */
    sxu32 nByte,        /* Key length */
    sxu32 nHash         /* Hash of the key */
    )
{
    lhcell *pCur;
    if( pPage->nCell < 1 ){
        /* Empty page: don't bother hashing */
        return 0;
    }
    /* Walk the collision chain of the corresponding bucket */
    for( pCur = pPage->apCell[nHash & (pPage->nCellSize - 1)] ; pCur != 0 ; pCur = pCur->pNextCol ){
        if( pCur->nHash != nHash || pCur->nKey != nByte ){
            continue;
        }
        if( SyBlobLength(&pCur->sKey) >= 1 ){
            /* Key available in memory */
            if( pPage->pHash->xCmp(pKey,SyBlobData(&pCur->sKey),nByte) == 0 ){
                return pCur;
            }
        }else{
            /* Large key (> 256 KB) are not kept in-memory: compare against the disk copy */
            struct lhash_key_cmp sState;
            sState.zIn = (const char *)pKey;
            sState.zEnd = &sState.zIn[nByte];
            sState.xCmp = pPage->pHash->xCmp;
            if( lhConsumeCellkey(pCur,lhKeyCmp,&sState,0) == UNQLITE_OK ){
                return pCur;
            }
        }
    }
    /* No such entry */
    return 0;
}
/*
 * Parse a raw cell fetched from disk and install it in the page table.
 *
 * pPage  Page the cell is stored on.
 * zRaw   Start of the raw cell inside the page.
 * zEnd   End of the page; used to validate offsets.
 * ppOut  If not NULL, receives the freshly installed cell on success.
 *
 * The fixed-size cell header (L_HASH_CELL_SZ bytes) must fit entirely
 * before zEnd, otherwise the page is reported as corrupt instead of being
 * read out of bounds.
 * Returns UNQLITE_OK on success, UNQLITE_CORRUPT when the header does not
 * fit or the next-cell offset points outside the page, UNQLITE_NOMEM when
 * running out of memory. On failure no cell is left allocated.
 */
static int lhParseOneCell(lhpage *pPage,const unsigned char *zRaw,const unsigned char *zEnd,lhcell **ppOut)
{
    sxu16 iNext,iOfft;
    sxu32 iHash,nKey;
    lhcell *pCell;
    sxu64 nData;
    int rc;
    /* The whole cell header must live inside the page */
    if( zRaw >= zEnd || (sxu32)(zEnd - zRaw) < L_HASH_CELL_SZ ){
        return UNQLITE_CORRUPT;
    }
    /* Offset this cell is stored */
    iOfft = (sxu16)(zRaw - (const unsigned char *)pPage->pRaw->zData);
    /* 4 byte hash number */
    SyBigEndianUnpack32(zRaw,&iHash);
    zRaw += 4;
    /* 4 byte key length */
    SyBigEndianUnpack32(zRaw,&nKey);
    zRaw += 4;
    /* 8 byte data length */
    SyBigEndianUnpack64(zRaw,&nData);
    zRaw += 8;
    /* 2 byte offset of the next cell */
    SyBigEndianUnpack16(zRaw,&iNext);
    /* Perform a sanity check */
    if( iNext > 0 && &pPage->pRaw->zData[iNext] >= zEnd ){
        return UNQLITE_CORRUPT;
    }
    zRaw += 2;
    pCell = lhNewCell(pPage->pHash,pPage);
    if( pCell == 0 ){
        return UNQLITE_NOMEM;
    }
    /* Fill in the structure */
    pCell->iNext = iNext;
    pCell->nKey = nKey;
    pCell->nData = nData;
    pCell->nHash = iHash;
    /* Overflow page if any */
    SyBigEndianUnpack64(zRaw,&pCell->iOvfl);
    zRaw += 8;
    /* Cell offset */
    pCell->iStart = iOfft;
    /* Consume the key: keys larger than 256 KB are not cached, only their data offset is fetched */
    rc = lhConsumeCellkey(pCell,unqliteDataConsumer,&pCell->sKey,pCell->nKey > 262144 /* 256 KB */? 1 : 0);
    if( rc != UNQLITE_OK ){
        /* Key could not be cached (e.g. too large for memory): drop the partial copy, lookups fall back to disk */
        SyBlobRelease(&pCell->sKey);
    }
    /* Finally install the cell */
    rc = lhInstallCell(pCell);
    if( rc != UNQLITE_OK ){
        /* The cell was never linked anywhere: release it instead of leaking it */
        SyBlobRelease(&pCell->sKey);
        SyMemBackendPoolFree(&pPage->pHash->sAllocator,pCell);
        return rc;
    }
    if( ppOut ){
        *ppOut = pCell;
    }
    return UNQLITE_OK;
}
/*
 * Compute the total amount of free space on a given page by walking its
 * free block list and store the result in pPage->nFree.
 *
 * Each free block starts with a 4-byte header: 2-byte offset of the next
 * free block (0 ends the list) followed by the 2-byte size of the block.
 * Every block header, including the first one, must fit inside the page,
 * and the list may not hold more blocks than a page can physically contain
 * (which also catches cyclic lists).
 * Returns UNQLITE_OK on success, UNQLITE_CORRUPT when the list is invalid
 * (pPage->nFree is then left untouched).
 */
static int lhPageFreeSpace(lhpage *pPage)
{
    const unsigned char *zRaw;
    lhphdr *pHdr = &pPage->sHdr;
    int nPageSize = pPage->pHash->iPageSize;
    int nMaxBlock = nPageSize / 4; /* Each block needs at least its 4-byte header */
    int nBlock = 0;
    sxu16 iNext,iAmount,iOfft;
    sxu16 nFree = 0;
    if( pHdr->iFree < 1 ){
        /* Don't bother processing, the page is full */
        pPage->nFree = 0;
        return UNQLITE_OK;
    }
    /* Start at the first free block */
    iOfft = pHdr->iFree;
    for(;;){
        if( (int)iOfft + 4 > nPageSize || ++nBlock > nMaxBlock ){
            /* Block header outside the page, or endless (cyclic) list: corrupt page */
            return UNQLITE_CORRUPT;
        }
        zRaw = &pPage->pRaw->zData[iOfft];
        /* Offset of the next free block */
        SyBigEndianUnpack16(zRaw,&iNext);
        /* Available space on this block */
        SyBigEndianUnpack16(&zRaw[2],&iAmount);
        nFree += iAmount;
        if( iNext < 1 ){
            /* No more free blocks */
            break;
        }
        /* Point to the next free block */
        iOfft = iNext;
    }
    /* Save the amount of free space */
    pPage->nFree = nFree;
    return UNQLITE_OK;
}
/*
 * Given a primary page, compute its free space and load all its cells by
 * following the cell chain that starts at the header's first-cell offset.
 *
 * The end of the page is computed from the start of the raw page, before
 * the cursor is moved to the first cell, so that every offset check is
 * made against the real page boundary. The first cell offset is validated
 * as well, and the walk is abandoned if the chain holds more cells than
 * there are bytes in a page (cyclic chain).
 * Returns UNQLITE_OK on success, UNQLITE_CORRUPT for an invalid chain, or
 * any error reported by lhPageFreeSpace()/lhParseOneCell().
 */
static int lhLoadCells(lhpage *pPage)
{
    const unsigned char *zEnd,*zRaw = pPage->pRaw->zData;
    lhphdr *pHdr = &pPage->sHdr;
    lhcell *pCell = 0; /* cc warning */
    int nSeen = 0;
    int rc;
    /* Calculate the amount of free space available first */
    rc = lhPageFreeSpace(pPage);
    if( rc != UNQLITE_OK ){
        return rc;
    }
    if( pHdr->iOfft < 1 ){
        /* Don't bother processing, the page is empty */
        return UNQLITE_OK;
    }
    /* Real end of the page: must be taken before zRaw is advanced */
    zEnd = &zRaw[pPage->pHash->iPageSize];
    if( (int)pHdr->iOfft >= pPage->pHash->iPageSize ){
        /* First cell outside the page */
        return UNQLITE_CORRUPT;
    }
    /* Point to first cell */
    zRaw += pHdr->iOfft;
    for(;;){
        if( ++nSeen > pPage->pHash->iPageSize ){
            /* More cells than distinct offsets in a page: cyclic chain */
            return UNQLITE_CORRUPT;
        }
        /* Parse a single cell */
        rc = lhParseOneCell(pPage,zRaw,zEnd,&pCell);
        if( rc != UNQLITE_OK ){
            return rc;
        }
        if( pCell->iNext < 1 ){
            /* No more cells */
            break;
        }
        /* Point to the next cell */
        zRaw = &pPage->pRaw->zData[pCell->iNext];
        if( zRaw >= zEnd ){
            /* Corrupt page */
            return UNQLITE_CORRUPT;
        }
    }
    /* All done */
    return UNQLITE_OK;
}
/*
 * Decode the raw header found at the start of a page into pPage->sHdr:
 * 2-byte offset of the first cell, 2-byte offset of the first free block
 * and 8-byte slave page number. Always returns UNQLITE_OK.
 */
static int lhParsePageHeader(lhpage *pPage)
{
    const unsigned char *zBase = pPage->pRaw->zData;
    lhphdr *pOut = &pPage->sHdr;
    SyBigEndianUnpack16(&zBase[0],&pOut->iOfft);   /* First cell */
    SyBigEndianUnpack16(&zBase[2],&pOut->iFree);   /* First free block */
    SyBigEndianUnpack64(&zBase[4],&pOut->iSlave);  /* Slave page */
    return UNQLITE_OK;
}
/*
 * Allocate a new in-memory page instance wrapping a raw pager page.
 * A page created without a master is its own master; otherwise it is
 * pushed on the slave list of pMaster. The instance is also stored in
 * pRaw->pUserData so that later loads of the same raw page find it.
 * Returns 0 when the allocation fails.
 */
static lhpage * lhNewPage(
    lhash_kv_engine *pEngine, /* KV store which own this instance */
    unqlite_page *pRaw,       /* Raw page contents */
    lhpage *pMaster           /* Master page in case we are dealing with a slave page */
    )
{
    lhpage *pNew = (lhpage *)SyMemBackendPoolAlloc(&pEngine->sAllocator,sizeof(lhpage));
    if( pNew == 0 ){
        return 0;
    }
    SyZero(pNew,sizeof(lhpage));
    pNew->pHash = pEngine;
    pNew->pRaw = pRaw;
    /* Master page unless told otherwise */
    pNew->pMaster = pNew;
    if( pMaster ){
        /* Slave page, attach it to its master */
        pNew->pMaster = pMaster;
        pNew->pNextSlave = pMaster->pSlave;
        pMaster->pSlave = pNew;
        pMaster->iSlave++;
    }
    /* Save this instance for future fast lookup */
    pRaw->pUserData = pNew;
    return pNew;
}
/*
 * Load a primary page and its associated slave pages from disk.
 *
 * pEngine  KV storage engine.
 * pnum     Number of the page to load.
 * pMaster  Master page when loading a slave page, 0 for a master page.
 * ppOut    If not NULL, receives the in-memory page on success.
 * iNest    Current depth in the slave chain (0 for the master page).
 *
 * A page that is already parsed (pRaw->pUserData set) is returned as is.
 * Otherwise the header and the cells are parsed, then the slave chain is
 * followed recursively. Each level passes iNest + 1 to the next one so that
 * the 128-level limit really bounds the recursion; slave pages beyond
 * that depth are not loaded. A failure while loading a slave page is not
 * fatal for the caller.
 * Returns UNQLITE_OK on success, UNQLITE_NOMEM when out of memory (the raw
 * page reference is dropped), or the error reported by the pager or by
 * the parsing routines.
 */
static int lhLoadPage(lhash_kv_engine *pEngine,pgno pnum,lhpage *pMaster,lhpage **ppOut,int iNest)
{
    unqlite_page *pRaw;
    lhpage *pPage = 0; /* cc warning */
    int rc;
    /* Acquire the page from the pager first */
    rc = pEngine->pIo->xGet(pEngine->pIo->pHandle,pnum,&pRaw);
    if( rc != UNQLITE_OK ){
        return rc;
    }
    if( pRaw->pUserData ){
        /* The page is already parsed and loaded in memory. Point to it */
        pPage = (lhpage *)pRaw->pUserData;
    }else{
        /* Allocate a new page */
        pPage = lhNewPage(pEngine,pRaw,pMaster);
        if( pPage == 0 ){
            /* Nothing is attached to the raw page: give our reference back */
            pEngine->pIo->xPageUnref(pRaw);
            return UNQLITE_NOMEM;
        }
        /* Process the page */
        rc = lhParsePageHeader(pPage);
        if( rc == UNQLITE_OK ){
            /* Load cells */
            rc = lhLoadCells(pPage);
        }
        if( rc != UNQLITE_OK ){
            pEngine->pIo->xPageUnref(pPage->pRaw); /* pPage will be released inside this call */
            return rc;
        }
        if( pPage->sHdr.iSlave > 0 && iNest < 128 ){
            if( pMaster == 0 ){
                pMaster = pPage;
            }
            /* Slave page. Not a fatal error if something goes wrong here */
            lhLoadPage(pEngine,pPage->sHdr.iSlave,pMaster,0,iNest + 1);
        }
    }
    if( ppOut ){
        *ppOut = pPage;
    }
    return UNQLITE_OK;
}
/*
 * Given a cell, consume its key by invoking the given callback for each
 * extracted chunk.
 *
 * pCell      Target cell.
 * xConsumer  Callback invoked with (chunk, chunk length, pUserData); any
 *            return value other than UNQLITE_OK aborts the operation.
 * pUserData  Last argument to xConsumer().
 * offt_only  When non-zero and the key lives on overflow pages, only the
 *            data page number and data offset are fetched (stored in
 *            pCell->iDataPage / pCell->iDataOfft) and no key byte is
 *            consumed.
 *
 * Without overflow page the key follows the cell header in the page.
 * Otherwise each overflow page starts with the 8-byte number of the next
 * overflow page; the first one additionally stores the 8-byte data page
 * number and the 2-byte data offset before the key bytes. The amount of key
 * bytes available on a page is therefore computed from the actual read
 * position, so the first page is never read past its end.
 * Returns UNQLITE_OK on success, UNQLITE_ABORT when the consumer aborts,
 * or the pager error when an overflow page cannot be acquired.
 */
static int lhConsumeCellkey(
    lhcell *pCell, /* Target cell */
    int (*xConsumer)(const void *,unsigned int,void *), /* Consumer callback */
    void *pUserData, /* Last argument to xConsumer() */
    int offt_only
    )
{
    lhpage *pPage = pCell->pPage;
    const unsigned char *zRaw = pPage->pRaw->zData;
    const unsigned char *zPayload;
    int rc;
    /* Point to the payload area */
    zPayload = &zRaw[pCell->iStart];
    if( pCell->iOvfl == 0 ){
        /* Best scenario, consume the key directly without any overflow page */
        zPayload += L_HASH_CELL_SZ;
        rc = xConsumer((const void *)zPayload,pCell->nKey,pUserData);
        if( rc != UNQLITE_OK ){
            rc = UNQLITE_ABORT;
        }
    }else{
        lhash_kv_engine *pEngine = pPage->pHash;
        sxu32 nByte,nData = pCell->nKey;
        unqlite_page *pOvfl;
        int data_offset = 0;
        pgno iOvfl;
        /* Overflow page */
        iOvfl = pCell->iOvfl;
        for(;;){
            if( iOvfl == 0 || nData < 1 ){
                /* no more overflow page */
                break;
            }
            /* Point to the overflow page */
            rc = pEngine->pIo->xGet(pEngine->pIo->pHandle,iOvfl,&pOvfl);
            if( rc != UNQLITE_OK ){
                return rc;
            }
            /* Skip the link to the next overflow page */
            zPayload = &pOvfl->zData[8];
            if( !data_offset ){
                /* First overflow page: get the data page and offset */
                SyBigEndianUnpack64(zPayload,&pCell->iDataPage);
                zPayload += 8;
                SyBigEndianUnpack16(zPayload,&pCell->iDataOfft);
                zPayload += 2;
                if( offt_only ){
                    /* Key too large, grab the data offset and return */
                    pEngine->pIo->xPageUnref(pOvfl);
                    return UNQLITE_OK;
                }
                data_offset = 1;
            }
            /* Key bytes available on this page, counted from the current read position */
            nByte = (sxu32)(&pOvfl->zData[pEngine->iPageSize] - zPayload);
            /* Consume the key */
            if( nData <= nByte ){
                rc = xConsumer((const void *)zPayload,nData,pUserData);
                if( rc != UNQLITE_OK ){
                    pEngine->pIo->xPageUnref(pOvfl);
                    return UNQLITE_ABORT;
                }
                nData = 0;
            }else{
                rc = xConsumer((const void *)zPayload,nByte,pUserData);
                if( rc != UNQLITE_OK ){
                    pEngine->pIo->xPageUnref(pOvfl);
                    return UNQLITE_ABORT;
                }
                nData -= nByte;
            }
            /* Next overflow page in the chain */
            SyBigEndianUnpack64(pOvfl->zData,&iOvfl);
            /* Unref the page */
            pEngine->pIo->xPageUnref(pOvfl);
        }
        rc = UNQLITE_OK;
    }
    return rc;
}
/*
 * Given a cell, consume its data by invoking the given callback for each
 * extracted chunk.
 * Without overflow page the data follows the key right after the cell
 * header. Otherwise the data starts at offset pCell->iDataOfft of page
 * pCell->iDataPage and continues on the chained overflow pages, each of
 * which starts with the 8-byte number of the next page.
 * Returns UNQLITE_OK on success, UNQLITE_ABORT when the consumer returns
 * anything but UNQLITE_OK, or the pager error when an overflow page cannot
 * be acquired.
 */
static int lhConsumeCellData(
    lhcell *pCell, /* Target cell */
    int (*xConsumer)(const void *,unsigned int,void *), /* Data consumer callback */
    void *pUserData /* Last argument to xConsumer() */
    )
{
    lhpage *pPage = pCell->pPage;
    lhash_kv_engine *pEngine;
    unqlite_page *pOvfl;
    const unsigned char *zChunk;
    sxu64 nLeft;
    sxu32 nAvail,nChunk;
    pgno iCur;
    int bFirst = 1;
    int rc;
    if( pCell->iOvfl == 0 ){
        /* Local payload: the data sits right behind the cell header and the key */
        zChunk = &pPage->pRaw->zData[pCell->iStart];
        zChunk += L_HASH_CELL_SZ + pCell->nKey;
        rc = xConsumer((const void *)zChunk,(sxu32)pCell->nData,pUserData);
        return rc != UNQLITE_OK ? UNQLITE_ABORT : UNQLITE_OK;
    }
    pEngine = pPage->pHash;
    nLeft = pCell->nData;
    /* Overflow page where data is stored */
    iCur = pCell->iDataPage;
    while( iCur != 0 && nLeft >= 1 ){
        /* Point to the overflow page */
        rc = pEngine->pIo->xGet(pEngine->pIo->pHandle,iCur,&pOvfl);
        if( rc != UNQLITE_OK ){
            return rc;
        }
        zChunk = pOvfl->zData;
        if( bFirst ){
            /* The data starts somewhere inside the first page */
            zChunk += pCell->iDataOfft;
            nAvail = pEngine->iPageSize - pCell->iDataOfft;
            bFirst = 0;
        }else{
            /* Following pages: skip the link, the rest is payload */
            zChunk += 8;
            nAvail = L_HASH_OVERFLOW_SIZE(pEngine->iPageSize);
        }
        /* Take whatever is smaller: what remains or what this page offers */
        nChunk = ( nLeft <= (sxu64)nAvail ) ? (sxu32)nLeft : nAvail;
        if( nChunk > 0 ){
            rc = xConsumer((const void *)zChunk,nChunk,pUserData);
            if( rc != UNQLITE_OK ){
                pEngine->pIo->xPageUnref(pOvfl);
                return UNQLITE_ABORT;
            }
            nLeft -= nChunk;
        }
        /* Next overflow page in the chain */
        SyBigEndianUnpack64(pOvfl->zData,&iCur);
        /* Unref the page */
        pEngine->pIo->xPageUnref(pOvfl);
    }
    return UNQLITE_OK;
}
/*
 * Read the linear hash header (Page one of the database) and load the
 * whole bucket map in memory.
 *
 * Header layout (big-endian): 4-byte magic number, 4-byte hash of
 * L_HASH_WORD (identifies the hash function), 8-byte free list, 8-byte
 * current split bucket, 8-byte maximum split bucket, 8-byte number of the
 * next map page, 4-byte record count of the map stored on this page,
 * followed by the map records themselves.
 *
 * Chained map pages are only needed while their records are copied in
 * memory, so the reference obtained from the pager is dropped as soon as
 * each page has been processed.
 * Returns UNQLITE_OK on success, UNQLITE_CORRUPT on a bad magic number,
 * UNQLITE_INVALID when the database was created with another hash
 * function, or any error reported by the pager or by lhMapLoadPage().
 */
static int lhash_read_header(lhash_kv_engine *pEngine,unqlite_page *pHeader)
{
    const unsigned char *zRaw = pHeader->zData;
    lhash_bmap_page *pMap;
    sxu32 nHash;
    int rc;
    pEngine->pHeader = pHeader;
    /* 4 byte magic number */
    SyBigEndianUnpack32(zRaw,&pEngine->nMagic);
    zRaw += 4;
    if( pEngine->nMagic != L_HASH_MAGIC ){
        /* Corrupt implementation */
        return UNQLITE_CORRUPT;
    }
    /* 4 byte hash value to identify a valid hash function */
    SyBigEndianUnpack32(zRaw,&nHash);
    zRaw += 4;
    /* Sanity check */
    if( pEngine->xHash(L_HASH_WORD,sizeof(L_HASH_WORD)-1) != nHash ){
        /* Different hash function */
        pEngine->pIo->xErr(pEngine->pIo->pHandle,"Invalid hash function");
        return UNQLITE_INVALID;
    }
    /* List of free pages */
    SyBigEndianUnpack64(zRaw,&pEngine->nFreeList);
    zRaw += 8;
    /* Current split bucket */
    SyBigEndianUnpack64(zRaw,&pEngine->split_bucket);
    zRaw += 8;
    /* Maximum split bucket */
    SyBigEndianUnpack64(zRaw,&pEngine->max_split_bucket);
    zRaw += 8;
    /* Next generation */
    pEngine->nmax_split_nucket = pEngine->max_split_bucket << 1;
    /* Initialize the bucket map */
    pMap = &pEngine->sPageMap;
    /* Fill in the structure */
    pMap->iNum = pHeader->pgno;
    /* Next page in the bucket map */
    SyBigEndianUnpack64(zRaw,&pMap->iNext);
    zRaw += 8;
    /* Total number of records in the bucket map (This page only) */
    SyBigEndianUnpack32(zRaw,&pMap->nRec);
    zRaw += 4;
    pMap->iPtr = (sxu16)(zRaw - pHeader->zData);
    /* Load the map in memory */
    rc = lhMapLoadPage(pEngine,pMap,pHeader->zData);
    if( rc != UNQLITE_OK ){
        return rc;
    }
    /* Load the bucket map chain if any */
    for(;;){
        pgno iNext = pMap->iNext;
        unqlite_page *pPage;
        if( iNext == 0 ){
            /* No more map pages */
            break;
        }
        /* Point to the target page */
        rc = pEngine->pIo->xGet(pEngine->pIo->pHandle,iNext,&pPage);
        if( rc != UNQLITE_OK ){
            return rc;
        }
        /* Fill in the structure */
        pMap->iNum = iNext;
        pMap->iPtr = 0;
        /* Load the map in memory */
        rc = lhMapLoadPage(pEngine,pMap,pPage->zData);
        /* The records are now held in memory: release our reference on the raw page */
        pEngine->pIo->xPageUnref(pPage);
        if( rc != UNQLITE_OK ){
            return rc;
        }
    }
    /* All done */
    return UNQLITE_OK;
}
/*
* Perform a record lookup.
*/
static int lhRecordLookup(
lhash_kv_engine *pEngine, /* KV storage engine */
const void *pKey, /* Lookup key */
sxu32 nByte, /* Key length */
lhcell **ppCell /* OUT: Target cell on success */
)
{
lhash_bmap_rec *pRec;
lhpage *pPage;
lhcell *pCell;
pgno iBucket;
sxu32 nHash;
int rc;
/* Acquire the first page (hash Header) so that everything gets loaded autmatically */
rc = pEngine->pIo->xGet(pEngine->pIo->pHandle,1,0);
if( rc != UNQLITE_OK ){
return rc;
}
/* Compute the hash of the key first */
nHash = pEngine->xHash(pKey,nByte);
/* Extract the logical (i.e. not real) page number */
iBucket = nHash & (pEngine->nmax_split_nucket - 1);
if( iBucket >= (pEngine->split_bucket + pEngine->max_split_bucket) ){
/* Low mask */
iBucket = nHash & (pEngine->max_split_bucket - 1);
}
/* Map the logical bucket number to real page number */
pRec = lhMapFindBucket(pEngine,iBucket);
if( pRec == 0 ){
/* No such entry */
return UNQLITE_NOTFOUND;
}
/* Load the master page and it's slave page in-memory */
rc = lhLoadPage(pEngine,pRec->iReal,0,&pPage,0);
if( rc != UNQLITE_OK ){
/* IO error, unlikely scenario */
return rc;
}
/* Lookup for the cell */
pCell = lhFindCell(pPage,pKey,nByte,nHash);
if( pCell == 0 ){
/* No such entry */
return UNQLITE_NOTFOUND;
}
if( ppCell ){
*ppCell = pCell;