This repository was archived by the owner on Apr 30, 2020. It is now read-only.
forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathBigFile.cpp
1970 lines (1624 loc) · 60.8 KB
/
BigFile.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "BigFile.h"
#include "File.h"
#include "Dir.h"
#include "Conf.h"
#include "JobScheduler.h"
#include "Stats.h"
#include "Sanity.h"
#include "GbMutex.h"
#include "ScopedLock.h"
#include "Mem.h"
#include "Statistics.h"
#include "Errno.h"
#include "fctypes.h"
#include <fcntl.h>
#include <new>
#include <vector>
#include <pthread.h>
#include <atomic>
#include <unistd.h>
// Counter consulted by BigFile::anyOngoingUnlinksOrRenames().
// main.cpp will wait for this to be zero before exiting so all unlink/renames
// can complete
static std::atomic<unsigned> g_unlinkRenameThreads(0);
// Forward declarations of the file-local i/o helpers defined further down.
// BigFile::readwrite() hands readwriteWrapper_r (job body) and
// readwriteDoneWrapper (completion callback) to g_jobScheduler.submit_io().
static void readwriteWrapper_r ( void *state );
static void readwriteDoneWrapper(void *state, job_exit_t exit_type);
// Performs the actual transfer described by a FileState; called both from
// the job body and from the blocking path of BigFile::readwrite().
static bool readwrite_r ( FileState *fstate );
// A set (kept as a plain list) of filenames that we intend to unlink or
// rename (the source name). It is needed to prevent queued read operations
// from working on deleted files.
struct UnlinkFilename {
// NUL-terminated filename; addPendingUnlink() truncates names that do not fit
char filename[1024];
};
// The pending entries. isPendingUnlink(), addPendingUnlink() and
// removePendingUnlink() all hold s_pending_mtx while touching this list.
static std::vector<UnlinkFilename> s_pendingFileMetaOperations;
static GbMutex s_pending_mtx;
static bool isPendingUnlink(const char *filename) {
ScopedLock sl(s_pending_mtx);
for(std::vector<UnlinkFilename>::const_iterator iter=s_pendingFileMetaOperations.begin(); iter!=s_pendingFileMetaOperations.end(); ++iter) {
if(strcmp(iter->filename,filename)==0)
return true;
}
return false;
}
static void addPendingUnlink(const char *filename) {
ScopedLock sl(s_pending_mtx);
//we cannot have two simultaenous operations on a file
for(std::vector<UnlinkFilename>::const_iterator iter=s_pendingFileMetaOperations.begin(); iter!=s_pendingFileMetaOperations.end(); ++iter) {
if(strcmp(iter->filename,filename)==0)
gbshutdownLogicError();
}
UnlinkFilename ruf;
strncpy(ruf.filename, filename, sizeof(ruf.filename)-1);
ruf.filename[ sizeof(ruf.filename)-1 ] = '\0';
s_pendingFileMetaOperations.push_back(ruf);
}
static void removePendingUnlink(const char *filename) {
ScopedLock sl(s_pending_mtx);
//double-remove is allowed.
for(std::vector<UnlinkFilename>::iterator iter=s_pendingFileMetaOperations.begin(); iter!=s_pendingFileMetaOperations.end(); ++iter) {
if(strcmp(iter->filename,filename)==0) {
s_pendingFileMetaOperations.erase(iter);
return;
}
}
}
// True while the file-level unlink/rename counter is non-zero.
bool BigFile::anyOngoingUnlinksOrRenames() {
	const unsigned inFlight = g_unlinkRenameThreads;
	return inFlight != 0;
}
// Destructor: all cleanup is delegated to close().
BigFile::~BigFile () {
	this->close();
}
// Constructs an empty BigFile: no parts, no virtual fd, size and
// modification time unknown. The default open flags are O_RDWR
// (O_DIRECT is currently not used).
BigFile::BigFile ()
	: m_unlinkJobsBeingSubmitted(false),
	  m_outstandingUnlinkJobCount(0),
	  m_renameP1JobsBeingSubmitted(false),
	  m_outstandingRenameP1JobCount(0),
	  m_renameP2JobsBeingSubmitted(false),
	  m_outstandingRenameP2JobCount(0),
	  m_latestsRenameP1Errno(0),
	  m_mtxMetaJobs(),
	  m_flushingIsApplicable(false)
{
	// no parts registered yet
	m_numParts = 0;
	m_maxParts = 0;
	// -1 means "not assigned" / "not computed yet" for these
	m_vfd          = -1;
	m_fileSize     = -1;
	m_lastModified = -1;
	m_flags     = O_RDWR;
	m_isClosing = false;
	// no pending callback (also keeps Coverity quiet)
	m_state    = NULL;
	m_callback = NULL;
}
// Dumps the state of this BigFile to the log, one member per line.
// "log_type" is passed straight through to log()/loghex().
void BigFile::logAllData(int32_t log_type)
{
	log(log_type, "Dumping BigFile at %p", (void*)this);

	struct tm tm_buf;
	// localtime_r() returns NULL when the timestamp cannot be converted
	struct tm *stm = localtime_r(&m_lastModified,&tm_buf);

	log(log_type, "m_flags................: %" PRId32, m_flags);
	log(log_type, "m_maxParts.............: %" PRId32, m_maxParts);
	log(log_type, "m_numParts.............: %d", m_numParts);
	log(log_type, "m_vfd..................: %" PRId32, m_vfd);
	log(log_type, "m_fileSize.............: %" PRId64, m_fileSize);
	if ( stm ) {
		log(log_type, "m_lastModified.........: %04d%02d%02d-%02d%02d%02d", stm->tm_year+1900,stm->tm_mon+1,stm->tm_mday,stm->tm_hour,stm->tm_min,stm->tm_sec);
	} else {
		// do not dereference a NULL tm; fall back to the raw value
		log(log_type, "m_lastModified.........: (unconvertible) %" PRId64, (int64_t)m_lastModified);
	}
	log(log_type, "m_outstandingUnlinkJobCount: %d", m_outstandingUnlinkJobCount);
	log(log_type, "m_outstandingRenameP1JobCount: %d", m_outstandingRenameP1JobCount);
	log(log_type, "m_outstandingRenameP2JobCount: %d", m_outstandingRenameP2JobCount);
	log(log_type, "m_isClosing............: [%s]", m_isClosing?"true":"false");

	// SafeBufs
	loghex( log_type, m_dir.getBufStart(), m_dir.length(), "m_dir..................: (hex dump)");
	loghex( log_type, m_baseFilename.getBufStart(), m_baseFilename.length(), "m_baseFilename.........: (hex dump)");
	loghex( log_type, m_newBaseFilename.getBufStart(), m_newBaseFilename.length(), "m_newBaseFilename......: (hex dump)");
	loghex( log_type, m_newBaseFilenameDir.getBufStart(), m_newBaseFilenameDir.length(),"m_newBaseFilenameDir...: (hex dump)");

	log(log_type, "g_unlinkRenameThreads..: %u", (unsigned)g_unlinkRenameThreads);
}
// . (re)binds this BigFile to "dir"/"baseFilename" and registers the part
//   files found in "dir" via addParts()
// . returns false and sets g_errno on error
bool BigFile::set(const char *dir, const char *baseFilename) {
	logTrace( g_conf.m_logTraceBigFile, "BEGIN. dir [%s] baseFilename [%s]",dir, baseFilename);

	// forget anything cached about the previous file
	m_fileSize = -1;
	m_lastModified = -1;
	m_flushingIsApplicable = false;

	// start the directory name from scratch
	m_dir.reset();
	m_dir.setLabel("bfd");
	const bool dirCopied = m_dir.safeStrcpy( dir );
	// ... and likewise the base filename
	m_baseFilename.reset();
	m_baseFilename.setLabel("bfbf");
	if ( ! dirCopied ) {
		logTrace( g_conf.m_logTraceBigFile, "END. Return false, m_dir.safeStrcpy failed" );
		return false;
	}

	const bool nameCopied = m_baseFilename.safeStrcpy( baseFilename );
	if ( ! nameCopied ) {
		logTrace( g_conf.m_logTraceBigFile, "END. Return false, m_baseFilename.safeStrcpy failed" );
		return false;
	}

	// drop the old part table
	m_maxParts = 0;
	m_numParts = 0;
	m_filePtrsBuf.reset();

	// discover the parts that exist in the directory
	if ( addParts ( dir ) ) {
		logTrace( g_conf.m_logTraceBigFile, "END. Return true - OK" );
		return true;
	}

	log(LOG_WARN,"%s:%s:%d: END. addParts failed", __FILE__, __func__, __LINE__ );
	return false;
}
// . RdbMap calls reset() on its BigFile, so the part Files (and the
//   buffers holding their filename and dir) are released through close()
// . the directory is deliberately NOT re-scanned for parts here
// . always returns true
bool BigFile::reset ( ) {
	close ();
	// invalidate the cached size and modification time
	m_lastModified = -1;
	m_fileSize     = -1;
	return true;
}
bool BigFile::addParts ( const char *dirname ) {
logTrace( g_conf.m_logTraceBigFile, "BEGIN. dirname [%s]", dirname);
// if dirname is NULL return true
if ( ! dirname || ! dirname[0] ) {
logTrace( g_conf.m_logTraceBigFile, "END - No dirname" );
return true;
}
// . now set the names of all the Files that we consist of
// . get the directory entry and find out what parts we have
Dir dir;
dir.set ( dirname );
// set our directory class
if ( !dir.open() ) {
log( LOG_ERROR, "disk: openDir ('%s') failed", dirname );
return false;
}
// match files with this pattern in the directory
char pattern[256];
sprintf(pattern,"%s*", m_baseFilename.getBufStart() );
// length of the base filename
int32_t blen = strlen ( m_baseFilename.getBufStart() );
// . set our m_files array
// . addFile() will return false on problems
// . the lower the fileId the older the file (w/ exception of #0)
logTrace( g_conf.m_logTraceBigFile, "Look for [%s]", pattern);
const char *filename;
while ( ( filename = dir.getNextFilename ( pattern ) ) ) {
logTrace( g_conf.m_logTraceBigFile, " Checking [%s]", filename);
// if filename len is exactly blen it's part 0
int32_t flen = strlen(filename);
int32_t part = -1;
if ( flen == blen ) {
part = 0;
// some files have the same first X chars, like
// indexdb.store-info-bak but are not part files
logTrace( g_conf.m_logTraceBigFile, " Default to part 0" );
} else if ( flen > blen && strncmp(filename+blen,".part",5)!=0) {
logTrace( g_conf.m_logTraceBigFile, " No good." );
continue;
} else if (flen - blen < 6 ) {
log( LOG_WARN, "disk: Part extension too small for '%s'. Must end in .partN to be valid.", filename );
continue;
} else {
part = atoi ( filename + blen + 5 );
logTrace( g_conf.m_logTraceBigFile, " Detected part %" PRId32, part);
}
// make this part file
if( !addPart( part ) ) {
log( LOG_ERROR,"%s:%s:%d: END. addPart failed, returning false.", __FILE__, __func__, __LINE__ );
return false;
}
}
logTrace( g_conf.m_logTraceBigFile, "END - OK" );
return true;
}
// WE CAN'T REALLOC the safebuf because there might be a thread
// referencing the file ptr. so let's just keep the m_filePtrs[] array
// and realloc on that.
bool BigFile::addPart ( int32_t n ) {
logTrace( g_conf.m_logTraceBigFile, "BEGIN n [%" PRId32"] filename [%s]", n, getFilename());
// . grow our dynamic array and return ptr to last element
// . n's come in NOT necessarily in order!!!
int32_t need = (n+1) * sizeof(File *);
// how much more mem do we need?
int32_t delta = need - m_filePtrsBuf.length();
// . make sure our CAPACITY is increased by what we need
// . SafeBuf::reserve() ADDS this much to current capacity
// . true = clear new mem new new file ptrs are null because
// there may be gaps or not exist because the BigFile was being
// merged.
if ( delta > 0 && ! m_filePtrsBuf.reserve ( delta ,"bfbuf", true ) ) {
log(LOG_ERROR, "%s:%s:%d: Failed to reserve %" PRId32" more mem for part", __FILE__, __func__, __LINE__, delta);
logAllData(LOG_ERROR);
return false;
}
// make length the capacity. so if buf is resized in call to
// SafeBuf::reserve() it will copy over all of the old buf to new buf
m_filePtrsBuf.setLength ( m_filePtrsBuf.getCapacity() );
File **filePtrs = (File **)m_filePtrsBuf.getBufStart();
File *f = NULL;
try {
f = new (File);
} catch(std::bad_alloc&) {
g_errno = ENOMEM;
//### BR 20151217: Fix. Previously returned the return code from log(...)
logError("new failed. size: %i, err [%s]", (int)sizeof(File), mstrerror(g_errno));
logAllData(LOG_ERROR);
return false;
}
mnew ( f , sizeof(File) , "BigFile" );
char buf[1024];
// make the filename for this new File class
makeFilename_r(m_baseFilename.getBufStart(), NULL, n, buf, sizeof(buf));
// and set it with that
f->set ( buf );
// store the ptr to it in m_filePtrs
filePtrs [ n ] = f;
++m_numParts;
// set maxPart
if ( n+1 > m_maxParts ) {
m_maxParts = n+1;
logTrace( g_conf.m_logTraceBigFile, "New m_maxParts: %" PRId32, m_maxParts );
}
logTrace( g_conf.m_logTraceBigFile, "END - OK. New File object prepared. returning true" );
return true;
}
// A BigFile "exists" as soon as at least one part has been registered.
bool BigFile::doesExist() const {
	const bool hasParts = ( m_numParts != 0 );
	return hasParts;
}
// A part exists when it lies below m_maxParts and the part table holds a
// File object for it (getFile2() yields NULL for a missing part).
bool BigFile::doesPartExist ( int32_t n ) {
	if ( n >= m_maxParts ) {
		return false;
	}
	return getFile2(n) != NULL;
}
// last "virtual fd" handed out; bumped whenever a BigFile needs a new one
static int64_t s_vfd = 0;

// . records the flags that the part files will later be opened with and
//   clears the closing flag; no file is actually opened here
// . assigns this BigFile a "virtual fd" (not the same as File::m_vfd) the
//   first time it is opened
// . always returns true
bool BigFile::open(int flags) {
	logTrace(g_conf.m_logTraceBigFile, "BEGIN. flag=%d", flags);

	m_isClosing = false;
	m_flags = flags;

	// hand out a virtual fd only once per BigFile
	const bool needsVfd = ( m_vfd == -1 );
	if ( needsVfd ) {
		s_vfd += 1;
		m_vfd = s_vfd;
	}

	logTrace(g_conf.m_logTraceBigFile, "END");
	return true;
}
// . builds the full path of part "partNum" into "buf" (of "bufSize" bytes)
// . part 0 is "<dir>/<baseFilename>", part N is "<dir>/<baseFilename>.partN"
// . "baseFilenameDir" overrides m_dir when it is non-NULL and non-empty
// . a path that does not fit (or cannot be formatted) is a fatal logic error
void BigFile::makeFilename_r(const char *baseFilename, const char *baseFilenameDir,
			     int32_t partNum,
			     char *buf, int32_t bufSize) const {
	const char *dir;
	if(baseFilenameDir && baseFilenameDir[0])
		dir = baseFilenameDir;
	else
		dir = m_dir.getBufStart();

	int32_t r;
	if(partNum == 0) {
		r = snprintf ( buf, bufSize, "%s/%s",dir,baseFilename);
	} else {
		r = snprintf ( buf, bufSize, "%s/%s.part%" PRId32,dir,baseFilename,partNum);
	}

	// snprintf() returns the untruncated length, or a negative value on
	// an output error; only a non-negative length below bufSize means the
	// whole path (plus terminator) made it into buf
	if ( r >= 0 && r < bufSize ) return;

	// truncation is bad
	gbshutdownLogicError();
}
// . get the fd of the nth file
// . will try to open the file if it hasn't yet been opened
int BigFile::getfd ( int32_t n , bool forReading ) {
// boundary check
if ( n >= m_maxParts && ! addPart ( n ) ) {
log( LOG_ERROR, "disk: Part number %" PRId32" > %" PRId32". fd not available.", n, m_maxParts );
// return -1 to indicate can't do it
return -1;
}
// get the File ptr from the table
File *f = getFile2(n);
// if part does not exist then create it! addPart(n) will do that?
if (!f) {
// don't create File if we're getting it for reading
if (forReading) {
log( LOG_WARN, "disk: Don't create file when we're getting it for reading" );
return -1;
}
if (!addPart(n)) {
log(LOG_WARN, "disk: Unable to add part %" PRId32, n);
return -1;
}
f = getFile2(n);
if (!f) {
log(LOG_WARN, "disk: Unable to get part %" PRId32, n);
return -1;
}
}
// open it if not opened
if (!f->calledOpen()) {
if (!f->open(m_flags)) {
log(LOG_WARN, "disk: Failed to open file part #%" PRId32".", n);
return -1;
}
}
// get it's file descriptor
int fd = f->getfd();
if (fd >= -1) {
return fd;
}
// otherwise, fd is -2 and it's never been opened?!?!
g_errno = EBADENGINEER;
log( LOG_LOGIC, "disk: fd is -2." );
return -1;
}
// . return -2 on error
// . return -1 if does not exist
// . otherwise return the big file's complete file size (can be well over 2gb)
int64_t BigFile::getFileSize() const {
// return if already computed
if ( m_fileSize >= 0 ) {
return m_fileSize;
}
// add up the sizes of each file
int64_t totalSize = 0;
for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
const File *f = getFile2(n);
// we can have headless big files... count the heads.
// this can happen if the first Files were deleted because
// of an ongoing merge operation.
if ( ! f ) {
totalSize += MAX_PART_SIZE;
continue;
}
// . returns -2 on error, -1 if does not exist
// . TODO: it returns 0 if does not exist! FIX...
int64_t size = f->getFileSize();
if ( size == -2 ) return -2;
if ( size == -1 ) break;
totalSize += size;
}
// save time
m_fileSize = totalSize;
return totalSize;
}
// . return -2 on error
// . return -1 if does not exist
// . otherwise returns the oldest of the last mod dates of all the part files
time_t BigFile::getLastModifiedTime ( ) {
// return if already computed
if ( m_lastModified >= 0 ) return m_lastModified;
// add up the sizes of each file
time_t min = -1;
for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
File *f = getFile2(n);
// we can have headless big files... count the heads
if ( ! f ) continue;
// returns -1 on error, 0 if file does not exist
time_t date = f->getLastModifiedTime();
if ( date == -1 ) return -2;
if ( date == 0 ) break;
// check min
if ( date < min || min == -1 ) min = date;
}
// save time
m_lastModified = min;
return m_lastModified;
}
// . reads "size" bytes at "offset"; thin wrapper around readwrite()
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . "allocOff" is passed through to readwrite(); see there
bool BigFile::read ( void       *buf    ,
		     int64_t     size   ,
		     int64_t     offset ,
		     FileState  *fs     ,
		     void       *state  ,
		     void      (* callback)(void *state) ,
		     int32_t     niceness ,
		     int32_t     allocOff ) {
	// start from a clean error state
	g_errno = 0;
	const bool isWrite = false;
	return readwrite ( buf , size , offset , isWrite ,
			   fs , state , callback , niceness , allocOff );
}
// . writes "size" bytes at "offset"; thin wrapper around readwrite()
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . in read-only mode nothing is written and true is returned
bool BigFile::write ( const void *buf,
		      int64_t     size   ,
		      int64_t     offset ,
		      FileState  *fs     ,
		      void       *state  ,
		      void      (* callback)(void *state) ,
		      int32_t     niceness) {
	if ( ! g_conf.m_readOnlyMode ) {
		g_errno = 0;
		// readwrite() takes a non-const buffer because it serves reads too
		void *rawBuf = const_cast<void*>(buf);
		return readwrite ( rawBuf , size , offset , true/*doWrite?*/ ,
				   fs , state , callback , niceness , 0 );
	}
	// sanity check failed: writes are refused in read-only mode
	logf(LOG_DEBUG,"disk: BigFile: Trying to write while in read only mode.");
	return true;
}
// . shared implementation behind read() and write()
// . returns false if blocked (an i/o job was queued and "callback" will be
//   called when it is done), true otherwise
// . sets g_errno on error
// . a transfer may span two part files, which is why two part numbers, fds
//   and filenames are tracked in the FileState
// . "fstate" is mandatory when "callback" is given; without a callback the
//   transfer is done blocking and a FileState on the stack is used if the
//   caller supplied none
// . "buf" may be NULL for reads; the buffer is then allocated right before
//   the read is performed and "allocOff" bytes of padding are left in front
//   of the data
bool BigFile::readwrite ( void         *buf      ,
			  int64_t       size     ,
			  int64_t       offset   ,
			  bool          doWrite  ,
			  FileState    *fstate   ,
			  void         *state    ,
			  void        (* callback) ( void *state ) ,
			  int32_t       niceness ,
			  int32_t       allocOff ) {
	// if we're non blocking and caller didn't supply an "fstate"
	if ( callback && ! fstate ) {
		g_errno = EBADENGINEER;
		log(LOG_LOGIC,"disk: readwrite() call is "
		    "specified as non-blocking, but no state provided.");
		return true;
	}

	// a write invalidates the cached file size and bumps the mod time
	if ( doWrite ) {
		m_fileSize     = -1;
		m_lastModified = getTime();
	}

	// . sanity check
	// . when our offset was just a int32_t 2gig+ files, when dumped,
	//   had negative offsets, bad engineer
	if ( offset < 0 ) {
		log(LOG_LOGIC,"disk: readwrite() offset is %" PRId64" "
		    "< 0. filename=%s/%s. dumping core. try deleting "
		    "the .map file for it and restarting.",offset,
		    m_dir.getBufStart(),m_baseFilename.getBufStart());
		gbshutdownLogicError();
	}

	// if we're not blocking use a fake fstate
	FileState tmp;
	if ( ! fstate ) {
		fstate = &tmp;
	}

	// set up fstate
	fstate->m_errno   = 0;
	fstate->m_bigfile = this;
	// may be NULL, see the function comment
	fstate->m_buf       = (char *)buf;
	fstate->m_allocBuf  = NULL;
	fstate->m_allocSize = 0;
	// where to start storing the read within a buffer allocated for the
	// caller. RdbScan needs padding in front because it may have to turn
	// a leading 6 byte half key into a 12 byte key.
	fstate->m_allocOff  = allocOff;
	fstate->m_bytesToGo = size;
	fstate->m_offset    = offset;
	fstate->m_doWrite   = doWrite;
	fstate->m_bytesDone = 0;
	fstate->m_state     = state;
	fstate->m_callback  = callback;
	fstate->m_niceness  = niceness;
	fstate->m_flags     = m_flags;
	fstate->m_flushAfterWrite = g_conf.m_flushWrites && m_flushingIsApplicable;

	// sanity
	if ( fstate->m_bytesToGo > 150000000 ) {
		log( LOG_WARN, "file: huge read of %" PRId64" bytes", ( int64_t ) size );
	}

	// part files holding the first and the last byte of the transfer
	fstate->m_filenum1 = offset / MAX_PART_SIZE;
	fstate->m_filenum2 = (offset + size ) / MAX_PART_SIZE;

	// . for reads the fds are resolved as late as possible, right before
	//   the transfer is performed; -3 marks them as "not resolved yet"
	// . the close count of an fd is sampled after it has been resolved so
	//   that a re-use of the fd for another file can be detected
	fstate->m_fd1 = -3;
	fstate->m_fd2 = -3;

	// . if we are writing, prevent these fds from being closed on us
	//   by File::closedLeastUsed(), because the fd could then be re-opened
	//   by someone else doing a write and we end up writing to THAT FILE!
	// . the closeCount mechanism helps us DETECT when something like this
	//   happens, but it will not prevent the write from going through
	if ( doWrite ) {
		// actually have to do the open here for writing so it
		// can prevent the fds from being closed on us
		fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite);
		fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite);
		enterWriteMode( fstate->m_fd1 );
		enterWriteMode( fstate->m_fd2 );
		fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
		fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
	}

	// . grab the filenames of the associated files so we later can check
	//   for pending deletion
	// . a part below m_maxParts can still be missing (headless big file,
	//   gaps in the part table), in which case getFile2() returns NULL;
	//   leave the name empty then instead of dereferencing NULL
	fstate->m_filename1[0] = '\0';
	if ( fstate->m_filenum1 < m_maxParts ) {
		File *f1 = getFile2(fstate->m_filenum1);
		if ( f1 ) {
			strcpy(fstate->m_filename1, f1->getFilename());
		}
	}
	fstate->m_filename2[0] = '\0';
	if ( fstate->m_filenum2 < m_maxParts ) {
		File *f2 = getFile2(fstate->m_filenum2);
		if ( f2 ) {
			strcpy(fstate->m_filename2, f2->getFilename());
		}
	}

	fstate->m_errno     = 0;
	fstate->m_startTime = gettimeofdayInMilliseconds();
	fstate->m_vfd       = m_vfd;

	if(callback && g_jobScheduler.are_new_jobs_allowed()) {
		// . queue a job to do this i/o
		// . we return false because we blocked; readwriteDoneWrapper()
		//   calls the callback on completion
		if ( g_jobScheduler.submit_io(readwriteWrapper_r, readwriteDoneWrapper, fstate, thread_type_unspecified_io, niceness, doWrite) ) {
			return false;
		}
		// job submission failed, do it blocking then
		log(LOG_INFO, "disk: Doing blocking disk access. This will hurt performance. isWrite=%" PRId32".",(int32_t)doWrite);
	}

	// come here if no job was queued: do the transfer right here
	fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite );
	fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite );
	fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
	fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );

	// clear g_errno from the failed job submission
	g_errno = 0;

	// the job body normally allocates the read buffer, so in the
	// blocking case we must do it here
	FileState *fs = fstate;
	if ( ! fs->m_doWrite && ! fs->m_buf && fs->m_bytesToGo > 0 ) {
		int64_t need = fs->m_bytesToGo + fs->m_allocOff;
		char *p = (char *) mmalloc ( need , "ThreadReadBuf" );
		if ( p ) {
			fs->m_buf       = p + fs->m_allocOff;
			fs->m_allocBuf  = p;
			fs->m_allocSize = need;
		} else {
			log( LOG_WARN, "disk: read buf alloc failed for %" PRId64" bytes.", need );
		}
	}

	// . this returns false and sets errno on error
	// . set g_errno to the errno
	if ( ! readwrite_r ( fstate ) ) {
		g_errno = errno;
	}

	// exit write mode
	if ( doWrite ) {
		exitWriteMode( fstate->m_fd1 );
		exitWriteMode( fstate->m_fd2 );
	}

	// set this up here
	fstate->m_bytesDone = fstate->m_bytesToGo;
	// and this too
	fstate->m_doneTime = gettimeofdayInMilliseconds();

	// complain about slow transfers (less than 8MB/s), but only judge
	// transfers that took long enough for the rate to be meaningful
	int64_t took = fstate->m_doneTime - fstate->m_startTime ;
	int64_t rate = 100000;
	if ( took > 500 ) rate = fstate->m_bytesDone / took ;
	if ( rate < 8000 && fstate->m_niceness <= 0 ) {
		log(LOG_INFO,"disk: Read %" PRId64" bytes in %" PRId64" "
		    "ms (%" PRId64"KB/s).",
		    fstate->m_bytesDone,took,rate);
	}

	Statistics::register_io_time(fstate->m_doWrite, g_errno, fstate->m_bytesDone, took);

	// . now log our stuff here
	// . EBADENGINEER can happen right after a merge if the file is
	//   renamed because the fd may have changed from under us
	if ( g_errno && g_errno != EBADENGINEER ) {
		log( LOG_WARN, "disk: readwrite: %s", mstrerror(g_errno));
	}

	// we didn't block so return true
	return true;
}
// . completion callback handed to g_jobScheduler.submit_io() by
//   BigFile::readwrite(); it finishes the bookkeeping in the FileState and
//   then calls the caller's callback
// . "exit_type" other than job_exit_normal means the job did not complete
//   normally; the callback is then called with m_errno set to ECLOSING
// . the BigFile behind fstate->m_bigfile may have been deleted by now, so
//   it is not touched here
void readwriteDoneWrapper(void *state, job_exit_t exit_type) {
	FileState *fstate = (FileState *)state;

	if( exit_type != job_exit_normal ) {
		log(LOG_INFO, "disk: Read canceled due to JobScheduler exit type %d.", (int)exit_type);
		// readwrite() entered write mode on both fds before it queued
		// the job. leave it here as well, otherwise the fds stay
		// flagged as "being written" for good.
		if ( fstate->m_doWrite ) {
			exitWriteMode( fstate->m_fd1 );
			exitWriteMode( fstate->m_fd2 );
		}
		// call callback with m_errno set
		fstate->m_errno = ECLOSING;
		fstate->m_callback ( fstate->m_state );
		return;
	}

	// the job is done, so account for all bytes
	fstate->m_bytesDone = fstate->m_bytesToGo;

	// exit write mode
	if ( fstate->m_doWrite ) {
		exitWriteMode( fstate->m_fd1 );
		exitWriteMode( fstate->m_fd2 );
	}

	// complain about slow transfers (less than 8MB/s), but only judge
	// transfers that took long enough for the rate to be meaningful
	int64_t took = fstate->m_doneTime - fstate->m_startTime;
	int32_t rate = 100000;
	if ( took > 500 ) rate = fstate->m_bytesDone / took ;
	bool slow = false;
	if ( rate < 8000 ) slow = true;
	if ( slow && fstate->m_niceness <= 0 ) {
		log(LOG_INFO, "disk: Read %" PRId64" bytes in %" PRId64" ms (%" PRId32"KB/s).", fstate->m_bytesDone,took,rate);
	}

	Statistics::register_io_time(fstate->m_doWrite, fstate->m_errno, fstate->m_bytesDone, took);

	// recall g_errno from state's m_errno
	g_errno = fstate->m_errno;

	// . now log our stuff here
	// . a file closed from under a read is expected now and then, so it
	//   is only logged at info level
	int32_t tt = ( g_errno == EFILECLOSED ) ? LOG_INFO : LOG_WARN;
	if ( g_errno ) {
		log( tt, "disk: err=%s. fd1=%" PRId32" fd2=%" PRId32" "
		     "off=%" PRId64" toread=%" PRId32,
		     mstrerror( g_errno ),
		     ( int32_t ) fstate->m_fd1,
		     ( int32_t ) fstate->m_fd2,
		     ( int64_t ) fstate->m_offset,
		     ( int32_t ) fstate->m_bytesToGo
		     );
	}

	// call the callback, with g_errno set if there was an error
	fstate->m_callback ( fstate->m_state );
}
// . job body handed to g_jobScheduler.submit_io() by BigFile::readwrite()
// . aborts with EFILECLOSED when one of the involved part files is marked
//   for unlink/rename, otherwise allocates the read buffer if the caller
//   passed none, resolves the fds and performs the transfer via readwrite_r()
// . errors are reported through fstate->m_errno, never through g_errno
static void readwriteWrapper_r ( void *state ) {
	int64_t time_start = gettimeofdayInMilliseconds();

	// extract our class
	FileState *fstate = (FileState *)state;

	// check if the file (part) is scheduled to be deleted. If so, abort.
	if(fstate->m_filename1[0] && isPendingUnlink(fstate->m_filename1)) {
		log(LOG_WARN,"readwriteWrapper_r: file %s is marked for unlinking; aborting read/write",fstate->m_filename1);
		fstate->m_errno = EFILECLOSED;
		fstate->m_doneTime = gettimeofdayInMilliseconds();
		return;
	}
	if(fstate->m_filename2[0] && isPendingUnlink(fstate->m_filename2)) {
		log(LOG_WARN,"readwriteWrapper_r: file %s is marked for unlinking; aborting read/write",fstate->m_filename2);
		fstate->m_errno = EFILECLOSED;
		fstate->m_doneTime = gettimeofdayInMilliseconds();
		return;
	}

	// allocate the read buffer now if the caller did not supply one
	if( !fstate->m_doWrite && !fstate->m_buf && fstate->m_bytesToGo>0 ) {
		// 64 bit on purpose: m_bytesToGo is 64 bit and a 32 bit sum
		// would truncate, giving a buffer smaller than the read
		int64_t need = (int64_t)fstate->m_allocOff + fstate->m_bytesToGo;
		char *p = (char *) mmalloc ( need , "ThreadReadBuf" );
		if ( p ) {
			fstate->m_buf       = p + fstate->m_allocOff;
			fstate->m_allocBuf  = p;
			fstate->m_allocSize = need;
		} else {
			log(LOG_WARN, "readwriteWrapper_r: read buf alloc failed for %" PRId64" bytes.", need);
		}
	}

	// resolve the fds as late as possible, i.e. right before the transfer
	fstate->m_fd1 = fstate->m_bigfile->getfd (fstate->m_filenum1,!fstate->m_doWrite);
	fstate->m_fd2 = fstate->m_bigfile->getfd (fstate->m_filenum2,!fstate->m_doWrite);

	// is this bad?
	if ( fstate->m_fd1 < 0 ) {
		log( LOG_WARN, "disk: fd1 is %i for %s", fstate->m_fd1, fstate->m_bigfile->getFilename() );
	}
	if ( fstate->m_fd2 < 0 ) {
		log( LOG_WARN, "disk: fd2 is %i for %s", fstate->m_fd2, fstate->m_bigfile->getFilename() );
	}

	// remember the close counts so a re-use of the fds can be detected
	fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
	fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );

	// clear thread's errno
	errno = 0;

	// . do the readwrite_r() since we're a thread now
	// . this SHOULD NOT set g_errno, we're a thread!
	// . it does have it's own errno however
	bool status = readwrite_r ( fstate );

	// set errno
	if ( ! status ) {
		fstate->m_errno = errno;
	}

	// . if the close count changed on us our file got unlinked from under
	//   us and another file was opened with that same fd!!!
	// . just fail the read so caller knows it is bad
	// . do not do this for writes because RdbDump can fail when writing!
	// . we can't access the BigFile here because it might have been
	//   deleted or closed on us, so only the fds are consulted
	int32_t cc1 = getCloseCount_r ( fstate->m_fd1 );
	int32_t cc2 = getCloseCount_r ( fstate->m_fd2 );
	if ( cc1 != fstate->m_closeCount1 || cc2 != fstate->m_closeCount2 ) {
		log( LOG_WARN, "file: c1a=%" PRId32" c1b=%" PRId32" c2a=%" PRId32" c2b=%" PRId32,
		     cc1, fstate->m_closeCount1, cc2, fstate->m_closeCount2 );
		if ( ! fstate->m_doWrite ) {
			fstate->m_errno = EFILECLOSED;
		} else {
			// we use s_writing[] locks in File.cpp to prevent a write
			// operation's fd from being closed under him
			log(LOG_ERROR,"PANIC: fd closed on us while writing. This should "
			    "never happen!! Simultaneous writes?");
		}
	}

	int64_t time_took = gettimeofdayInMilliseconds() - time_start;
	if ( !fstate->m_doWrite && time_took >= g_conf.m_logDiskReadTimeThreshold ) {
		log( LOG_WARN, "Disk read of %" PRId64" bytes took %" PRId64" ms", fstate->m_bytesDone, time_took );
	}

	fstate->m_doneTime = gettimeofdayInMilliseconds();
}
// . returns false and sets errno on error, true on success
// . don't log shit when you're in a thread anymore
// Use of ThreadEntry parameter is NOT thread safe
static bool readwrite_r ( FileState *fstate ) {
// if no buffer to read into the alloc in Threads.cpp failed
if ( ! fstate->m_buf ) {
errno = EBUFTOOSMALL;
log( LOG_WARN, "disk: read buf is NULL. malloc failed?");
return false;
}
// how many total bytes to write?
int64_t bytesToGo = fstate->m_bytesToGo;
// how many bytes we've written so far
int64_t bytesDone = fstate->m_bytesDone;