diff --git a/OnnxBridge/LLAMA/sytorchBackendRep.py b/OnnxBridge/LLAMA/sytorchBackendRep.py
index 6c5a2ed8..a2cd0093 100644
--- a/OnnxBridge/LLAMA/sytorchBackendRep.py
+++ b/OnnxBridge/LLAMA/sytorchBackendRep.py
@@ -108,7 +108,9 @@ def cleartext_post(code_list, program, scale, mode, indent):
         f"""
 int main(int argc, char**__argv){'{'}
-
+    std::ios::sync_with_stdio(false);
+    std::cin.tie(NULL);
+    std::cout.tie(NULL);
 
     prngWeights.SetSeed(osuCrypto::toBlock(0, 0));
     prngStr.SetSeed(osuCrypto::toBlock(time(NULL)));
@@ -147,6 +149,9 @@ def cleartext_fp_post(code_list, program, scale, mode, indent):
         f"""
 int main(int argc, char**__argv){'{'}
 
+    std::ios::sync_with_stdio(false);
+    std::cin.tie(NULL);
+    std::cout.tie(NULL);
     prngWeights.SetSeed(osuCrypto::toBlock(0, 0));
     prngStr.SetSeed(osuCrypto::toBlock(time(NULL)));
@@ -193,11 +198,14 @@ def llama_post(code_list, program, scale, mode, bitlength, indent):
         f"""
 int main(int __argc, char**__argv){'{'}
-
+    std::ios::sync_with_stdio(false);
+    std::cin.tie(NULL);
+    std::cout.tie(NULL);
 
     prngWeights.SetSeed(osuCrypto::toBlock(0, 0));
     prngStr.SetSeed(osuCrypto::toBlock(time(NULL)));
 
     int party = atoi(__argv[1]);
+    bool ramdisk_path = false;
     std::string ip = "127.0.0.1";
     int nt=4;
     std::string weights_file = "";
@@ -206,20 +214,29 @@ def llama_post(code_list, program, scale, mode, bitlength, indent):
         weights_file = __argv[2];
     {'}'}
     else if(party == DEALER){'{'}
         if(__argc > 2){'{'}
             nt = atoi(__argv[2]);
         {'}'}
+        if(__argc > 3){'{'}
+            ramdisk_path = (std::string(__argv[3]) == "true");
+        {'}'}
     {'}'}
     else if(party == SERVER){'{'}
         weights_file = __argv[2];
         if(__argc > 3){'{'}
             nt = atoi(__argv[3]);
         {'}'}
+        if(__argc > 4){'{'}
+            ramdisk_path = (std::string(__argv[4]) == "true");
+        {'}'}
     {'}'}
     else if(party == CLIENT){'{'}
         ip = __argv[2];
         if(__argc > 3){'{'}
             nt = atoi(__argv[3]);
         {'}'}
+        if(__argc > 4){'{'}
+            ramdisk_path = (std::string(__argv[4]) == "true");
+        {'}'}
     {'}'}
@@ -247,8 +267,9 @@ def llama_post(code_list, program, scale, mode, bitlength, indent):
     LlamaConfig::stochasticT = true;
     LlamaConfig::stochasticRT = true;
     LlamaConfig::num_threads = nt;
+    LlamaConfig::ramdisk_path = ramdisk_path;
 
-    llama->init(ip, true);
+    llama->init(ip, true, ramdisk_path);
 
     Net net;
     net.init(scale);
diff --git a/OnnxBridge/README.md b/OnnxBridge/README.md
index 6930c92b..63e79c43 100644
--- a/OnnxBridge/README.md
+++ b/OnnxBridge/README.md
@@ -84,12 +84,19 @@ python3 main.py --path "/path/to/onnx-file" --generate "code" --backend LLAMA --scale <scale>
 # compile secure code
 LLAMA/compile_llama.sh "/path/to/file.cpp"
 
+# Enable and mount Ramdisk on client and server machines
+./ramdrive.sh <ramdisk_size>
+
 # generate LLAMA keys on client and server machines
 ./<model> 1
 
 # start inference on server and client machines
-./<model> 2 <path/to/weights_file> // Server
-./<model> 3 <server_ip> < <path/to/input_file> // Client
+./<model> 2 <path/to/weights_file> <num_threads> <use_ramdisk> // Server
+./<model> 3 <server_ip> <num_threads> <use_ramdisk> < <path/to/input_file> // Client
+
+# Disable and unmount Ramdisk on client and server machines
+./unmount_ramdrive.sh
 ```
 
 #### **LLAMA Cleartext**
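Putting the README steps above together, an end-to-end ramdisk-backed run looks roughly as follows. This is a sketch rather than part of the patch: `lenet_LLAMA_15` is the model binary from the toy example below, `lenet_weights.dat` and `input.inp` are placeholder file names, and the argument order (party number, then the party-specific argument, then thread count, then ramdisk flag) follows the `main()` generated by `llama_post` above.

```
# on both machines: a 20 MB tmpfs comfortably holds the two ~9.5 MB LeNet keys
./ramdrive.sh 20m

# dealer (party 1): generate keys with 4 threads, staging them on the ramdisk
./lenet_LLAMA_15 1 4 true

# server (party 2): weights file, 4 threads, ramdisk enabled
./lenet_LLAMA_15 2 lenet_weights.dat 4 true

# client (party 3): server ip, 4 threads, ramdisk enabled; input on stdin
./lenet_LLAMA_15 3 127.0.0.1 4 true < input.inp

# on both machines, once inference is done
./unmount_ramdrive.sh
```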
diff --git a/sytorch/Toy example- single inference.md b/sytorch/Toy example- single inference.md
index ca775147..29c8d668 100644
--- a/sytorch/Toy example- single inference.md
+++ b/sytorch/Toy example- single inference.md
@@ -94,5 +94,33 @@ chmod +x client-offline.sh client-online.sh
 (on client)
 ./client-online.sh
 ```
+8. To use a ramdisk, follow the steps below:
+```
+# Enable and mount a ramdisk on the client and server machines.
+# Choose ramdisk_size >= the combined size of the server and client keys.
+#
+# Example (LeNet):
+#   server key = 9.5 MB, client key = 9.5 MB, total = 19 MB,
+#   so any ramdisk_size >= 19 MB works:
+./ramdrive.sh 20m
+
+# Example (CheXpert):
+#   server key = 87.5 GB, client key = 87.5 GB, total = 175 GB,
+#   so any ramdisk_size >= 175 GB works:
+./ramdrive.sh 200g
+
+# Change the server and client scripts to pass the thread count and the
+# ramdisk flag to the model binary, e.g.
+#   ./lenet_LLAMA_15 1   becomes   ./lenet_LLAMA_15 1 4 true
+# (a sed command to automate this rewrite is still to be added; see the
+# sketch at the end of this patch).
+
+# Disable and unmount the ramdisk on both machines after inference:
+./unmount_ramdrive.sh
+```
 
 In this particular example, you should get a score array of `[-2.71362 1.06747 4.43045 0.795044 -3.21173 -2.39871 -8.49094 10.3443 1.0567 -0.694458]`, which is maximal at index 7; this is expected, as the [input.jpg](https://github.com/kanav99/models/raw/main/input.jpg) file contains an image of a handwritten 7.
diff --git a/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h b/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h
index e4cbd43e..d95b5096 100644
--- a/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h
+++ b/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h
@@ -41,7 +41,7 @@ namespace osuCrypto
             std::lock_guard<std::mutex> l(log.mLock);
             for (u64 i = 0; i < log.mMessages.size(); ++i)
             {
-                o << "[" << i << ", " << log.mMessages[i].first / 1000.0 << "ms ] " << log.mMessages[i].second << std::endl;
+                o << "[" << i << ", " << log.mMessages[i].first / 1000.0 << "ms ] " << log.mMessages[i].second << "\n";
             }
 
             return o;
@@ -219,4 +219,4 @@ namespace osuCrypto
     void setThreadName(const std::string name);
     void setThreadName(const char* name);
-}
+}
\ No newline at end of file
diff --git a/sytorch/ext/llama/api.cpp b/sytorch/ext/llama/api.cpp
index 8030758e..1e4fa2cf 100644
--- a/sytorch/ext/llama/api.cpp
+++ b/sytorch/ext/llama/api.cpp
@@ -1,5 +1,5 @@
 /*
-Authors: Deepak Kumaraswamy, Kanav Gupta
+Authors: Deepak Kumaraswamy, Kanav Gupta, Tanmay Rajore
 Copyright:
 Copyright (c) 2022 Microsoft Research
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -102,8 +102,8 @@ void llama::end()
     std::cerr << "Select/Bit operations Time = " << selectEvalMicroseconds / 1000.0 << " milliseconds\n";
     std::cerr << "Truncate time = " << arsEvalMicroseconds / 1000.0 << " milliseconds\n";
     auto endTime = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    std::cerr << "Total Time (including Key Read) = " << (endTime - startTime) / 1000000.0 << " milliseconds\n";
-    std::cerr << std::endl;
+    std::cerr << "Total Time (including Key Read) = " << double(endTime - startTime) / 1000000.0 << " seconds\n";
+    std::cerr << "\n";
     std::cerr << "Conv Online Communication = " << convOnlineComm << " bytes\n";
     std::cerr << "MatMul Online Communication = " << matmulOnlineComm << " bytes\n";
     std::cerr << "Select Online Communication = " << selectOnlineComm << " bytes\n";
@@ -146,7 +146,7 @@ void reconstructRT(int32_t size, GroupElement *arr, int bw)
 {
     uint64_t *tmp = new uint64_t[size];
     int bitarraySize = size % 8 == 0 ?
size / 8 : size / 8 + 1; - // std::cerr << "bitarraySize = " << bitarraySize << std::endl; + // std::cerr << "bitarraySize = " << bitarraySize << "\n"; uint8_t *tmp2 = new uint8_t[bitarraySize]; uint8_t *tmp3 = new uint8_t[bitarraySize]; // std::cerr << "bits = "; @@ -154,9 +154,9 @@ void reconstructRT(int32_t size, GroupElement *arr, int bw) // { // std::cerr << arr[i + size] << " "; // } - // std::cerr << std::endl; + // std::cerr << "\n"; packBitArray(arr + size, size, tmp2); - // std::cerr << "encoded = " << (int)tmp2[0] << std::endl; + // std::cerr << "encoded = " << (int)tmp2[0] << "\n"; if (parallel_reconstruct) { std::thread send_thread(&Peer::send_batched_input, peer, arr, size, bw); @@ -182,7 +182,7 @@ void reconstructRT(int32_t size, GroupElement *arr, int bw) // std::cerr << ((tmp3[i / 8] >> (i % 8)) & 1) << " "; arr[i + size] = arr[i + size] + ((tmp3[i / 8] >> (i % 8)) & 1); } - // std::cerr << std::endl; + // std::cerr << "\n"; delete[] tmp; numRounds += 1; } @@ -208,7 +208,7 @@ void Conv2DWrapper(int32_t N, int32_t H, int32_t W, int32_t strideW, MASK_PAIR(GroupElement *inputArr), MASK_PAIR(GroupElement *filterArr), MASK_PAIR(GroupElement *outArr)) { - std::cerr << ">> Conv2D - Start" << std::endl; + std::cerr << ">> Conv2D - Start" << "\n"; int d0 = N; int d1 = ((H - FH + (zPadHLeft + zPadHRight)) / strideH) + 1; int d2 = ((W - FW + (zPadWLeft + zPadWRight)) / strideW) + 1; @@ -275,7 +275,7 @@ void Conv2DWrapper(int32_t N, int32_t H, int32_t W, std::cerr << " Online Comm = " << (onlineComm1 - onlineComm0) << " bytes\n"; } - std::cerr << ">> Conv2D - End" << std::endl; + std::cerr << ">> Conv2D - End" << "\n"; } @@ -287,7 +287,7 @@ void Conv3DWrapper(int32_t N, int32_t D, int32_t H, int32_t W, int32_t strideW, GroupElement *inputArr, GroupElement *filterArr, GroupElement *outArr) { - std::cerr << ">> Conv3D - Start" << std::endl; + std::cerr << ">> Conv3D - Start" << "\n"; int d0 = N; int d1 = ((D - FD + (zPadDLeft + zPadDRight)) / strideD) + 1; int d2 = ((H - FH + (zPadHLeft + zPadHRight)) / strideH) + 1; @@ -350,7 +350,7 @@ void Conv3DWrapper(int32_t N, int32_t D, int32_t H, int32_t W, std::cerr << " Online Comm = " << (onlineComm1 - onlineComm0) << " bytes\n"; } - std::cerr << ">> Conv3D - End" << std::endl; + std::cerr << ">> Conv3D - End" << "\n"; } @@ -389,11 +389,14 @@ void ars_threads_helper(int thread_idx, int32_t size, GroupElement *inArr, Group { auto p = get_start_end(size, thread_idx); for(int i = p.first; i < p.second; i += 1){ + outArr[i] = evalARS(party - 2, inArr[i], keys[i].shift, keys[i]); + freeARSKeyPack(keys[i]); } } +//backup code /* auto keyread_start = std::chrono::high_resolution_clock::now(); auto keyread_end = std::chrono::high_resolution_clock::now(); @@ -413,11 +416,11 @@ void ars_threads_helper(int thread_idx, int32_t size, GroupElement *inArr, Group */ void ARS(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), int32_t shift) { - std::cerr << ">> Truncate" << (LlamaConfig::stochasticT ? " (stochastic)" : "") << " - Start" << std::endl; + std::cerr << ">> Truncate" << (LlamaConfig::stochasticT ? 
" (stochastic)" : "") << " - Start" << "\n"; if (party == DEALER) { pair *keys = new pair[size]; auto dealer_start = std::chrono::high_resolution_clock::now(); - #pragma omp parallel for + //#pragma omp parallel for for (int i = 0; i < size; i++) { GroupElement rout = random_ge(bitlength); keys[i] = keyGenARS(bitlength, bitlength, shift, inArr_mask[i], rout); @@ -447,7 +450,11 @@ void ARS(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *o keyread_start).count(); peer->sync(); + + auto start = std::chrono::high_resolution_clock::now(); + + std::thread thread_pool[num_threads]; for(int i = 0; i < num_threads; ++i) { thread_pool[i] = std::thread(ars_threads_helper, i, size, inArr, outArr, keys); @@ -475,12 +482,12 @@ void ARS(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *o arsEvalMicroseconds += (reconstruct_time + compute_time); delete[] keys; } - std::cerr << ">> Truncate - End" << std::endl; + std::cerr << ">> Truncate - End" << "\n"; } void ScaleDown(int32_t size, MASK_PAIR(GroupElement *inArr), int32_t sf) { - std::cerr << ">> ScaleDown - Start " << std::endl; + std::cerr << ">> ScaleDown - Start " << "\n"; if (localTruncation) { uint64_t m = ((1L << sf) - 1) << (bitlength - sf); @@ -511,7 +518,7 @@ void ScaleDown(int32_t size, MASK_PAIR(GroupElement *inArr), int32_t sf) else { ARS(size, inArr, inArr_mask, inArr, inArr_mask, sf); } - std::cerr << ">> ScaleDown - End " << std::endl; + std::cerr << ">> ScaleDown - End " << "\n"; } inline void matmul2d_server_helper(int thread_idx, int s1, int s2, int s3, GroupElement *A, GroupElement *B, GroupElement *C, GroupElement *a, GroupElement *b, GroupElement *c) @@ -549,7 +556,7 @@ inline void matmul2d_client_helper(int thread_idx, int s1, int s2, int s3, Group void MatMul2D(int32_t s1, int32_t s2, int32_t s3, MASK_PAIR(GroupElement *A), MASK_PAIR(GroupElement *B), MASK_PAIR(GroupElement *C), bool modelIsA) { - std::cerr << ">> MatMul2D - Start" << std::endl; + std::cerr << ">> MatMul2D - Start" << "\n"; if (party == DEALER) { auto dealer_start = std::chrono::high_resolution_clock::now(); @@ -567,7 +574,7 @@ void MatMul2D(int32_t s1, int32_t s2, int32_t s3, MASK_PAIR(GroupElement *A), client->send_matmul_key(keys.second); freeMatMulKey(keys.second); dealerMicroseconds += std::chrono::duration_cast(dealer_end - dealer_start).count(); - std::cerr << " Dealer Time = " << std::chrono::duration_cast(dealer_end - dealer_start).count() << " milliseconds" << std::endl; + std::cerr << " Dealer Time = " << std::chrono::duration_cast(dealer_end - dealer_start).count() << " milliseconds" << "\n"; } else { @@ -602,7 +609,7 @@ void MatMul2D(int32_t s1, int32_t s2, int32_t s3, MASK_PAIR(GroupElement *A), freeMatMulKey(key); } - std::cerr << ">> MatMul2D - End" << std::endl; + std::cerr << ">> MatMul2D - End" << "\n"; } void ElemWiseActModelVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr), @@ -616,7 +623,7 @@ void ArgMax(int32_t rows, int32_t cols, MASK_PAIR(GroupElement *inp), MASK_PAIR( // inp is a vector of size rows*columns and max (resp. maxidx) is caclulated for every // column chunk of elements. 
Result maxidx is stored in out (size: rows) - std::cerr << ">> ArgMax - Start" << std::endl; + std::cerr << ">> ArgMax - Start" << "\n"; always_assert(rows == 1); if (party == DEALER) { @@ -775,14 +782,14 @@ void ArgMax(int32_t rows, int32_t cols, MASK_PAIR(GroupElement *inp), MASK_PAIR( auto eval_time = std::chrono::duration_cast(end - start).count(); argmaxEvalMicroseconds += eval_time; evalMicroseconds += eval_time; - std::cerr << " Eval time: " << eval_time / 1000.0 << " milliseconds" << std::endl; + std::cerr << " Eval time: " << eval_time / 1000.0 << " milliseconds" << "\n"; delete[] tmpMax; delete[] tmpIdx; delete[] drelu; delete[] mult_res; } - std::cerr << ">> ArgMax - End" << std::endl; + std::cerr << ">> ArgMax - End" << "\n"; } void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, @@ -792,7 +799,7 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, int32_t C1, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr)) { // taken from the equivalent function in Porthos/src/EzPCFunctionalities.cpp - std::cerr << ">> AvgPool - Start" << std::endl; + std::cerr << ">> AvgPool - Start" << "\n"; int rows = N*H*W*C; std::vector filterAvg(rows, 0); std::vector filterAvg_mask(rows, 0); @@ -846,12 +853,12 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, auto common_time = std::chrono::duration_cast(common_end - common_start).count(); if (party == DEALER) { dealerMicroseconds += common_time; - std::cerr << " Dealer Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Dealer Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << "\n"; } else { avgpoolEvalMicroseconds += common_time; evalMicroseconds += common_time; - std::cerr << " Eval Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Eval Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << "\n"; } @@ -879,7 +886,7 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, // todo: the divisor ksizeH * ksizeW is 32 bits long when passed as param, but ezpc cleartext explicitly converts to 64 bit value // will this be an issue in the future? 
// ElemWiseVectorPublicDiv(rows, filterAvg.data(), filterAvg_mask.data(), ksizeH * ksizeW, outp.data(), outp_mask.data()); - std::cerr << "Error Error Error" << std::endl; + std::cerr << "Error Error Error" << "\n"; exit(1); } @@ -898,7 +905,7 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, } } } - std::cerr << ">> AvgPool - End" << std::endl; + std::cerr << ">> AvgPool - End" << "\n"; } @@ -915,12 +922,12 @@ void mult_threads_helper(int thread_idx, int32_t size, GroupElement *inArr, Grou void ElemWiseSecretSharedVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *multArrVec), MASK_PAIR(GroupElement *outputArr)) { - std::cerr << ">> ElemWise Mult - start" << std::endl; + std::cerr << ">> ElemWise Mult - start" << "\n"; if (party == DEALER) { uint64_t dealer_toal_time = 0; pair *keys = new pair[size]; - #pragma omp parallel for + //#pragma omp parallel for for(int i = 0; i < size; ++i) { auto dealer_start = std::chrono::high_resolution_clock::now(); auto rout = random_ge(bitlength); @@ -971,7 +978,7 @@ void ElemWiseSecretSharedVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr) delete[] keys; } - std::cerr << ">> ElemWise Mult - end" << std::endl; + std::cerr << ">> ElemWise Mult - end" << "\n"; } void maxpool_threads_helper(int thread_idx, int fh, int fw, int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, @@ -1017,7 +1024,7 @@ void MaxPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32_t strideW, int32_t N1, int32_t imgH, int32_t imgW, int32_t C1, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), GroupElement *oneHot) { - std::cerr << ">> MaxPool - Start" << std::endl; + std::cerr << ">> MaxPool - Start" << "\n"; int d1 = ((imgH - FH + (zPadHLeft + zPadHRight)) / strideH) + 1; int d2 = ((imgW - FW + (zPadWLeft + zPadWRight)) / strideW) + 1; always_assert(d1 == H); @@ -1083,7 +1090,7 @@ void MaxPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, auto dealer_end = std::chrono::high_resolution_clock::now(); auto dealer_time = std::chrono::duration_cast(dealer_end - dealer_start).count() - dealer_file_read_time; dealerMicroseconds += dealer_time; - std::cerr << " Dealer time: " << dealer_time / 1000.0 << " milliseconds" << std::endl; + std::cerr << " Dealer time: " << dealer_time / 1000.0 << " milliseconds" << "\n"; } else { MaxpoolKeyPack *keys = new MaxpoolKeyPack[(FH * FW - 1) * N * C * H * W]; @@ -1175,14 +1182,14 @@ void MaxPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, evalMicroseconds += eval_time; maxpoolEvalMicroseconds += eval_time; delete[] keys; - std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " milliseconds" << std::endl; - std::cerr << " Key Size = " << keysize / (1024.0 * 1024.0) << " MB" << std::endl; - std::cerr << " Compute Time = " << timeCompute / 1000.0 << " milliseconds" << std::endl; - std::cerr << " Reconstruct Time = " << timeReconstruct / 1000.0 << " milliseconds" << std::endl; - std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " milliseconds" << "\n"; + std::cerr << " Key Size = " << keysize / (1024.0 * 1024.0) << " MB" << "\n"; + std::cerr << " Compute Time = " << timeCompute / 1000.0 << " milliseconds" << "\n"; + std::cerr << " Reconstruct Time = " << timeReconstruct / 1000.0 << " milliseconds" << "\n"; + std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << "\n"; } - std::cerr << ">> MaxPool - End" << std::endl; + 
std::cerr << ">> MaxPool - End" << "\n"; } @@ -1210,13 +1217,13 @@ void relu_dealer_threads_helper(int thread_idx, int32_t size, GroupElement *inAr void Relu(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), GroupElement *drelu) { - std::cerr << ">> Relu (Spline) - Start" << std::endl; + std::cerr << ">> Relu (Spline) - Start" << "\n"; // todo: handle doTruncation param if (party == DEALER) { uint64_t dealer_total_time = 0; std::pair *keys = new std::pair[size]; auto start = std::chrono::high_resolution_clock::now(); - #pragma omp parallel for + //#pragma omp parallel for for(int i = 0; i < size; i += 1){ auto rout = random_ge(bitlength); // prng inside multithreads, need some locking drelu[i] = random_ge(1); @@ -1233,7 +1240,7 @@ void Relu(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement * } delete[] keys; dealerMicroseconds += dealer_total_time; - std::cerr << " Dealer time = " << dealer_total_time / 1000.0 << " milliseconds" << std::endl; + std::cerr << " Dealer time = " << dealer_total_time / 1000.0 << " milliseconds" << "\n"; } else { // Step 1: Preprocessing Keys from Dealer @@ -1283,7 +1290,7 @@ void Relu(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement * reluEvalMicroseconds += (reconstruct_time + compute_time); delete[] keys; } - std::cerr << ">> Relu (Spline) - End " << std::endl; + std::cerr << ">> Relu (Spline) - End " << "\n"; } #define BIG_LOOPY(e) for(int n = 0; n < N; ++n) {\ @@ -1326,7 +1333,7 @@ void maxpool_onehot_threads_helper(int thread_idx, int f, int32_t N, int32_t H, // This is compatible with both MaxPool and MaxPoolDouble void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32_t FW, GroupElement *maxBits, GroupElement *oneHot) { - std::cerr << ">> MaxPoolOneHot - Start" << std::endl; + std::cerr << ">> MaxPoolOneHot - Start" << "\n"; GroupElement *curr = make_array(N * H * W * C); if (party == DEALER) { BIG_LOOPY( @@ -1383,7 +1390,7 @@ void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32 Arr5DIdx(oneHot, FH * FW, N, H, W, C, f, n, h, w, c) = evalAnd(party - 2, max, 1 ^ c1, key); mod(Arr5DIdx(oneHot, FH * FW, N, H, W, C, f, n, h, w, c), 1); ) - + /* testing to be done // std::thread thread_pool[num_threads]; // for(int i = 0; i < num_threads; ++i) { // thread_pool[i] = std::thread(maxpool_onehot_threads_helper, i, f, N, H, W, C, FH, FW, maxBits, curr, oneHot, keys); @@ -1392,7 +1399,7 @@ void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32 // for(int i = 0; i < num_threads; ++i) { // thread_pool[i].join(); // } - + */ reconstruct(N * H * W * C, oneHot + f * N * H * W * C, 1); BIG_LOOPY( @@ -1407,12 +1414,12 @@ void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32 auto eval_time = std::chrono::duration_cast(end - start).count(); evalMicroseconds += eval_time; selectEvalMicroseconds += eval_time; - std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " miliseconds" << std::endl; - std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " miliseconds" << "\n"; + std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << "\n"; delete[] keys; } delete[] curr; - std::cerr << ">> MaxPoolOneHot - End" << std::endl; + std::cerr << ">> MaxPoolOneHot - End" << "\n"; } void ConvTranspose3DWrapper(int64_t N, @@ -1440,7 +1447,7 @@ void ConvTranspose3DWrapper(int64_t N, GroupElement* 
filterArr, GroupElement* outArr) { - std::cerr << ">> ConvTranspose3D - Start" << std::endl; + std::cerr << ">> ConvTranspose3D - Start" << "\n"; always_assert(outD == (D - 1) * strideD - zPadDLeft - zPadDRight + FD); always_assert(outH == (H - 1) * strideH - zPadHLeft - zPadHRight + FH); always_assert(outW == (W - 1) * strideW - zPadWLeft - zPadWRight + FW); @@ -1501,6 +1508,6 @@ void ConvTranspose3DWrapper(int64_t N, std::cerr << " Online Comm = " << (onlineComm1 - onlineComm0) << " bytes\n"; } - std::cerr << ">> ConvTranspose3D - End" << std::endl; + std::cerr << ">> ConvTranspose3D - End" << "\n"; -} +} \ No newline at end of file diff --git a/sytorch/ext/llama/include/llama/comms.h b/sytorch/ext/llama/include/llama/comms.h index 507e0fd7..929889cc 100644 --- a/sytorch/ext/llama/include/llama/comms.h +++ b/sytorch/ext/llama/include/llama/comms.h @@ -51,6 +51,7 @@ class Peer { std::fstream file; uint64_t bytesSent = 0; uint64_t bytesReceived = 0; + Peer(std::string ip, int port); Peer(int sendsocket, int recvsocket) { @@ -148,29 +149,32 @@ Peer* waitForPeer(int port); class Dealer { public: int consocket; - bool useFile = false; + bool useFile = true; std::fstream file; uint64_t bytesSent = 0; uint64_t bytesReceived = 0; - bool ramdisk = false; + bool ramdisk =true; char *ramdiskBuffer; char *ramdiskStart; int ramdiskSize; + bool ramdisk_path = false; Dealer(std::string ip, int port); - Dealer(std::string filename, bool ramdisk) { + Dealer(std::string filename, bool ramdisk,bool ramdisk_path) { this->useFile = true; this->ramdisk = ramdisk; - if (ramdisk) { + this->ramdisk_path = ramdisk_path; + if (ramdisk && ramdisk_path) { int fd = open(filename.c_str(), O_RDWR | O_CREAT, 0); struct stat sb; fstat(fd, &sb); - std::cerr << "Key Size: " << sb.st_size << " bytes" << std::endl; + std::cerr << "Key Size: " << sb.st_size << " bytes" << "\n"; + int advise=posix_fadvise(fd, 0, sb.st_size, POSIX_FADV_WILLNEED); ramdiskSize = sb.st_size; - ramdiskBuffer = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + ramdiskBuffer = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); ramdiskStart = ramdiskBuffer; - // std::cout << "RAMDISK: " << (int *)ramdiskBuffer << std::endl; + std::cout << "RAMDISK: " << (int *)ramdiskBuffer << "\n"; ::close(fd); } else { @@ -257,4 +261,3 @@ class Dealer { TripleKeyPack recv_triple_key(int bw, int64_t na, int64_t nb, int64_t nc); }; - diff --git a/sytorch/ext/llama/include/llama/config.h b/sytorch/ext/llama/include/llama/config.h index cfd2c34e..d28ce116 100644 --- a/sytorch/ext/llama/include/llama/config.h +++ b/sytorch/ext/llama/include/llama/config.h @@ -14,4 +14,5 @@ namespace LlamaConfig { extern int port; extern bool stochasticRT; extern bool stochasticT; + extern bool ramdisk_path; } diff --git a/sytorch/ext/llama/pubdiv.cpp b/sytorch/ext/llama/pubdiv.cpp index baf63aa5..1279d528 100644 --- a/sytorch/ext/llama/pubdiv.cpp +++ b/sytorch/ext/llama/pubdiv.cpp @@ -75,7 +75,7 @@ std::pair keyGenARS(int Bin, int Bout, uint64_t shift, G uint64_t ones = ((uint64_t)1 << shift) - 1; GroupElement alpha_s = y & ones; - // std::cout << "keygen alpha_n (dualdcf alpha) " << alpha_n << " alpha_s (dcf alpha)" << alpha_s << std::endl; + // std::cout << "keygen alpha_n (dualdcf alpha) " << alpha_n << " alpha_s (dcf alpha)" << alpha_s << "\n"; if (!LlamaConfig::stochasticT) { auto dcfKeys = keyGenDCF(shift, Bout, alpha_s, 1); @@ -113,7 +113,7 @@ GroupElement evalARS(int party, GroupElement x, uint64_t shift, const ARSKeyPack uint8_t x_msb = msb(x, k.Bin); // 
todo: bitsize of x_n should have been k.Bin - 1 uint64_t x_n = x & (((uint64_t)1 << (k.Bin - 1)) - 1); - // std::cout << "x_n " << x_n << std::endl; + // std::cout << "x_n " << x_n << "\n"; GroupElement dcfIdx = ((uint64_t)1 << shift) - x_s - 1; // GroupElement t_s = evalDCF(party, dcfIdx, k.dcfKey); @@ -138,4 +138,4 @@ GroupElement evalARS(int party, GroupElement x, uint64_t shift, const ARSKeyPack } return res; -} +} \ No newline at end of file diff --git a/sytorch/ext/llama/src/llama/comms.cpp b/sytorch/ext/llama/src/llama/comms.cpp index 3183696c..89361f6c 100644 --- a/sytorch/ext/llama/src/llama/comms.cpp +++ b/sytorch/ext/llama/src/llama/comms.cpp @@ -66,7 +66,7 @@ Peer::Peer(std::string ip, int port) { const int one = 1; setsockopt(sendsocket, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)); } - std::cerr << "connected" << std::endl; + std::cerr << "connected" << "\n"; } @@ -136,7 +136,7 @@ Peer* waitForPeer(int port) { close(mysocket); } - std::cerr << "connected" << std::endl; + std::cerr << "connected" << "\n"; return new Peer(sendsocket, recvsocket); } @@ -742,11 +742,11 @@ Dealer::Dealer(std::string ip, int port) { void Dealer::close() { if (useFile) { - if (!ramdisk) { + if (!ramdisk && ramdisk_path) { file.close(); } else { - // std::cout << (int)(ramdiskBuffer - ramdiskStart) << "bytes read" << std::endl; + // std::cout << (int)(ramdiskBuffer - ramdiskStart) << "bytes read" << "\n"; // always_assert(ramdiskBuffer - ramdiskStart == ramdiskSize); } } @@ -758,15 +758,18 @@ void Dealer::close() { GroupElement Dealer::recv_mask() { char buf[8]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { + std::cout<<"ramdiskBuffer: "<<(uint64_t)ramdiskBuffer<<"\n"; GroupElement g = *(uint64_t *)ramdiskBuffer; ramdiskBuffer += 8; bytesReceived += 8; return g; } this->file.read(buf, 8); + //std::cout << "dealer recv mask" << "\n"; } else { - recv(consocket, buf, 8, MSG_WAITALL); + // recv(consocket, buf, 8, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g = *(uint64_t *)buf; bytesReceived += 8; @@ -776,15 +779,18 @@ GroupElement Dealer::recv_mask() { MultKey Dealer::recv_mult_key() { char buf[sizeof(MultKey)]; if (useFile) { - if (ramdisk) { - MultKey k(*(MultKey *)ramdiskBuffer); + if (ramdisk && ramdisk_path) { + std::cout<<"ramdiskBuffer ,multikey: "<<(uint64_t)ramdiskBuffer<<"\n"; + MultKey k=(*(MultKey *)ramdiskBuffer); ramdiskBuffer += sizeof(MultKey); bytesReceived += sizeof(MultKey); return k; } this->file.read(buf, sizeof(MultKey)); + //std::cout<< "dealer recv mult key" << "\n"; } else { - recv(consocket, buf, sizeof(MultKey), MSG_WAITALL); + //recv(consocket, buf, sizeof(MultKey), MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } MultKey k(*(MultKey *)buf); bytesReceived += sizeof(MultKey); @@ -794,8 +800,8 @@ MultKey Dealer::recv_mult_key() { osuCrypto::block Dealer::recv_block() { char buf[sizeof(osuCrypto::block)]; if (useFile) { - if (ramdisk) { - // std::cout << *(uint64_t *) ramdiskBuffer << std::endl; + if (ramdisk && ramdisk_path) { + std::cout << *(uint64_t *) ramdiskBuffer << "\n"; // Kanav: This could break when the endianness of the machine changes osuCrypto::block b = osuCrypto::toBlock(*(uint64_t *) (ramdiskBuffer + 8), *(uint64_t *) ramdiskBuffer); ramdiskBuffer += sizeof(osuCrypto::block); @@ -803,8 +809,10 @@ osuCrypto::block Dealer::recv_block() { return b; } this->file.read(buf, sizeof(osuCrypto::block)); + //std::cout<< "dealer recv block" << "\n"; } else { - recv(consocket, buf, sizeof(osuCrypto::block), 
MSG_WAITALL); + // recv(consocket, buf, sizeof(osuCrypto::block), MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } osuCrypto::block b = *(osuCrypto::block *)buf; bytesReceived += sizeof(osuCrypto::block); @@ -815,7 +823,7 @@ GroupElement Dealer::recv_ge(int bl) { if (bl > 32) { char buf[8]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint64_t *)ramdiskBuffer; ramdiskBuffer += 8; bytesReceived += 8; @@ -823,8 +831,10 @@ GroupElement Dealer::recv_ge(int bl) { return g; } this->file.read(buf, 8); + //std::cerr << "dealer recv ge 32" << "\n"; } else { - recv(consocket, buf, 8, MSG_WAITALL); + // recv(consocket, buf, 8, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint64_t *)buf); mod(g, bl); @@ -834,7 +844,7 @@ GroupElement Dealer::recv_ge(int bl) { else if (bl > 16) { char buf[4]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint32_t *)ramdiskBuffer; ramdiskBuffer += 4; bytesReceived += 4; @@ -842,8 +852,10 @@ GroupElement Dealer::recv_ge(int bl) { return g; } this->file.read(buf, 4); + //std::cout << "dealer recv ge 16" << "\n"; } else { - recv(consocket, buf, 4, MSG_WAITALL); + // recv(consocket, buf, 4, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint32_t *)buf); mod(g, bl); @@ -853,7 +865,7 @@ GroupElement Dealer::recv_ge(int bl) { else if (bl > 8) { char buf[2]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint16_t *)ramdiskBuffer; ramdiskBuffer += 2; bytesReceived += 2; @@ -861,8 +873,10 @@ GroupElement Dealer::recv_ge(int bl) { return g; } this->file.read(buf, 2); + //std::cout<< "dealer recv ge 8" << "\n"; } else { - recv(consocket, buf, 2, MSG_WAITALL); + //recv(consocket, buf, 2, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint16_t *)buf); mod(g, bl); @@ -872,16 +886,18 @@ GroupElement Dealer::recv_ge(int bl) { else { char buf[1]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint8_t *)ramdiskBuffer; ramdiskBuffer += 1; bytesReceived += 1; mod(g, bl); return g; } - this->file.read(buf, 1); + this->file.read(buf, 1); + //std::cout << "dealer recv ge 1" << "\n"; } else { - recv(consocket, buf, 1, MSG_WAITALL); + // recv(consocket, buf, 1, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint8_t *)buf); mod(g, bl); @@ -894,15 +910,17 @@ GroupElement Dealer::recv_ge(int bl) { void Dealer::recv_ge_array(const GroupElement *g, int size) { char *buf = (char *)g; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { memcpy(buf, ramdiskBuffer, 8*size); ramdiskBuffer += 8*size; bytesReceived += 8*size; return; } this->file.read(buf, 8*size); + //std::cout << "dealer recv ge array" << "\n"; } else { - recv(consocket, buf, 8*size, MSG_WAITALL); + // recv(consocket, buf, 8*size, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } bytesReceived += 8 * size; @@ -914,7 +932,7 @@ DCFKeyPack Dealer::recv_dcf_keypack(int Bin, int Bout, int groupSize) { kp.Bout = Bout; kp.groupSize = groupSize; - if (ramdisk) { + if (ramdisk && ramdisk_path) { kp.k = (osuCrypto::block *)ramdiskBuffer; ramdiskBuffer += sizeof(osuCrypto::block) * (Bin + 1); } else { @@ -1136,7 +1154,7 @@ ReluKeyPack Dealer::recv_relu_key(int Bin, int Bout) { kp.Bout = Bout; kp.g = new GroupElement[groupSize]; // kp.dcfKey = recv_dcf_keypack(Bin, Bout, groupSize); - if (ramdisk) { + if (ramdisk && ramdisk_path) { kp.k = (osuCrypto::block 
*)ramdiskBuffer; ramdiskBuffer += sizeof(osuCrypto::block) * (Bin + 1); } else { @@ -1148,7 +1166,7 @@ ReluKeyPack Dealer::recv_relu_key(int Bin, int Bout) { for(int i = 0; i < groupSize; ++i) { kp.g[i] = recv_ge(Bout); } - if (ramdisk && (Bin > 32)) { + if (ramdisk && ramdisk_path && (Bin > 32)) { kp.v = (GroupElement *)ramdiskBuffer; ramdiskBuffer += sizeof(GroupElement) * (Bin * groupSize); } diff --git a/sytorch/ext/llama/src/llama/config.cpp b/sytorch/ext/llama/src/llama/config.cpp index b03208a1..684762f7 100644 --- a/sytorch/ext/llama/src/llama/config.cpp +++ b/sytorch/ext/llama/src/llama/config.cpp @@ -11,4 +11,5 @@ namespace LlamaConfig { int port = 42069; bool stochasticRT = false; bool stochasticT = false; + bool ramdisk_path = false; } diff --git a/sytorch/ext/llama/src/llama/utils.cpp b/sytorch/ext/llama/src/llama/utils.cpp index 84c237fc..96ef0ddd 100644 --- a/sytorch/ext/llama/src/llama/utils.cpp +++ b/sytorch/ext/llama/src/llama/utils.cpp @@ -137,7 +137,7 @@ void Conv2DReshapeInput(size_t N, size_t H, size_t W, size_t CI, size_t FH, size size_t curPosW = leftTopCornerW + fw; for (size_t ci = 0; ci < CI; ci++){ size_t rowidx = (fh*FW*CI) + (fw*CI) + ci; - // std::cout << rowidx << std::endl; + // std::cout << rowidx << "\n"; if ((((curPosH < 0) || (curPosH >= H)) || ((curPosW < 0) || (curPosW >= W)))){ Arr2DIdx(outputArr, RRows, RCols, rowidx, linIdxFilterMult) = 0L; } @@ -385,7 +385,7 @@ void Conv3DReshapeInput(size_t N, size_t D, size_t H, size_t W, size_t CI, size_ size_t curPosW = leftTopCornerW + fw; for (size_t ci = 0; ci < CI; ci++){ size_t rowidx = (fd*FH*FW*CI) + (fh*FW*CI) + (fw*CI) + ci; - // std::cout << rowidx << std::endl; + // std::cout << rowidx << "\n"; if ((((curPosD < 0) || (curPosD >= D)) || ((curPosH < 0) || (curPosH >= H)) || ((curPosW < 0) || (curPosW >= W)))){ Arr2DIdx(outputArr, RRows, RCols, rowidx, linIdxFilterMult) = 0L; } @@ -596,7 +596,7 @@ void ConvTranspose3DLoopInnerClear( zPadWLeft = FW - 1 - zPadWLeft; zPadWRight = FW - 1 - zPadWRight; - #pragma omp parallel for collapse(5) + //#pragma omp parallel for collapse(5) for (int64_t n = 0; n < N; n++){ for (int64_t d = 0; d < outD; d++){ for (int64_t h = 0; h < outH; h++){ @@ -633,10 +633,10 @@ void ConvTranspose3DLoopInnerClear( } } Arr5DIdx(outArr, N, outD, outH, outW, CO, n, d, h, w, co) = val; - // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << std::endl; + // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << "\n"; } } } } } -} +} \ No newline at end of file diff --git a/sytorch/include/sytorch/backend/cleartext.h b/sytorch/include/sytorch/backend/cleartext.h index 277021df..7133a47b 100644 --- a/sytorch/include/sytorch/backend/cleartext.h +++ b/sytorch/include/sytorch/backend/cleartext.h @@ -16,7 +16,7 @@ class ClearText : public Backend { template void fastfor(u64 size, Functor f) { - #pragma omp parallel for + //#pragma omp parallel for for (u64 i = 0; i < size; i++) { f(i); } diff --git a/sytorch/include/sytorch/backend/float.h b/sytorch/include/sytorch/backend/float.h index 9779bb10..0673ac8d 100644 --- a/sytorch/include/sytorch/backend/float.h +++ b/sytorch/include/sytorch/backend/float.h @@ -13,7 +13,7 @@ class FloatClearText : public Backend template void fastfor(u64 size, Functor f) { -#pragma omp parallel for +//#pragma omp parallel for for (u64 i = 0; i < size; i++) { f(i); diff --git a/sytorch/include/sytorch/backend/llama_base.h b/sytorch/include/sytorch/backend/llama_base.h 
index 8fb9c688..69ef5cb5 100644 --- a/sytorch/include/sytorch/backend/llama_base.h +++ b/sytorch/include/sytorch/backend/llama_base.h @@ -16,23 +16,46 @@ class LlamaBase : public Backend { public: const bool useLocalTruncation = false; - void init(std::string ip, bool ramdisk = false) + void init(std::string ip, bool ramdisk = true,bool ramdisk_path=false) { u64 seedKey = 0xdeadbeefbadc0ffe; for(int i = 0; i < 256; ++i) { LlamaConfig::prngs[i].SetSeed(osuCrypto::toBlock(i, seedKey)); } if (LlamaConfig::party == 1) { + std::cerr< { void ss2m(T *data, u64 size) { - std::cerr << ">> SS2M - Start" << std::endl; + std::cerr << ">> SS2M - Start" << "\n"; if (LlamaConfig::party == 1) { for (int i = 0; i < size; i++){ data[i] = random_ge(64); @@ -191,7 +214,7 @@ class LlamaBase : public Backend { } reconstruct(size, data, 64); } - std::cerr << ">> SS2M - End" << std::endl; + std::cerr << ">> SS2M - End" << "\n"; } void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c) { diff --git a/sytorch/include/sytorch/backend/llama_extended.h b/sytorch/include/sytorch/backend/llama_extended.h index 3e9c9b8f..a078899c 100644 --- a/sytorch/include/sytorch/backend/llama_extended.h +++ b/sytorch/include/sytorch/backend/llama_extended.h @@ -168,7 +168,7 @@ class LlamaExtended : public LlamaBase { { if (node->layer->doTruncationForward) { if (node->children.size() == 1) { - // std::cout << "yeah.." << std::endl; + // std::cout << "yeah.." << "\n"; LayerGraphNode *child = node->children[0]; if (child->layer->doTruncationForward) { // no optimization possible diff --git a/sytorch/include/sytorch/graph.h b/sytorch/include/sytorch/graph.h index 471b22fd..2fe954a2 100644 --- a/sytorch/include/sytorch/graph.h +++ b/sytorch/include/sytorch/graph.h @@ -62,7 +62,7 @@ template void print_dot_graph(LayerGraphNode *root) { std::ofstream dotfile("graph.dot"); - dotfile << "digraph G {" << std::endl; + dotfile << "digraph G {" << "\n"; topologicalApply(root, [&dotfile](LayerGraphNode *node, LayerGraphNode *_root) { if (node->layer != nullptr) { @@ -77,13 +77,13 @@ void print_dot_graph(LayerGraphNode *root) } label += "(" + args + ")"; } - dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " [label=\"" << label << "\"" + (node->mark ? std::string(" color=\"red\"") : std::string("")) + "];" << std::endl; + dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " [label=\"" << label << "\"" + (node->mark ? 
std::string(" color=\"red\"") : std::string("")) + "];" << "\n"; for (auto &child : node->children) { - dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " -> " << child->layer->name + std::to_string((uint64_t)(child->layer)) << ";" << std::endl; + dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " -> " << child->layer->name + std::to_string((uint64_t)(child->layer)) << ";" << "\n"; } } }); - dotfile << "}" << std::endl; + dotfile << "}" << "\n"; dotfile.close(); -} +} \ No newline at end of file diff --git a/sytorch/include/sytorch/layers/layers.h b/sytorch/include/sytorch/layers/layers.h index cff45662..54ec8600 100644 --- a/sytorch/include/sytorch/layers/layers.h +++ b/sytorch/include/sytorch/layers/layers.h @@ -752,7 +752,7 @@ class Concat: public Layer { sz += t->size(); } - #pragma omp parallel for + //#pragma omp parallel for for(int i = 0; i < sz; ++i) { u64 l = i % outchannels; @@ -884,7 +884,7 @@ class Split: public Layer { u64 split_size = a.shape.back() / n_splits; // 3 u64 rest_size = a.size() / a.shape.back(); // 2 - #pragma omp parallel for + //#pragma omp parallel for for(u64 i = 0; i < a.size(); ++i) { u64 p = i / a.shape.back(); u64 q = i % a.shape.back(); @@ -948,7 +948,7 @@ class Transpose: public Layer { void _forward(Tensor &a) { always_assert(a.shape.size() == 2); - #pragma omp parallel for collapse(2) + //#pragma omp parallel for collapse(2) for (u64 i = 0; i < a.shape[0]; ++i) { for (u64 j = 0; j < a.shape[1]; ++j) { this->activation.data[j * a.shape[0] + i] = a.data[i * a.shape[1] + j]; diff --git a/sytorch/include/sytorch/module.h b/sytorch/include/sytorch/module.h index bce4091d..f0654b19 100644 --- a/sytorch/include/sytorch/module.h +++ b/sytorch/include/sytorch/module.h @@ -3,7 +3,10 @@ #include #include #include -#include +#include +#include +#include +#include template class SytorchModule { @@ -54,7 +57,7 @@ class SytorchModule { } id = id + "|" + paramstring(args...); if (functionalLayerMap.find(id) == functionalLayerMap.end()) { - std::cerr << "Layer not found = \"" << id << "\"" << std::endl; + std::cerr << "Layer not found = \"" << id << "\"" << "\n"; exit(1); } return functionalLayerMap[id]; @@ -148,14 +151,26 @@ class SytorchModule { always_assert(size_in_bytes % 4 == 0); // as it's float size_t numParameters = size_in_bytes / 4; float *floatWeights = new float[numParameters]; + //float *buffer; + int buffersize = 0; - std::ifstream file(weightsFile, std::ios::binary); - file.read((char*) floatWeights, size_in_bytes); - file.close(); + // std::ifstream file(weightsFile, std::ios::binary); + // file.read((char*) floatWeights, size_in_bytes); + // file.close(); + int fd1 = open(weightsFile.c_str(), O_RDWR | O_CREAT, 0); + struct stat sb; + fstat(fd1, &sb); + buffersize = sb.st_size; + int advise=posix_fadvise(fd1, 0, sb.st_size, POSIX_FADV_WILLNEED); + floatWeights= (float*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd1, 0); + //floatWeights = buffer; + std::cerr << "Model Weights Size: " << sb.st_size << " bytes" << "\n"; + ::close(fd1); u64 scale = this->scale; size_t wIdx = 0; for (auto &node: allNodesInExecutionOrder) { + auto layer = node->layer; if (layer->name == "BatchNormInference") { auto bn = (BatchNormInference*) layer; @@ -177,12 +192,13 @@ class SytorchModule { } wIdx += weights.size; + auto bias = layer->getbias(); if (layer->useBias) { for (u64 j = 0; j < bias.size; ++j) { - bias.data[j] = type_cast(floatWeights[wIdx + j] * (float)(1LL << (2 * scale))); + bias.data[j] = 
type_cast(floatWeights[wIdx + j] * (float)(1LL << (2*scale))); } wIdx += bias.size; @@ -192,9 +208,12 @@ class SytorchModule { } } } - + always_assert(wIdx == numParameters); - delete[] floatWeights; + + //delete floatWeights; + munmap(floatWeights, buffersize); + } void dumpi64(const std::string weightsFile) @@ -391,4 +410,4 @@ class SytorchModule { }; template -std::map *> SytorchModule::functionalLayerMap = std::map *>(); +std::map *> SytorchModule::functionalLayerMap = std::map *>(); \ No newline at end of file diff --git a/sytorch/include/sytorch/tensor.h b/sytorch/include/sytorch/tensor.h index 389651d1..87bca667 100644 --- a/sytorch/include/sytorch/tensor.h +++ b/sytorch/include/sytorch/tensor.h @@ -8,7 +8,11 @@ #include #include #include - +#include +#include +#include +#include +#include typedef uint64_t u64; typedef uint8_t u8; typedef int64_t i64; @@ -163,7 +167,7 @@ class Tensor { void copy(const Tensor &other, bool copyGraph = true) { assert_same_shape(other); // memcpy(data, other.data, size() * sizeof(T)); - #pragma omp parallel for + //#pragma omp parallel for for(u64 i = 0; i < size(); ++i) { data[i] = other.data[i]; @@ -223,12 +227,12 @@ class Tensor { { std::cout << this->shape[i] << ", "; } - std::cout << ")" << std::endl; + std::cout << ")" << "\n"; for (u64 i = 0; i < size(); i++) { std::cout << data[i] << " "; } - std::cout << std::endl; + std::cout << "\n"; } void printshape() { @@ -236,7 +240,7 @@ class Tensor { for(int i = 0; i < this->shape.size(); i++) { std::cout << this->shape[i] << ", "; } - std::cout << ")" << std::endl; + std::cout << ")" << "\n"; } T multidir_broadcast_value(const std::vector &broadcast_shape, const std::vector &idx) const @@ -296,14 +300,23 @@ class Tensor { size_t size_in_bytes = std::filesystem::file_size(filename); always_assert(size_in_bytes == size() * 4); float *floatInput = new float[size()]; - std::ifstream file(filename, std::ios::binary); - file.read((char*) floatInput, size_in_bytes); - file.close(); + int buffersize; + // std::ifstream file(filename, std::ios::binary); + // file.read((char*) floatInput, size_in_bytes); + // file.close(); + int fd2 = open(filename.c_str(), O_RDWR | O_CREAT, 0); + struct stat sb; + fstat(fd2, &sb); + buffersize = sb.st_size; + int advise=posix_fadvise(fd2, 0, sb.st_size, POSIX_FADV_WILLNEED); + floatInput= (float*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd2, 0); for(u64 i = 0; i < size(); ++i) { data[i] = (T)(floatInput[i] * (1LL << scale)); } - delete[] floatInput; + ::close(fd2); + //delete[] floatInput; + munmap(floatInput, buffersize); } Tensor5D as_5d() @@ -590,4 +603,3 @@ class Tensor5D { } }; - diff --git a/sytorch/include/sytorch/utils.h b/sytorch/include/sytorch/utils.h index 57401a33..2c0d3b75 100644 --- a/sytorch/include/sytorch/utils.h +++ b/sytorch/include/sytorch/utils.h @@ -233,7 +233,7 @@ void convTranspose3dLoop( zPadWLeft = FW - 1 - zPadWLeft; zPadWRight = FW - 1 - zPadWRight; - #pragma omp parallel for collapse(5) + //#pragma omp parallel for collapse(5) for (int64_t n = 0; n < N; n++){ for (int64_t d = 0; d < outD; d++){ for (int64_t h = 0; h < outH; h++){ @@ -270,7 +270,7 @@ void convTranspose3dLoop( } } Arr5DIdx(outArr, N, outD, outH, outW, CO, n, d, h, w, co) = val; - // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << std::endl; + // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << "\n"; } } } @@ -363,7 +363,7 @@ void print(const Tensor &p, u64 scale, u64 bw) } 
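The `loadFloatWeights` and `load` changes above share one pattern: `open` the file, `posix_fadvise` the kernel to prefetch it, `mmap` it read-only, read through the mapping, and `munmap` when done. A self-contained sketch of that pattern follows (an illustration under our own naming, not code from the patch; the helper `map_float_file` is hypothetical):

```
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Map a binary file of floats read-only and hint the kernel to prefetch it.
// Returns nullptr on failure; on success the caller munmap()s the result.
static float *map_float_file(const char *path, size_t *size_out)
{
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return nullptr; }

    struct stat sb;
    if (fstat(fd, &sb) < 0) { perror("fstat"); close(fd); return nullptr; }

    // Readahead hint: we are about to stream through the whole file.
    posix_fadvise(fd, 0, sb.st_size, POSIX_FADV_WILLNEED);

    void *p = mmap(nullptr, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd); // the mapping keeps the file referenced after close
    if (p == MAP_FAILED) { perror("mmap"); return nullptr; }

    *size_out = sb.st_size;
    return static_cast<float *>(p);
}
```

Opening with `O_RDONLY` suffices for a `PROT_READ` mapping, and when the key or weights file sits on the tmpfs ramdisk the mapped pages are already resident, so reading through the mapping involves no disk I/O and no extra copy.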
diff --git a/sytorch/include/sytorch/utils.h b/sytorch/include/sytorch/utils.h
index 57401a33..2c0d3b75 100644
--- a/sytorch/include/sytorch/utils.h
+++ b/sytorch/include/sytorch/utils.h
@@ -233,7 +233,7 @@ void convTranspose3dLoop(
     zPadWLeft = FW - 1 - zPadWLeft;
     zPadWRight = FW - 1 - zPadWRight;
 
-    #pragma omp parallel for collapse(5)
+    //#pragma omp parallel for collapse(5)
     for (int64_t n = 0; n < N; n++){
         for (int64_t d = 0; d < outD; d++){
             for (int64_t h = 0; h < outH; h++){
@@ -270,7 +270,7 @@ void convTranspose3dLoop(
                     }
                 }
                 Arr5DIdx(outArr, N, outD, outH, outW, CO, n, d, h, w, co) = val;
-                // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << std::endl;
+                // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << "\n";
             }
         }
     }
@@ -363,7 +363,7 @@ void print(const Tensor<T> &p, u64 scale, u64 bw)
         }
         std::cout << (double) val / (1LL << scale);
         if ((i + 1) % d == 0) {
-            std::cout << std::endl;
+            std::cout << "\n";
         }
         else {
             std::cout << " ";
         }
@@ -382,7 +382,7 @@ inline void printshape(const std::vector<u64> &shape) {
     for(int i = 0; i < shape.size(); i++) {
         std::cout << shape[i] << ", ";
     }
-    std::cout << ")" << std::endl;
+    std::cout << ")" << "\n";
 }
 
 inline void sytorch_init()
@@ -414,4 +414,4 @@ void qkv_split(Tensor2D<T> &x, Tensor4D<T> &y, u64 n_heads)
             y(2, head, i, pos) = x(i, j + 2 * n_embd);
         }
     }
-}
+}
\ No newline at end of file
diff --git a/sytorch/ramdrive.sh b/sytorch/ramdrive.sh
new file mode 100755
index 00000000..5e265d1a
--- /dev/null
+++ b/sytorch/ramdrive.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# usage: ./ramdrive.sh <size>, e.g. ./ramdrive.sh 20m
+if [ -z "$1" ]; then
+    echo "usage: $0 <ramdisk_size>" >&2
+    exit 1
+fi
+sudo mkdir -p /tmp/ramdisk
+sudo chmod 777 /tmp/ramdisk
+sudo mount -t tmpfs -o size="$1" myramdisk /tmp/ramdisk
+mount | tail -n 1
diff --git a/sytorch/unmount_ramdrive.sh b/sytorch/unmount_ramdrive.sh
new file mode 100755
index 00000000..7407cd9c
--- /dev/null
+++ b/sytorch/unmount_ramdrive.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+sudo umount /tmp/ramdisk/
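The toy example above leaves the script rewrite as a TODO ("a sed command to automate this rewrite is still to be added"). Assuming the party scripts end a line with the bare invocation `./lenet_LLAMA_15 <party>`, and that 4 threads plus the ramdisk flag are wanted, one possible in-place rewrite is the following (the client script name comes from the toy example; `server.sh` is an assumed name):

```
# append "4 true" (thread count, ramdisk flag) to each model invocation
sed -i 's|\(\./lenet_LLAMA_15 [123]\)$|\1 4 true|' server.sh client-online.sh
```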