diff --git a/OnnxBridge/LLAMA/sytorchBackendRep.py b/OnnxBridge/LLAMA/sytorchBackendRep.py
index 6c5a2ed8..a2cd0093 100644
--- a/OnnxBridge/LLAMA/sytorchBackendRep.py
+++ b/OnnxBridge/LLAMA/sytorchBackendRep.py
@@ -108,7 +108,9 @@ def cleartext_post(code_list, program, scale, mode, indent):
         f"""
 int main(int argc, char**__argv){'{'}
-
+    std::ios::sync_with_stdio(false);
+    std::cin.tie(NULL);
+    std::cout.tie(NULL);
 
     prngWeights.SetSeed(osuCrypto::toBlock(0, 0));
     prngStr.SetSeed(osuCrypto::toBlock(time(NULL)));
@@ -147,6 +149,9 @@ def cleartext_fp_post(code_list, program, scale, mode, indent):
         f"""
 int main(int argc, char**__argv){'{'}
 
+    std::ios::sync_with_stdio(false);
+    std::cin.tie(NULL);
+    std::cout.tie(NULL);
     prngWeights.SetSeed(osuCrypto::toBlock(0, 0));
     prngStr.SetSeed(osuCrypto::toBlock(time(NULL)));
@@ -193,11 +198,14 @@ def llama_post(code_list, program, scale, mode, bitlength, indent):
         f"""
 int main(int __argc, char**__argv){'{'}
-
+    std::ios::sync_with_stdio(false);
+    std::cin.tie(NULL);
+    std::cout.tie(NULL);
 
     prngWeights.SetSeed(osuCrypto::toBlock(0, 0));
     prngStr.SetSeed(osuCrypto::toBlock(time(NULL)));
 
     int party = atoi(__argv[1]);
+    bool ramdisk_path = false;
     std::string ip = "127.0.0.1";
     int nt=4;
     std::string weights_file = "";
@@ -206,20 +214,29 @@ def llama_post(code_list, program, scale, mode, bitlength, indent):
         weights_file = __argv[2];
     {'}'}
     else if(party == DEALER){'{'}
         if(__argc > 2){'{'}
             nt = atoi(__argv[2]);
         {'}'}
+        if(__argc > 3){'{'}
+            ramdisk_path = (std::string(__argv[3]) == "true");
+        {'}'}
     {'}'}
     else if(party == SERVER){'{'}
         weights_file = __argv[2];
         if(__argc > 3){'{'}
             nt = atoi(__argv[3]);
         {'}'}
+        if(__argc > 4){'{'}
+            ramdisk_path = (std::string(__argv[4]) == "true");
+        {'}'}
     {'}'}
     else if(party == CLIENT){'{'}
         ip = __argv[2];
         if(__argc > 3){'{'}
             nt = atoi(__argv[3]);
         {'}'}
+        if(__argc > 4){'{'}
+            ramdisk_path = (std::string(__argv[4]) == "true");
+        {'}'}
     {'}'}
@@ -247,8 +267,9 @@ def llama_post(code_list, program, scale, mode, bitlength, indent):
     LlamaConfig::stochasticT = true;
     LlamaConfig::stochasticRT = true;
     LlamaConfig::num_threads = nt;
+    LlamaConfig::ramdisk_path = ramdisk_path;
 
-    llama->init(ip, true);
+    llama->init(ip, true, ramdisk_path);
 
     Net net;
     net.init(scale);
diff --git a/OnnxBridge/README.md b/OnnxBridge/README.md
index 6930c92b..63e79c43 100644
--- a/OnnxBridge/README.md
+++ b/OnnxBridge/README.md
@@ -84,12 +84,19 @@ python3 main.py --path "/path/to/onnx-file" --generate "code" --backend LLAMA --scale <scale>
 # compile secure code
 LLAMA/compile_llama.sh "/path/to/file.cpp"
 
+# Enable and mount Ramdisk on client and server machines
+./ramdrive.sh <ramdisk_size>
+
 # generate LLAMA keys on client and server machines
 ./<model> 1
 
 # start inference on server and client machines
-./<model> 2 <path/to/weights_file> // Server
-./<model> 3 <server_ip> < <path/to/input_file> // Client
+./<model> 2 <path/to/weights_file> <num_threads> <use_ramdisk> // Server
+./<model> 3 <server_ip> <num_threads> <use_ramdisk> < <path/to/input_file> // Client
+
+# Disable and unmount Ramdisk on client and server machines
+./unmount_ramdrive.sh
 ```
 
 #### **LLAMA Cleartext**
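Putting the README steps above together, an end-to-end ramdisk-backed run looks roughly as follows. This is a sketch rather than part of the patch: `lenet_LLAMA_15` is the model binary from the toy example below, `lenet_weights.dat` and `input.inp` are placeholder file names, and the argument order (party number, then the party-specific argument, then thread count, then ramdisk flag) follows the `main()` generated by `llama_post` above.

```
# on both machines: a 20 MB tmpfs comfortably holds the two ~9.5 MB LeNet keys
./ramdrive.sh 20m

# dealer (party 1): generate keys with 4 threads, staging them on the ramdisk
./lenet_LLAMA_15 1 4 true

# server (party 2): weights file, 4 threads, ramdisk enabled
./lenet_LLAMA_15 2 lenet_weights.dat 4 true

# client (party 3): server ip, 4 threads, ramdisk enabled; input on stdin
./lenet_LLAMA_15 3 127.0.0.1 4 true < input.inp

# on both machines, once inference is done
./unmount_ramdrive.sh
```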
diff --git a/sytorch/Toy example- single inference.md b/sytorch/Toy example- single inference.md
index ca775147..29c8d668 100644
--- a/sytorch/Toy example- single inference.md
+++ b/sytorch/Toy example- single inference.md
@@ -94,5 +94,33 @@ chmod +x client-offline.sh client-online.sh
 (on client)
 ./client-online.sh
 ```
+8. To use a ramdisk, follow the steps below:
+```
+# Enable and mount a ramdisk on the client and server machines.
+# Choose ramdisk_size >= the combined size of the server and client keys.
+#
+# Example (LeNet):
+#   server key = 9.5 MB, client key = 9.5 MB, total = 19 MB,
+#   so any ramdisk_size >= 19 MB works:
+./ramdrive.sh 20m
+
+# Example (CheXpert):
+#   server key = 87.5 GB, client key = 87.5 GB, total = 175 GB,
+#   so any ramdisk_size >= 175 GB works:
+./ramdrive.sh 200g
+
+# Change the server and client scripts to pass the thread count and the
+# ramdisk flag to the model binary, e.g.
+#   ./lenet_LLAMA_15 1   becomes   ./lenet_LLAMA_15 1 4 true
+# (a sed command to automate this rewrite is still to be added; see the
+# sketch at the end of this patch).
+
+# Disable and unmount the ramdisk on both machines after inference:
+./unmount_ramdrive.sh
+```
 
 In this particular example, you should get a score array of `[-2.71362 1.06747 4.43045 0.795044 -3.21173 -2.39871 -8.49094 10.3443 1.0567 -0.694458]`, which is maximal at index 7; this is expected, as the [input.jpg](https://github.com/kanav99/models/raw/main/input.jpg) file contains an image of a handwritten 7.
diff --git a/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h b/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h
index e4cbd43e..d95b5096 100644
--- a/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h
+++ b/sytorch/ext/cryptoTools/cryptoTools/Common/Log.h
@@ -41,7 +41,7 @@ namespace osuCrypto
             std::lock_guard<std::mutex> l(log.mLock);
             for (u64 i = 0; i < log.mMessages.size(); ++i)
             {
-                o << "[" << i << ", " << log.mMessages[i].first / 1000.0 << "ms ] " << log.mMessages[i].second << std::endl;
+                o << "[" << i << ", " << log.mMessages[i].first / 1000.0 << "ms ] " << log.mMessages[i].second << "\n";
             }
 
             return o;
@@ -219,4 +219,4 @@ namespace osuCrypto
     void setThreadName(const std::string name);
     void setThreadName(const char* name);
-}
+}
\ No newline at end of file
diff --git a/sytorch/ext/llama/api.cpp b/sytorch/ext/llama/api.cpp
index 8030758e..1e4fa2cf 100644
--- a/sytorch/ext/llama/api.cpp
+++ b/sytorch/ext/llama/api.cpp
@@ -1,5 +1,5 @@
 /*
-Authors: Deepak Kumaraswamy, Kanav Gupta
+Authors: Deepak Kumaraswamy, Kanav Gupta, Tanmay Rajore
 Copyright:
 Copyright (c) 2022 Microsoft Research
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -102,8 +102,8 @@ void llama::end()
     std::cerr << "Select/Bit operations Time = " << selectEvalMicroseconds / 1000.0 << " milliseconds\n";
     std::cerr << "Truncate time = " << arsEvalMicroseconds / 1000.0 << " milliseconds\n";
     auto endTime = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    std::cerr << "Total Time (including Key Read) = " << (endTime - startTime) / 1000000.0 << " milliseconds\n";
-    std::cerr << std::endl;
+    std::cerr << "Total Time (including Key Read) = " << double(endTime - startTime) / 1000000.0 << " seconds\n";
+    std::cerr << "\n";
     std::cerr << "Conv Online Communication = " << convOnlineComm << " bytes\n";
     std::cerr << "MatMul Online Communication = " << matmulOnlineComm << " bytes\n";
     std::cerr << "Select Online Communication = " << selectOnlineComm << " bytes\n";
@@ -146,7 +146,7 @@ void reconstructRT(int32_t size, GroupElement *arr, int bw)
 {
     uint64_t *tmp = new uint64_t[size];
     int bitarraySize = size % 8 == 0 ?
size / 8 : size / 8 + 1; - // std::cerr << "bitarraySize = " << bitarraySize << std::endl; + // std::cerr << "bitarraySize = " << bitarraySize << "\n"; uint8_t *tmp2 = new uint8_t[bitarraySize]; uint8_t *tmp3 = new uint8_t[bitarraySize]; // std::cerr << "bits = "; @@ -154,9 +154,9 @@ void reconstructRT(int32_t size, GroupElement *arr, int bw) // { // std::cerr << arr[i + size] << " "; // } - // std::cerr << std::endl; + // std::cerr << "\n"; packBitArray(arr + size, size, tmp2); - // std::cerr << "encoded = " << (int)tmp2[0] << std::endl; + // std::cerr << "encoded = " << (int)tmp2[0] << "\n"; if (parallel_reconstruct) { std::thread send_thread(&Peer::send_batched_input, peer, arr, size, bw); @@ -182,7 +182,7 @@ void reconstructRT(int32_t size, GroupElement *arr, int bw) // std::cerr << ((tmp3[i / 8] >> (i % 8)) & 1) << " "; arr[i + size] = arr[i + size] + ((tmp3[i / 8] >> (i % 8)) & 1); } - // std::cerr << std::endl; + // std::cerr << "\n"; delete[] tmp; numRounds += 1; } @@ -208,7 +208,7 @@ void Conv2DWrapper(int32_t N, int32_t H, int32_t W, int32_t strideW, MASK_PAIR(GroupElement *inputArr), MASK_PAIR(GroupElement *filterArr), MASK_PAIR(GroupElement *outArr)) { - std::cerr << ">> Conv2D - Start" << std::endl; + std::cerr << ">> Conv2D - Start" << "\n"; int d0 = N; int d1 = ((H - FH + (zPadHLeft + zPadHRight)) / strideH) + 1; int d2 = ((W - FW + (zPadWLeft + zPadWRight)) / strideW) + 1; @@ -275,7 +275,7 @@ void Conv2DWrapper(int32_t N, int32_t H, int32_t W, std::cerr << " Online Comm = " << (onlineComm1 - onlineComm0) << " bytes\n"; } - std::cerr << ">> Conv2D - End" << std::endl; + std::cerr << ">> Conv2D - End" << "\n"; } @@ -287,7 +287,7 @@ void Conv3DWrapper(int32_t N, int32_t D, int32_t H, int32_t W, int32_t strideW, GroupElement *inputArr, GroupElement *filterArr, GroupElement *outArr) { - std::cerr << ">> Conv3D - Start" << std::endl; + std::cerr << ">> Conv3D - Start" << "\n"; int d0 = N; int d1 = ((D - FD + (zPadDLeft + zPadDRight)) / strideD) + 1; int d2 = ((H - FH + (zPadHLeft + zPadHRight)) / strideH) + 1; @@ -350,7 +350,7 @@ void Conv3DWrapper(int32_t N, int32_t D, int32_t H, int32_t W, std::cerr << " Online Comm = " << (onlineComm1 - onlineComm0) << " bytes\n"; } - std::cerr << ">> Conv3D - End" << std::endl; + std::cerr << ">> Conv3D - End" << "\n"; } @@ -389,11 +389,14 @@ void ars_threads_helper(int thread_idx, int32_t size, GroupElement *inArr, Group { auto p = get_start_end(size, thread_idx); for(int i = p.first; i < p.second; i += 1){ + outArr[i] = evalARS(party - 2, inArr[i], keys[i].shift, keys[i]); + freeARSKeyPack(keys[i]); } } +//backup code /* auto keyread_start = std::chrono::high_resolution_clock::now(); auto keyread_end = std::chrono::high_resolution_clock::now(); @@ -413,11 +416,11 @@ void ars_threads_helper(int thread_idx, int32_t size, GroupElement *inArr, Group */ void ARS(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), int32_t shift) { - std::cerr << ">> Truncate" << (LlamaConfig::stochasticT ? " (stochastic)" : "") << " - Start" << std::endl; + std::cerr << ">> Truncate" << (LlamaConfig::stochasticT ? 
" (stochastic)" : "") << " - Start" << "\n"; if (party == DEALER) { pair *keys = new pair[size]; auto dealer_start = std::chrono::high_resolution_clock::now(); - #pragma omp parallel for + //#pragma omp parallel for for (int i = 0; i < size; i++) { GroupElement rout = random_ge(bitlength); keys[i] = keyGenARS(bitlength, bitlength, shift, inArr_mask[i], rout); @@ -447,7 +450,11 @@ void ARS(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *o keyread_start).count(); peer->sync(); + + auto start = std::chrono::high_resolution_clock::now(); + + std::thread thread_pool[num_threads]; for(int i = 0; i < num_threads; ++i) { thread_pool[i] = std::thread(ars_threads_helper, i, size, inArr, outArr, keys); @@ -475,12 +482,12 @@ void ARS(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *o arsEvalMicroseconds += (reconstruct_time + compute_time); delete[] keys; } - std::cerr << ">> Truncate - End" << std::endl; + std::cerr << ">> Truncate - End" << "\n"; } void ScaleDown(int32_t size, MASK_PAIR(GroupElement *inArr), int32_t sf) { - std::cerr << ">> ScaleDown - Start " << std::endl; + std::cerr << ">> ScaleDown - Start " << "\n"; if (localTruncation) { uint64_t m = ((1L << sf) - 1) << (bitlength - sf); @@ -511,7 +518,7 @@ void ScaleDown(int32_t size, MASK_PAIR(GroupElement *inArr), int32_t sf) else { ARS(size, inArr, inArr_mask, inArr, inArr_mask, sf); } - std::cerr << ">> ScaleDown - End " << std::endl; + std::cerr << ">> ScaleDown - End " << "\n"; } inline void matmul2d_server_helper(int thread_idx, int s1, int s2, int s3, GroupElement *A, GroupElement *B, GroupElement *C, GroupElement *a, GroupElement *b, GroupElement *c) @@ -549,7 +556,7 @@ inline void matmul2d_client_helper(int thread_idx, int s1, int s2, int s3, Group void MatMul2D(int32_t s1, int32_t s2, int32_t s3, MASK_PAIR(GroupElement *A), MASK_PAIR(GroupElement *B), MASK_PAIR(GroupElement *C), bool modelIsA) { - std::cerr << ">> MatMul2D - Start" << std::endl; + std::cerr << ">> MatMul2D - Start" << "\n"; if (party == DEALER) { auto dealer_start = std::chrono::high_resolution_clock::now(); @@ -567,7 +574,7 @@ void MatMul2D(int32_t s1, int32_t s2, int32_t s3, MASK_PAIR(GroupElement *A), client->send_matmul_key(keys.second); freeMatMulKey(keys.second); dealerMicroseconds += std::chrono::duration_cast(dealer_end - dealer_start).count(); - std::cerr << " Dealer Time = " << std::chrono::duration_cast(dealer_end - dealer_start).count() << " milliseconds" << std::endl; + std::cerr << " Dealer Time = " << std::chrono::duration_cast(dealer_end - dealer_start).count() << " milliseconds" << "\n"; } else { @@ -602,7 +609,7 @@ void MatMul2D(int32_t s1, int32_t s2, int32_t s3, MASK_PAIR(GroupElement *A), freeMatMulKey(key); } - std::cerr << ">> MatMul2D - End" << std::endl; + std::cerr << ">> MatMul2D - End" << "\n"; } void ElemWiseActModelVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr), @@ -616,7 +623,7 @@ void ArgMax(int32_t rows, int32_t cols, MASK_PAIR(GroupElement *inp), MASK_PAIR( // inp is a vector of size rows*columns and max (resp. maxidx) is caclulated for every // column chunk of elements. 
Result maxidx is stored in out (size: rows) - std::cerr << ">> ArgMax - Start" << std::endl; + std::cerr << ">> ArgMax - Start" << "\n"; always_assert(rows == 1); if (party == DEALER) { @@ -775,14 +782,14 @@ void ArgMax(int32_t rows, int32_t cols, MASK_PAIR(GroupElement *inp), MASK_PAIR( auto eval_time = std::chrono::duration_cast(end - start).count(); argmaxEvalMicroseconds += eval_time; evalMicroseconds += eval_time; - std::cerr << " Eval time: " << eval_time / 1000.0 << " milliseconds" << std::endl; + std::cerr << " Eval time: " << eval_time / 1000.0 << " milliseconds" << "\n"; delete[] tmpMax; delete[] tmpIdx; delete[] drelu; delete[] mult_res; } - std::cerr << ">> ArgMax - End" << std::endl; + std::cerr << ">> ArgMax - End" << "\n"; } void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, @@ -792,7 +799,7 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, int32_t C1, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr)) { // taken from the equivalent function in Porthos/src/EzPCFunctionalities.cpp - std::cerr << ">> AvgPool - Start" << std::endl; + std::cerr << ">> AvgPool - Start" << "\n"; int rows = N*H*W*C; std::vector filterAvg(rows, 0); std::vector filterAvg_mask(rows, 0); @@ -846,12 +853,12 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, auto common_time = std::chrono::duration_cast(common_end - common_start).count(); if (party == DEALER) { dealerMicroseconds += common_time; - std::cerr << " Dealer Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Dealer Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << "\n"; } else { avgpoolEvalMicroseconds += common_time; evalMicroseconds += common_time; - std::cerr << " Eval Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Eval Time (without PubDiv) = " << common_time / 1000.0 << " miliseconds" << "\n"; } @@ -879,7 +886,7 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, // todo: the divisor ksizeH * ksizeW is 32 bits long when passed as param, but ezpc cleartext explicitly converts to 64 bit value // will this be an issue in the future? 
// ElemWiseVectorPublicDiv(rows, filterAvg.data(), filterAvg_mask.data(), ksizeH * ksizeW, outp.data(), outp_mask.data()); - std::cerr << "Error Error Error" << std::endl; + std::cerr << "Error Error Error" << "\n"; exit(1); } @@ -898,7 +905,7 @@ void AvgPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t ksizeH, } } } - std::cerr << ">> AvgPool - End" << std::endl; + std::cerr << ">> AvgPool - End" << "\n"; } @@ -915,12 +922,12 @@ void mult_threads_helper(int thread_idx, int32_t size, GroupElement *inArr, Grou void ElemWiseSecretSharedVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *multArrVec), MASK_PAIR(GroupElement *outputArr)) { - std::cerr << ">> ElemWise Mult - start" << std::endl; + std::cerr << ">> ElemWise Mult - start" << "\n"; if (party == DEALER) { uint64_t dealer_toal_time = 0; pair *keys = new pair[size]; - #pragma omp parallel for + //#pragma omp parallel for for(int i = 0; i < size; ++i) { auto dealer_start = std::chrono::high_resolution_clock::now(); auto rout = random_ge(bitlength); @@ -971,7 +978,7 @@ void ElemWiseSecretSharedVectorMult(int32_t size, MASK_PAIR(GroupElement *inArr) delete[] keys; } - std::cerr << ">> ElemWise Mult - end" << std::endl; + std::cerr << ">> ElemWise Mult - end" << "\n"; } void maxpool_threads_helper(int thread_idx, int fh, int fw, int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, @@ -1017,7 +1024,7 @@ void MaxPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32_t strideW, int32_t N1, int32_t imgH, int32_t imgW, int32_t C1, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), GroupElement *oneHot) { - std::cerr << ">> MaxPool - Start" << std::endl; + std::cerr << ">> MaxPool - Start" << "\n"; int d1 = ((imgH - FH + (zPadHLeft + zPadHRight)) / strideH) + 1; int d2 = ((imgW - FW + (zPadWLeft + zPadWRight)) / strideW) + 1; always_assert(d1 == H); @@ -1083,7 +1090,7 @@ void MaxPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, auto dealer_end = std::chrono::high_resolution_clock::now(); auto dealer_time = std::chrono::duration_cast(dealer_end - dealer_start).count() - dealer_file_read_time; dealerMicroseconds += dealer_time; - std::cerr << " Dealer time: " << dealer_time / 1000.0 << " milliseconds" << std::endl; + std::cerr << " Dealer time: " << dealer_time / 1000.0 << " milliseconds" << "\n"; } else { MaxpoolKeyPack *keys = new MaxpoolKeyPack[(FH * FW - 1) * N * C * H * W]; @@ -1175,14 +1182,14 @@ void MaxPool(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, evalMicroseconds += eval_time; maxpoolEvalMicroseconds += eval_time; delete[] keys; - std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " milliseconds" << std::endl; - std::cerr << " Key Size = " << keysize / (1024.0 * 1024.0) << " MB" << std::endl; - std::cerr << " Compute Time = " << timeCompute / 1000.0 << " milliseconds" << std::endl; - std::cerr << " Reconstruct Time = " << timeReconstruct / 1000.0 << " milliseconds" << std::endl; - std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " milliseconds" << "\n"; + std::cerr << " Key Size = " << keysize / (1024.0 * 1024.0) << " MB" << "\n"; + std::cerr << " Compute Time = " << timeCompute / 1000.0 << " milliseconds" << "\n"; + std::cerr << " Reconstruct Time = " << timeReconstruct / 1000.0 << " milliseconds" << "\n"; + std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << "\n"; } - std::cerr << ">> MaxPool - End" << std::endl; + 
std::cerr << ">> MaxPool - End" << "\n"; } @@ -1210,13 +1217,13 @@ void relu_dealer_threads_helper(int thread_idx, int32_t size, GroupElement *inAr void Relu(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement *outArr), GroupElement *drelu) { - std::cerr << ">> Relu (Spline) - Start" << std::endl; + std::cerr << ">> Relu (Spline) - Start" << "\n"; // todo: handle doTruncation param if (party == DEALER) { uint64_t dealer_total_time = 0; std::pair *keys = new std::pair[size]; auto start = std::chrono::high_resolution_clock::now(); - #pragma omp parallel for + //#pragma omp parallel for for(int i = 0; i < size; i += 1){ auto rout = random_ge(bitlength); // prng inside multithreads, need some locking drelu[i] = random_ge(1); @@ -1233,7 +1240,7 @@ void Relu(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement * } delete[] keys; dealerMicroseconds += dealer_total_time; - std::cerr << " Dealer time = " << dealer_total_time / 1000.0 << " milliseconds" << std::endl; + std::cerr << " Dealer time = " << dealer_total_time / 1000.0 << " milliseconds" << "\n"; } else { // Step 1: Preprocessing Keys from Dealer @@ -1283,7 +1290,7 @@ void Relu(int32_t size, MASK_PAIR(GroupElement *inArr), MASK_PAIR(GroupElement * reluEvalMicroseconds += (reconstruct_time + compute_time); delete[] keys; } - std::cerr << ">> Relu (Spline) - End " << std::endl; + std::cerr << ">> Relu (Spline) - End " << "\n"; } #define BIG_LOOPY(e) for(int n = 0; n < N; ++n) {\ @@ -1326,7 +1333,7 @@ void maxpool_onehot_threads_helper(int thread_idx, int f, int32_t N, int32_t H, // This is compatible with both MaxPool and MaxPoolDouble void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32_t FW, GroupElement *maxBits, GroupElement *oneHot) { - std::cerr << ">> MaxPoolOneHot - Start" << std::endl; + std::cerr << ">> MaxPoolOneHot - Start" << "\n"; GroupElement *curr = make_array(N * H * W * C); if (party == DEALER) { BIG_LOOPY( @@ -1383,7 +1390,7 @@ void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32 Arr5DIdx(oneHot, FH * FW, N, H, W, C, f, n, h, w, c) = evalAnd(party - 2, max, 1 ^ c1, key); mod(Arr5DIdx(oneHot, FH * FW, N, H, W, C, f, n, h, w, c), 1); ) - + /* testing to be done // std::thread thread_pool[num_threads]; // for(int i = 0; i < num_threads; ++i) { // thread_pool[i] = std::thread(maxpool_onehot_threads_helper, i, f, N, H, W, C, FH, FW, maxBits, curr, oneHot, keys); @@ -1392,7 +1399,7 @@ void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32 // for(int i = 0; i < num_threads; ++i) { // thread_pool[i].join(); // } - + */ reconstruct(N * H * W * C, oneHot + f * N * H * W * C, 1); BIG_LOOPY( @@ -1407,12 +1414,12 @@ void MaxPoolOneHot(int32_t N, int32_t H, int32_t W, int32_t C, int32_t FH, int32 auto eval_time = std::chrono::duration_cast(end - start).count(); evalMicroseconds += eval_time; selectEvalMicroseconds += eval_time; - std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " miliseconds" << std::endl; - std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << std::endl; + std::cerr << " Key Read Time = " << keyread_time / 1000.0 << " miliseconds" << "\n"; + std::cerr << " Online Time = " << eval_time / 1000.0 << " miliseconds" << "\n"; delete[] keys; } delete[] curr; - std::cerr << ">> MaxPoolOneHot - End" << std::endl; + std::cerr << ">> MaxPoolOneHot - End" << "\n"; } void ConvTranspose3DWrapper(int64_t N, @@ -1440,7 +1447,7 @@ void ConvTranspose3DWrapper(int64_t N, GroupElement* 
filterArr, GroupElement* outArr) { - std::cerr << ">> ConvTranspose3D - Start" << std::endl; + std::cerr << ">> ConvTranspose3D - Start" << "\n"; always_assert(outD == (D - 1) * strideD - zPadDLeft - zPadDRight + FD); always_assert(outH == (H - 1) * strideH - zPadHLeft - zPadHRight + FH); always_assert(outW == (W - 1) * strideW - zPadWLeft - zPadWRight + FW); @@ -1501,6 +1508,6 @@ void ConvTranspose3DWrapper(int64_t N, std::cerr << " Online Comm = " << (onlineComm1 - onlineComm0) << " bytes\n"; } - std::cerr << ">> ConvTranspose3D - End" << std::endl; + std::cerr << ">> ConvTranspose3D - End" << "\n"; -} +} \ No newline at end of file diff --git a/sytorch/ext/llama/include/llama/comms.h b/sytorch/ext/llama/include/llama/comms.h index 507e0fd7..929889cc 100644 --- a/sytorch/ext/llama/include/llama/comms.h +++ b/sytorch/ext/llama/include/llama/comms.h @@ -51,6 +51,7 @@ class Peer { std::fstream file; uint64_t bytesSent = 0; uint64_t bytesReceived = 0; + Peer(std::string ip, int port); Peer(int sendsocket, int recvsocket) { @@ -148,29 +149,32 @@ Peer* waitForPeer(int port); class Dealer { public: int consocket; - bool useFile = false; + bool useFile = true; std::fstream file; uint64_t bytesSent = 0; uint64_t bytesReceived = 0; - bool ramdisk = false; + bool ramdisk =true; char *ramdiskBuffer; char *ramdiskStart; int ramdiskSize; + bool ramdisk_path = false; Dealer(std::string ip, int port); - Dealer(std::string filename, bool ramdisk) { + Dealer(std::string filename, bool ramdisk,bool ramdisk_path) { this->useFile = true; this->ramdisk = ramdisk; - if (ramdisk) { + this->ramdisk_path = ramdisk_path; + if (ramdisk && ramdisk_path) { int fd = open(filename.c_str(), O_RDWR | O_CREAT, 0); struct stat sb; fstat(fd, &sb); - std::cerr << "Key Size: " << sb.st_size << " bytes" << std::endl; + std::cerr << "Key Size: " << sb.st_size << " bytes" << "\n"; + int advise=posix_fadvise(fd, 0, sb.st_size, POSIX_FADV_WILLNEED); ramdiskSize = sb.st_size; - ramdiskBuffer = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); + ramdiskBuffer = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); ramdiskStart = ramdiskBuffer; - // std::cout << "RAMDISK: " << (int *)ramdiskBuffer << std::endl; + std::cout << "RAMDISK: " << (int *)ramdiskBuffer << "\n"; ::close(fd); } else { @@ -257,4 +261,3 @@ class Dealer { TripleKeyPack recv_triple_key(int bw, int64_t na, int64_t nb, int64_t nc); }; - diff --git a/sytorch/ext/llama/include/llama/config.h b/sytorch/ext/llama/include/llama/config.h index cfd2c34e..d28ce116 100644 --- a/sytorch/ext/llama/include/llama/config.h +++ b/sytorch/ext/llama/include/llama/config.h @@ -14,4 +14,5 @@ namespace LlamaConfig { extern int port; extern bool stochasticRT; extern bool stochasticT; + extern bool ramdisk_path; } diff --git a/sytorch/ext/llama/pubdiv.cpp b/sytorch/ext/llama/pubdiv.cpp index baf63aa5..1279d528 100644 --- a/sytorch/ext/llama/pubdiv.cpp +++ b/sytorch/ext/llama/pubdiv.cpp @@ -75,7 +75,7 @@ std::pair keyGenARS(int Bin, int Bout, uint64_t shift, G uint64_t ones = ((uint64_t)1 << shift) - 1; GroupElement alpha_s = y & ones; - // std::cout << "keygen alpha_n (dualdcf alpha) " << alpha_n << " alpha_s (dcf alpha)" << alpha_s << std::endl; + // std::cout << "keygen alpha_n (dualdcf alpha) " << alpha_n << " alpha_s (dcf alpha)" << alpha_s << "\n"; if (!LlamaConfig::stochasticT) { auto dcfKeys = keyGenDCF(shift, Bout, alpha_s, 1); @@ -113,7 +113,7 @@ GroupElement evalARS(int party, GroupElement x, uint64_t shift, const ARSKeyPack uint8_t x_msb = msb(x, k.Bin); // 
todo: bitsize of x_n should have been k.Bin - 1 uint64_t x_n = x & (((uint64_t)1 << (k.Bin - 1)) - 1); - // std::cout << "x_n " << x_n << std::endl; + // std::cout << "x_n " << x_n << "\n"; GroupElement dcfIdx = ((uint64_t)1 << shift) - x_s - 1; // GroupElement t_s = evalDCF(party, dcfIdx, k.dcfKey); @@ -138,4 +138,4 @@ GroupElement evalARS(int party, GroupElement x, uint64_t shift, const ARSKeyPack } return res; -} +} \ No newline at end of file diff --git a/sytorch/ext/llama/src/llama/comms.cpp b/sytorch/ext/llama/src/llama/comms.cpp index 3183696c..89361f6c 100644 --- a/sytorch/ext/llama/src/llama/comms.cpp +++ b/sytorch/ext/llama/src/llama/comms.cpp @@ -66,7 +66,7 @@ Peer::Peer(std::string ip, int port) { const int one = 1; setsockopt(sendsocket, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)); } - std::cerr << "connected" << std::endl; + std::cerr << "connected" << "\n"; } @@ -136,7 +136,7 @@ Peer* waitForPeer(int port) { close(mysocket); } - std::cerr << "connected" << std::endl; + std::cerr << "connected" << "\n"; return new Peer(sendsocket, recvsocket); } @@ -742,11 +742,11 @@ Dealer::Dealer(std::string ip, int port) { void Dealer::close() { if (useFile) { - if (!ramdisk) { + if (!ramdisk && ramdisk_path) { file.close(); } else { - // std::cout << (int)(ramdiskBuffer - ramdiskStart) << "bytes read" << std::endl; + // std::cout << (int)(ramdiskBuffer - ramdiskStart) << "bytes read" << "\n"; // always_assert(ramdiskBuffer - ramdiskStart == ramdiskSize); } } @@ -758,15 +758,18 @@ void Dealer::close() { GroupElement Dealer::recv_mask() { char buf[8]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { + std::cout<<"ramdiskBuffer: "<<(uint64_t)ramdiskBuffer<<"\n"; GroupElement g = *(uint64_t *)ramdiskBuffer; ramdiskBuffer += 8; bytesReceived += 8; return g; } this->file.read(buf, 8); + //std::cout << "dealer recv mask" << "\n"; } else { - recv(consocket, buf, 8, MSG_WAITALL); + // recv(consocket, buf, 8, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g = *(uint64_t *)buf; bytesReceived += 8; @@ -776,15 +779,18 @@ GroupElement Dealer::recv_mask() { MultKey Dealer::recv_mult_key() { char buf[sizeof(MultKey)]; if (useFile) { - if (ramdisk) { - MultKey k(*(MultKey *)ramdiskBuffer); + if (ramdisk && ramdisk_path) { + std::cout<<"ramdiskBuffer ,multikey: "<<(uint64_t)ramdiskBuffer<<"\n"; + MultKey k=(*(MultKey *)ramdiskBuffer); ramdiskBuffer += sizeof(MultKey); bytesReceived += sizeof(MultKey); return k; } this->file.read(buf, sizeof(MultKey)); + //std::cout<< "dealer recv mult key" << "\n"; } else { - recv(consocket, buf, sizeof(MultKey), MSG_WAITALL); + //recv(consocket, buf, sizeof(MultKey), MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } MultKey k(*(MultKey *)buf); bytesReceived += sizeof(MultKey); @@ -794,8 +800,8 @@ MultKey Dealer::recv_mult_key() { osuCrypto::block Dealer::recv_block() { char buf[sizeof(osuCrypto::block)]; if (useFile) { - if (ramdisk) { - // std::cout << *(uint64_t *) ramdiskBuffer << std::endl; + if (ramdisk && ramdisk_path) { + std::cout << *(uint64_t *) ramdiskBuffer << "\n"; // Kanav: This could break when the endianness of the machine changes osuCrypto::block b = osuCrypto::toBlock(*(uint64_t *) (ramdiskBuffer + 8), *(uint64_t *) ramdiskBuffer); ramdiskBuffer += sizeof(osuCrypto::block); @@ -803,8 +809,10 @@ osuCrypto::block Dealer::recv_block() { return b; } this->file.read(buf, sizeof(osuCrypto::block)); + //std::cout<< "dealer recv block" << "\n"; } else { - recv(consocket, buf, sizeof(osuCrypto::block), 
MSG_WAITALL); + // recv(consocket, buf, sizeof(osuCrypto::block), MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } osuCrypto::block b = *(osuCrypto::block *)buf; bytesReceived += sizeof(osuCrypto::block); @@ -815,7 +823,7 @@ GroupElement Dealer::recv_ge(int bl) { if (bl > 32) { char buf[8]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint64_t *)ramdiskBuffer; ramdiskBuffer += 8; bytesReceived += 8; @@ -823,8 +831,10 @@ GroupElement Dealer::recv_ge(int bl) { return g; } this->file.read(buf, 8); + //std::cerr << "dealer recv ge 32" << "\n"; } else { - recv(consocket, buf, 8, MSG_WAITALL); + // recv(consocket, buf, 8, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint64_t *)buf); mod(g, bl); @@ -834,7 +844,7 @@ GroupElement Dealer::recv_ge(int bl) { else if (bl > 16) { char buf[4]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint32_t *)ramdiskBuffer; ramdiskBuffer += 4; bytesReceived += 4; @@ -842,8 +852,10 @@ GroupElement Dealer::recv_ge(int bl) { return g; } this->file.read(buf, 4); + //std::cout << "dealer recv ge 16" << "\n"; } else { - recv(consocket, buf, 4, MSG_WAITALL); + // recv(consocket, buf, 4, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint32_t *)buf); mod(g, bl); @@ -853,7 +865,7 @@ GroupElement Dealer::recv_ge(int bl) { else if (bl > 8) { char buf[2]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint16_t *)ramdiskBuffer; ramdiskBuffer += 2; bytesReceived += 2; @@ -861,8 +873,10 @@ GroupElement Dealer::recv_ge(int bl) { return g; } this->file.read(buf, 2); + //std::cout<< "dealer recv ge 8" << "\n"; } else { - recv(consocket, buf, 2, MSG_WAITALL); + //recv(consocket, buf, 2, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint16_t *)buf); mod(g, bl); @@ -872,16 +886,18 @@ GroupElement Dealer::recv_ge(int bl) { else { char buf[1]; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { GroupElement g = *(uint8_t *)ramdiskBuffer; ramdiskBuffer += 1; bytesReceived += 1; mod(g, bl); return g; } - this->file.read(buf, 1); + this->file.read(buf, 1); + //std::cout << "dealer recv ge 1" << "\n"; } else { - recv(consocket, buf, 1, MSG_WAITALL); + // recv(consocket, buf, 1, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } GroupElement g(*(uint8_t *)buf); mod(g, bl); @@ -894,15 +910,17 @@ GroupElement Dealer::recv_ge(int bl) { void Dealer::recv_ge_array(const GroupElement *g, int size) { char *buf = (char *)g; if (useFile) { - if (ramdisk) { + if (ramdisk && ramdisk_path) { memcpy(buf, ramdiskBuffer, 8*size); ramdiskBuffer += 8*size; bytesReceived += 8*size; return; } this->file.read(buf, 8*size); + //std::cout << "dealer recv ge array" << "\n"; } else { - recv(consocket, buf, 8*size, MSG_WAITALL); + // recv(consocket, buf, 8*size, MSG_WAITALL); + std::cout << "dealer recv mask" << "\n"; } bytesReceived += 8 * size; @@ -914,7 +932,7 @@ DCFKeyPack Dealer::recv_dcf_keypack(int Bin, int Bout, int groupSize) { kp.Bout = Bout; kp.groupSize = groupSize; - if (ramdisk) { + if (ramdisk && ramdisk_path) { kp.k = (osuCrypto::block *)ramdiskBuffer; ramdiskBuffer += sizeof(osuCrypto::block) * (Bin + 1); } else { @@ -1136,7 +1154,7 @@ ReluKeyPack Dealer::recv_relu_key(int Bin, int Bout) { kp.Bout = Bout; kp.g = new GroupElement[groupSize]; // kp.dcfKey = recv_dcf_keypack(Bin, Bout, groupSize); - if (ramdisk) { + if (ramdisk && ramdisk_path) { kp.k = (osuCrypto::block 
*)ramdiskBuffer; ramdiskBuffer += sizeof(osuCrypto::block) * (Bin + 1); } else { @@ -1148,7 +1166,7 @@ ReluKeyPack Dealer::recv_relu_key(int Bin, int Bout) { for(int i = 0; i < groupSize; ++i) { kp.g[i] = recv_ge(Bout); } - if (ramdisk && (Bin > 32)) { + if (ramdisk && ramdisk_path && (Bin > 32)) { kp.v = (GroupElement *)ramdiskBuffer; ramdiskBuffer += sizeof(GroupElement) * (Bin * groupSize); } diff --git a/sytorch/ext/llama/src/llama/config.cpp b/sytorch/ext/llama/src/llama/config.cpp index b03208a1..684762f7 100644 --- a/sytorch/ext/llama/src/llama/config.cpp +++ b/sytorch/ext/llama/src/llama/config.cpp @@ -11,4 +11,5 @@ namespace LlamaConfig { int port = 42069; bool stochasticRT = false; bool stochasticT = false; + bool ramdisk_path = false; } diff --git a/sytorch/ext/llama/src/llama/utils.cpp b/sytorch/ext/llama/src/llama/utils.cpp index 84c237fc..96ef0ddd 100644 --- a/sytorch/ext/llama/src/llama/utils.cpp +++ b/sytorch/ext/llama/src/llama/utils.cpp @@ -137,7 +137,7 @@ void Conv2DReshapeInput(size_t N, size_t H, size_t W, size_t CI, size_t FH, size size_t curPosW = leftTopCornerW + fw; for (size_t ci = 0; ci < CI; ci++){ size_t rowidx = (fh*FW*CI) + (fw*CI) + ci; - // std::cout << rowidx << std::endl; + // std::cout << rowidx << "\n"; if ((((curPosH < 0) || (curPosH >= H)) || ((curPosW < 0) || (curPosW >= W)))){ Arr2DIdx(outputArr, RRows, RCols, rowidx, linIdxFilterMult) = 0L; } @@ -385,7 +385,7 @@ void Conv3DReshapeInput(size_t N, size_t D, size_t H, size_t W, size_t CI, size_ size_t curPosW = leftTopCornerW + fw; for (size_t ci = 0; ci < CI; ci++){ size_t rowidx = (fd*FH*FW*CI) + (fh*FW*CI) + (fw*CI) + ci; - // std::cout << rowidx << std::endl; + // std::cout << rowidx << "\n"; if ((((curPosD < 0) || (curPosD >= D)) || ((curPosH < 0) || (curPosH >= H)) || ((curPosW < 0) || (curPosW >= W)))){ Arr2DIdx(outputArr, RRows, RCols, rowidx, linIdxFilterMult) = 0L; } @@ -596,7 +596,7 @@ void ConvTranspose3DLoopInnerClear( zPadWLeft = FW - 1 - zPadWLeft; zPadWRight = FW - 1 - zPadWRight; - #pragma omp parallel for collapse(5) + //#pragma omp parallel for collapse(5) for (int64_t n = 0; n < N; n++){ for (int64_t d = 0; d < outD; d++){ for (int64_t h = 0; h < outH; h++){ @@ -633,10 +633,10 @@ void ConvTranspose3DLoopInnerClear( } } Arr5DIdx(outArr, N, outD, outH, outW, CO, n, d, h, w, co) = val; - // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << std::endl; + // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << "\n"; } } } } } -} +} \ No newline at end of file diff --git a/sytorch/include/sytorch/backend/cleartext.h b/sytorch/include/sytorch/backend/cleartext.h index 277021df..7133a47b 100644 --- a/sytorch/include/sytorch/backend/cleartext.h +++ b/sytorch/include/sytorch/backend/cleartext.h @@ -16,7 +16,7 @@ class ClearText : public Backend { template void fastfor(u64 size, Functor f) { - #pragma omp parallel for + //#pragma omp parallel for for (u64 i = 0; i < size; i++) { f(i); } diff --git a/sytorch/include/sytorch/backend/float.h b/sytorch/include/sytorch/backend/float.h index 9779bb10..0673ac8d 100644 --- a/sytorch/include/sytorch/backend/float.h +++ b/sytorch/include/sytorch/backend/float.h @@ -13,7 +13,7 @@ class FloatClearText : public Backend template void fastfor(u64 size, Functor f) { -#pragma omp parallel for +//#pragma omp parallel for for (u64 i = 0; i < size; i++) { f(i); diff --git a/sytorch/include/sytorch/backend/llama_base.h b/sytorch/include/sytorch/backend/llama_base.h 
index 8fb9c688..69ef5cb5 100644 --- a/sytorch/include/sytorch/backend/llama_base.h +++ b/sytorch/include/sytorch/backend/llama_base.h @@ -16,23 +16,46 @@ class LlamaBase : public Backend { public: const bool useLocalTruncation = false; - void init(std::string ip, bool ramdisk = false) + void init(std::string ip, bool ramdisk = true,bool ramdisk_path=false) { u64 seedKey = 0xdeadbeefbadc0ffe; for(int i = 0; i < 256; ++i) { LlamaConfig::prngs[i].SetSeed(osuCrypto::toBlock(i, seedKey)); } if (LlamaConfig::party == 1) { + std::cerr< { void ss2m(T *data, u64 size) { - std::cerr << ">> SS2M - Start" << std::endl; + std::cerr << ">> SS2M - Start" << "\n"; if (LlamaConfig::party == 1) { for (int i = 0; i < size; i++){ data[i] = random_ge(64); @@ -191,7 +214,7 @@ class LlamaBase : public Backend { } reconstruct(size, data, 64); } - std::cerr << ">> SS2M - End" << std::endl; + std::cerr << ">> SS2M - End" << "\n"; } void matmul(const Tensor2D &a, const Tensor2D &b, Tensor2D &c) { diff --git a/sytorch/include/sytorch/backend/llama_extended.h b/sytorch/include/sytorch/backend/llama_extended.h index 3e9c9b8f..a078899c 100644 --- a/sytorch/include/sytorch/backend/llama_extended.h +++ b/sytorch/include/sytorch/backend/llama_extended.h @@ -168,7 +168,7 @@ class LlamaExtended : public LlamaBase { { if (node->layer->doTruncationForward) { if (node->children.size() == 1) { - // std::cout << "yeah.." << std::endl; + // std::cout << "yeah.." << "\n"; LayerGraphNode *child = node->children[0]; if (child->layer->doTruncationForward) { // no optimization possible diff --git a/sytorch/include/sytorch/graph.h b/sytorch/include/sytorch/graph.h index 471b22fd..2fe954a2 100644 --- a/sytorch/include/sytorch/graph.h +++ b/sytorch/include/sytorch/graph.h @@ -62,7 +62,7 @@ template void print_dot_graph(LayerGraphNode *root) { std::ofstream dotfile("graph.dot"); - dotfile << "digraph G {" << std::endl; + dotfile << "digraph G {" << "\n"; topologicalApply(root, [&dotfile](LayerGraphNode *node, LayerGraphNode *_root) { if (node->layer != nullptr) { @@ -77,13 +77,13 @@ void print_dot_graph(LayerGraphNode *root) } label += "(" + args + ")"; } - dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " [label=\"" << label << "\"" + (node->mark ? std::string(" color=\"red\"") : std::string("")) + "];" << std::endl; + dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " [label=\"" << label << "\"" + (node->mark ? 
std::string(" color=\"red\"") : std::string("")) + "];" << "\n"; for (auto &child : node->children) { - dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " -> " << child->layer->name + std::to_string((uint64_t)(child->layer)) << ";" << std::endl; + dotfile << node->layer->name + std::to_string((uint64_t)(node->layer)) << " -> " << child->layer->name + std::to_string((uint64_t)(child->layer)) << ";" << "\n"; } } }); - dotfile << "}" << std::endl; + dotfile << "}" << "\n"; dotfile.close(); -} +} \ No newline at end of file diff --git a/sytorch/include/sytorch/layers/layers.h b/sytorch/include/sytorch/layers/layers.h index cff45662..54ec8600 100644 --- a/sytorch/include/sytorch/layers/layers.h +++ b/sytorch/include/sytorch/layers/layers.h @@ -752,7 +752,7 @@ class Concat: public Layer { sz += t->size(); } - #pragma omp parallel for + //#pragma omp parallel for for(int i = 0; i < sz; ++i) { u64 l = i % outchannels; @@ -884,7 +884,7 @@ class Split: public Layer { u64 split_size = a.shape.back() / n_splits; // 3 u64 rest_size = a.size() / a.shape.back(); // 2 - #pragma omp parallel for + //#pragma omp parallel for for(u64 i = 0; i < a.size(); ++i) { u64 p = i / a.shape.back(); u64 q = i % a.shape.back(); @@ -948,7 +948,7 @@ class Transpose: public Layer { void _forward(Tensor &a) { always_assert(a.shape.size() == 2); - #pragma omp parallel for collapse(2) + //#pragma omp parallel for collapse(2) for (u64 i = 0; i < a.shape[0]; ++i) { for (u64 j = 0; j < a.shape[1]; ++j) { this->activation.data[j * a.shape[0] + i] = a.data[i * a.shape[1] + j]; diff --git a/sytorch/include/sytorch/module.h b/sytorch/include/sytorch/module.h index bce4091d..f0654b19 100644 --- a/sytorch/include/sytorch/module.h +++ b/sytorch/include/sytorch/module.h @@ -3,7 +3,10 @@ #include #include #include -#include +#include +#include +#include +#include template class SytorchModule { @@ -54,7 +57,7 @@ class SytorchModule { } id = id + "|" + paramstring(args...); if (functionalLayerMap.find(id) == functionalLayerMap.end()) { - std::cerr << "Layer not found = \"" << id << "\"" << std::endl; + std::cerr << "Layer not found = \"" << id << "\"" << "\n"; exit(1); } return functionalLayerMap[id]; @@ -148,14 +151,26 @@ class SytorchModule { always_assert(size_in_bytes % 4 == 0); // as it's float size_t numParameters = size_in_bytes / 4; float *floatWeights = new float[numParameters]; + //float *buffer; + int buffersize = 0; - std::ifstream file(weightsFile, std::ios::binary); - file.read((char*) floatWeights, size_in_bytes); - file.close(); + // std::ifstream file(weightsFile, std::ios::binary); + // file.read((char*) floatWeights, size_in_bytes); + // file.close(); + int fd1 = open(weightsFile.c_str(), O_RDWR | O_CREAT, 0); + struct stat sb; + fstat(fd1, &sb); + buffersize = sb.st_size; + int advise=posix_fadvise(fd1, 0, sb.st_size, POSIX_FADV_WILLNEED); + floatWeights= (float*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd1, 0); + //floatWeights = buffer; + std::cerr << "Model Weights Size: " << sb.st_size << " bytes" << "\n"; + ::close(fd1); u64 scale = this->scale; size_t wIdx = 0; for (auto &node: allNodesInExecutionOrder) { + auto layer = node->layer; if (layer->name == "BatchNormInference") { auto bn = (BatchNormInference*) layer; @@ -177,12 +192,13 @@ class SytorchModule { } wIdx += weights.size; + auto bias = layer->getbias(); if (layer->useBias) { for (u64 j = 0; j < bias.size; ++j) { - bias.data[j] = type_cast(floatWeights[wIdx + j] * (float)(1LL << (2 * scale))); + bias.data[j] = 
type_cast(floatWeights[wIdx + j] * (float)(1LL << (2*scale))); } wIdx += bias.size; @@ -192,9 +208,12 @@ class SytorchModule { } } } - + always_assert(wIdx == numParameters); - delete[] floatWeights; + + //delete floatWeights; + munmap(floatWeights, buffersize); + } void dumpi64(const std::string weightsFile) @@ -391,4 +410,4 @@ class SytorchModule { }; template -std::map *> SytorchModule::functionalLayerMap = std::map *>(); +std::map *> SytorchModule::functionalLayerMap = std::map *>(); \ No newline at end of file diff --git a/sytorch/include/sytorch/tensor.h b/sytorch/include/sytorch/tensor.h index 389651d1..87bca667 100644 --- a/sytorch/include/sytorch/tensor.h +++ b/sytorch/include/sytorch/tensor.h @@ -8,7 +8,11 @@ #include #include #include - +#include +#include +#include +#include +#include typedef uint64_t u64; typedef uint8_t u8; typedef int64_t i64; @@ -163,7 +167,7 @@ class Tensor { void copy(const Tensor &other, bool copyGraph = true) { assert_same_shape(other); // memcpy(data, other.data, size() * sizeof(T)); - #pragma omp parallel for + //#pragma omp parallel for for(u64 i = 0; i < size(); ++i) { data[i] = other.data[i]; @@ -223,12 +227,12 @@ class Tensor { { std::cout << this->shape[i] << ", "; } - std::cout << ")" << std::endl; + std::cout << ")" << "\n"; for (u64 i = 0; i < size(); i++) { std::cout << data[i] << " "; } - std::cout << std::endl; + std::cout << "\n"; } void printshape() { @@ -236,7 +240,7 @@ class Tensor { for(int i = 0; i < this->shape.size(); i++) { std::cout << this->shape[i] << ", "; } - std::cout << ")" << std::endl; + std::cout << ")" << "\n"; } T multidir_broadcast_value(const std::vector &broadcast_shape, const std::vector &idx) const @@ -296,14 +300,23 @@ class Tensor { size_t size_in_bytes = std::filesystem::file_size(filename); always_assert(size_in_bytes == size() * 4); float *floatInput = new float[size()]; - std::ifstream file(filename, std::ios::binary); - file.read((char*) floatInput, size_in_bytes); - file.close(); + int buffersize; + // std::ifstream file(filename, std::ios::binary); + // file.read((char*) floatInput, size_in_bytes); + // file.close(); + int fd2 = open(filename.c_str(), O_RDWR | O_CREAT, 0); + struct stat sb; + fstat(fd2, &sb); + buffersize = sb.st_size; + int advise=posix_fadvise(fd2, 0, sb.st_size, POSIX_FADV_WILLNEED); + floatInput= (float*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd2, 0); for(u64 i = 0; i < size(); ++i) { data[i] = (T)(floatInput[i] * (1LL << scale)); } - delete[] floatInput; + ::close(fd2); + //delete[] floatInput; + munmap(floatInput, buffersize); } Tensor5D as_5d() @@ -590,4 +603,3 @@ class Tensor5D { } }; - diff --git a/sytorch/include/sytorch/utils.h b/sytorch/include/sytorch/utils.h index 57401a33..2c0d3b75 100644 --- a/sytorch/include/sytorch/utils.h +++ b/sytorch/include/sytorch/utils.h @@ -233,7 +233,7 @@ void convTranspose3dLoop( zPadWLeft = FW - 1 - zPadWLeft; zPadWRight = FW - 1 - zPadWRight; - #pragma omp parallel for collapse(5) + //#pragma omp parallel for collapse(5) for (int64_t n = 0; n < N; n++){ for (int64_t d = 0; d < outD; d++){ for (int64_t h = 0; h < outH; h++){ @@ -270,7 +270,7 @@ void convTranspose3dLoop( } } Arr5DIdx(outArr, N, outD, outH, outW, CO, n, d, h, w, co) = val; - // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << std::endl; + // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << "\n"; } } } @@ -363,7 +363,7 @@ void print(const Tensor &p, u64 scale, u64 bw) } 
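The `loadFloatWeights` and `load` changes above share one pattern: `open` the file, `posix_fadvise` the kernel to prefetch it, `mmap` it read-only, read through the mapping, and `munmap` when done. A self-contained sketch of that pattern follows (an illustration under our own naming, not code from the patch; the helper `map_float_file` is hypothetical):

```
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Map a binary file of floats read-only and hint the kernel to prefetch it.
// Returns nullptr on failure; on success the caller munmap()s the result.
static float *map_float_file(const char *path, size_t *size_out)
{
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return nullptr; }

    struct stat sb;
    if (fstat(fd, &sb) < 0) { perror("fstat"); close(fd); return nullptr; }

    // Readahead hint: we are about to stream through the whole file.
    posix_fadvise(fd, 0, sb.st_size, POSIX_FADV_WILLNEED);

    void *p = mmap(nullptr, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd); // the mapping keeps the file referenced after close
    if (p == MAP_FAILED) { perror("mmap"); return nullptr; }

    *size_out = sb.st_size;
    return static_cast<float *>(p);
}
```

Opening with `O_RDONLY` suffices for a `PROT_READ` mapping, and when the key or weights file sits on the tmpfs ramdisk the mapped pages are already resident, so reading through the mapping involves no disk I/O and no extra copy.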
diff --git a/sytorch/include/sytorch/utils.h b/sytorch/include/sytorch/utils.h
index 57401a33..2c0d3b75 100644
--- a/sytorch/include/sytorch/utils.h
+++ b/sytorch/include/sytorch/utils.h
@@ -233,7 +233,7 @@ void convTranspose3dLoop(
     zPadWLeft = FW - 1 - zPadWLeft;
     zPadWRight = FW - 1 - zPadWRight;
 
-    #pragma omp parallel for collapse(5)
+    //#pragma omp parallel for collapse(5)
     for (int64_t n = 0; n < N; n++){
         for (int64_t d = 0; d < outD; d++){
             for (int64_t h = 0; h < outH; h++){
@@ -270,7 +270,7 @@ void convTranspose3dLoop(
                     }
                 }
                 Arr5DIdx(outArr, N, outD, outH, outW, CO, n, d, h, w, co) = val;
-                // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << std::endl;
+                // std::cout << "setting element at (" << n << " " << d << " " << h << " " << w << " " << co << ")" << "\n";
             }
         }
     }
@@ -363,7 +363,7 @@ void print(const Tensor<T> &p, u64 scale, u64 bw)
         }
         std::cout << (double) val / (1LL << scale);
         if ((i + 1) % d == 0) {
-            std::cout << std::endl;
+            std::cout << "\n";
         }
         else {
             std::cout << " ";
         }
@@ -382,7 +382,7 @@ inline void printshape(const std::vector<u64> &shape) {
     for(int i = 0; i < shape.size(); i++) {
         std::cout << shape[i] << ", ";
     }
-    std::cout << ")" << std::endl;
+    std::cout << ")" << "\n";
 }
 
 inline void sytorch_init()
@@ -414,4 +414,4 @@ void qkv_split(Tensor2D<T> &x, Tensor4D<T> &y, u64 n_heads)
             y(2, head, i, pos) = x(i, j + 2 * n_embd);
         }
     }
-}
+}
\ No newline at end of file
diff --git a/sytorch/ramdrive.sh b/sytorch/ramdrive.sh
new file mode 100755
index 00000000..5e265d1a
--- /dev/null
+++ b/sytorch/ramdrive.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# usage: ./ramdrive.sh <size>, e.g. ./ramdrive.sh 20m
+if [ -z "$1" ]; then
+    echo "usage: $0 <ramdisk_size>" >&2
+    exit 1
+fi
+sudo mkdir -p /tmp/ramdisk
+sudo chmod 777 /tmp/ramdisk
+sudo mount -t tmpfs -o size="$1" myramdisk /tmp/ramdisk
+mount | tail -n 1
diff --git a/sytorch/unmount_ramdrive.sh b/sytorch/unmount_ramdrive.sh
new file mode 100755
index 00000000..7407cd9c
--- /dev/null
+++ b/sytorch/unmount_ramdrive.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+sudo umount /tmp/ramdisk/
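The toy example above leaves the script rewrite as a TODO ("a sed command to automate this rewrite is still to be added"). Assuming the party scripts end a line with the bare invocation `./lenet_LLAMA_15 <party>`, and that 4 threads plus the ramdisk flag are wanted, one possible in-place rewrite is the following (the client script name comes from the toy example; `server.sh` is an assumed name):

```
# append "4 true" (thread count, ramdisk flag) to each model invocation
sed -i 's|\(\./lenet_LLAMA_15 [123]\)$|\1 4 true|' server.sh client-online.sh
```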