From ccd059ea86f9abfcc8a6f0cbe00aba2fa565727a Mon Sep 17 00:00:00 2001 From: Ofer Gill Date: Thu, 5 Sep 2019 14:38:15 -0600 Subject: [PATCH] Makes RAMCloud Fault Tolerant when a server goes down!!! If replicas isn't explicitly set, it's assumed to be zero (i.e. no backups for servers???), so this sets it explicitly to 1 to achieve the fault-tolerant behavior described in the RAMCloud paper. 7 is the max # of containers that supports this behavior on a 401GB HDD with 16GB RAM, so the test uses that. We can figure out how to support more containers later. --- .gitignore | 2 -- config/supervisord.conf | 4 +++- testing/test_cluster.py | 38 ++++++++++++++++++++++---------------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index f6d5382..22a7f25 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ RAMCloud RAMCloud-install -.vscode -tmp __pycache__/ *.py[cod] diff --git a/config/supervisord.conf b/config/supervisord.conf index 66f7cf1..f21b55f 100644 --- a/config/supervisord.conf +++ b/config/supervisord.conf @@ -10,13 +10,15 @@ stderr_logfile_maxbytes=0 [program:ramcloud-coordinator] command=/usr/local/bin/rc-coordinator --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --coordinator basic+udp:host=%(ENV_RC_IP)s,port=11111 +autorestart=false stdout_logfile=/dev/fd/1 stdout_logfile_maxbytes=0 stderr_logfile=/dev/fd/2 stderr_logfile_maxbytes=0 [program:ramcloud-server] -command=/usr/local/bin/rc-server --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --local basic+udp:host=%(ENV_RC_IP)s,port=11112 +command=/usr/local/bin/rc-server --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --local basic+udp:host=%(ENV_RC_IP)s,port=11112 --replicas 1 +autorestart=false stdout_logfile=/dev/fd/1 stdout_logfile_maxbytes=0 stderr_logfile=/dev/fd/2 diff --git a/testing/test_cluster.py b/testing/test_cluster.py index 5250261..80f00f7 
100644 --- a/testing/test_cluster.py +++ b/testing/test_cluster.py @@ -1,4 +1,5 @@ import ramcloud +import os import unittest from pyexpect import expect import cluster_test_utils as ctu @@ -38,6 +39,23 @@ def make_cluster(self, num_nodes): self.ramcloud_network) self.rc_client.connect(external_storage, 'main') + def simple_recovery(self, kill_command): + self.make_cluster(num_nodes=7) + self.createTestValue() + value = self.rc_client.read(self.table, 'testKey') + expect(value).equals(('testValue', 1)) + + # find the host corresponding to the server with our table and 'testKey', + # then kill its rc-server! + locator = self.rc_client.testing_get_service_locator(self.table, 'testKey') + host = ctu.get_host(locator) + self.node_containers[host].exec_run(kill_command) + + # read the value again (without waiting for the server to recover). It + # should come out to the same value + value = self.rc_client.read(self.table, 'testKey') + expect(value).equals(('testValue', 1)) + def test_read_write(self): self.make_cluster(num_nodes=3) self.rc_client.create_table('test_table') @@ -58,23 +76,11 @@ def test_two_writes(self): expect(value).equals('Good weather') - @unittest.skip("trying stuff out") - def test_01_simple_recovery(self): - self.make_cluster(num_nodes=3) # num_nodes=8 - self.createTestValue() - value = self.rc_client.read(self.table, 'testKey') - expect(value).equals(('testValue', 1)) - - # find the host corresponding to the server with our table and 'testKey', - # then kill it! - locator = self.rc_client.testing_get_service_locator(self.table, 'testKey') - host = ctu.get_host(locator) - self.node_containers[host].kill() + def test_01_simple_recovery_graceful_server_down(self): + self.simple_recovery(kill_command = 'killall -SIGTERM rc-server') - # read the value again (without waiting for the server to recover). 
It - # should come out to the same value - value = self.rc_client.read(self.table, 'testKey') - expect(value).equals(('testValue', 1)) + def test_01_simple_recovery_forced_server_down(self): + self.simple_recovery(kill_command = 'killall -SIGKILL rc-server') if __name__ == '__main__': unittest.main()