Skip to content

Commit

Permalink
Makes RAMCloud fault tolerant when a server goes down
Browse files Browse the repository at this point in the history
If --replicas isn't explicitly set, it is assumed to be zero (i.e., no backups for servers). So we set it explicitly to 1 to achieve the fault-tolerant behavior described in the RAMCloud paper.

7 is the maximum number of containers that supports this behavior on a 401GB HDD with 16GB RAM, so we use that for the test. We can figure out how to support more containers later.
  • Loading branch information
OferMania committed Sep 6, 2019
1 parent 4b69dae commit ccd059e
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 19 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
RAMCloud
RAMCloud-install
.vscode
tmp
__pycache__/
*.py[cod]
4 changes: 3 additions & 1 deletion config/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ stderr_logfile_maxbytes=0

[program:ramcloud-coordinator]
command=/usr/local/bin/rc-coordinator --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --coordinator basic+udp:host=%(ENV_RC_IP)s,port=11111
autorestart=false
stdout_logfile=/dev/fd/1
stdout_logfile_maxbytes=0
stderr_logfile=/dev/fd/2
stderr_logfile_maxbytes=0

[program:ramcloud-server]
command=/usr/local/bin/rc-server --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --local basic+udp:host=%(ENV_RC_IP)s,port=11112
command=/usr/local/bin/rc-server --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --local basic+udp:host=%(ENV_RC_IP)s,port=11112 --replicas 1
autorestart=false
stdout_logfile=/dev/fd/1
stdout_logfile_maxbytes=0
stderr_logfile=/dev/fd/2
Expand Down
38 changes: 22 additions & 16 deletions testing/test_cluster.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import ramcloud
import os
import unittest
from pyexpect import expect
import cluster_test_utils as ctu
Expand Down Expand Up @@ -38,6 +39,23 @@ def make_cluster(self, num_nodes):
self.ramcloud_network)
self.rc_client.connect(external_storage, 'main')

def simple_recovery(self, kill_command):
    """Verify a value survives the loss of the rc-server that owns it.

    Spins up a 7-node cluster, writes the test value, kills the
    rc-server process hosting it using *kill_command*, then re-reads
    the value immediately — before recovery has had time to complete.
    """
    self.make_cluster(num_nodes=7)
    self.createTestValue()

    expected = ('testValue', 1)
    expect(self.rc_client.read(self.table, 'testKey')).equals(expected)

    # Locate the container whose rc-server holds our table and
    # 'testKey', then terminate that server process inside it.
    service_locator = self.rc_client.testing_get_service_locator(self.table, 'testKey')
    victim_host = ctu.get_host(service_locator)
    self.node_containers[victim_host].exec_run(kill_command)

    # Re-read without waiting for recovery; the replicated value
    # must still come back unchanged.
    expect(self.rc_client.read(self.table, 'testKey')).equals(expected)

def test_read_write(self):
self.make_cluster(num_nodes=3)
self.rc_client.create_table('test_table')
Expand All @@ -58,23 +76,11 @@ def test_two_writes(self):

expect(value).equals('Good weather')

@unittest.skip("trying stuff out")
def test_01_simple_recovery(self):
self.make_cluster(num_nodes=3) # num_nodes=8
self.createTestValue()
value = self.rc_client.read(self.table, 'testKey')
expect(value).equals(('testValue', 1))

# find the host corresponding to the server with our table and 'testKey',
# then kill it!
locator = self.rc_client.testing_get_service_locator(self.table, 'testKey')
host = ctu.get_host(locator)
self.node_containers[host].kill()
def test_01_simple_recovery_graceful_server_down(self):
    # SIGTERM gives rc-server the chance to shut down cleanly.
    self.simple_recovery(kill_command='killall -SIGTERM rc-server')

# read the value again (without waiting for the server to recover). It
# should come out to the same value
value = self.rc_client.read(self.table, 'testKey')
expect(value).equals(('testValue', 1))
def test_01_simple_recovery_forced_server_down(self):
    # SIGKILL simulates an abrupt crash — no cleanup is possible.
    self.simple_recovery(kill_command='killall -SIGKILL rc-server')

# Allow running this test module directly (e.g. `python test_cluster.py`).
if __name__ == '__main__':
    unittest.main()

0 comments on commit ccd059e

Please sign in to comment.