Skip to content

Commit

Permalink
Merge pull request #13 from OferMania/freeze
Browse files Browse the repository at this point in the history
Makes RAMCloud Fault Tolerant when a server goes down!!!
  • Loading branch information
OferMania authored Sep 6, 2019
2 parents 4b69dae + ccd059e commit adb9caf
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 19 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
RAMCloud
RAMCloud-install
.vscode
tmp
__pycache__/
*.py[cod]
4 changes: 3 additions & 1 deletion config/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ stderr_logfile_maxbytes=0

; RAMCloud coordinator process. The external storage location, cluster name
; and host part of the coordinator locator are expanded from the
; RC_EXTERNAL_STORAGE, RC_CLUSTER_NAME and RC_IP environment variables
; (supervisord %(ENV_x)s expansion).
[program:ramcloud-coordinator]
command=/usr/local/bin/rc-coordinator --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --coordinator basic+udp:host=%(ENV_RC_IP)s,port=11111
; Do not restart the process automatically when it exits.
autorestart=false
; Send the process's stdout/stderr to supervisord's own file descriptors 1
; and 2; maxbytes=0 turns off log rotation for these streams.
stdout_logfile=/dev/fd/1
stdout_logfile_maxbytes=0
stderr_logfile=/dev/fd/2
stderr_logfile_maxbytes=0

[program:ramcloud-server]
command=/usr/local/bin/rc-server --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --local basic+udp:host=%(ENV_RC_IP)s,port=11112
command=/usr/local/bin/rc-server --externalStorage %(ENV_RC_EXTERNAL_STORAGE)s --clusterName %(ENV_RC_CLUSTER_NAME)s --local basic+udp:host=%(ENV_RC_IP)s,port=11112 --replicas 1
autorestart=false
stdout_logfile=/dev/fd/1
stdout_logfile_maxbytes=0
stderr_logfile=/dev/fd/2
Expand Down
38 changes: 22 additions & 16 deletions testing/test_cluster.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import ramcloud
import os
import unittest
from pyexpect import expect
import cluster_test_utils as ctu
Expand Down Expand Up @@ -38,6 +39,23 @@ def make_cluster(self, num_nodes):
self.ramcloud_network)
self.rc_client.connect(external_storage, 'main')

def simple_recovery(self, kill_command):
    """Check that 'testKey' stays readable after its rc-server is taken down.

    Builds a 7-node cluster, calls createTestValue(), confirms the value
    reads back as ('testValue', 1), runs ``kill_command`` on the node that
    the service locator for 'testKey' resolves to, and then reads the value
    again without waiting for the server to recover.

    Args:
        kill_command: command run on the owning node to bring rc-server
            down, e.g. 'killall -SIGKILL rc-server'.
    """
    self.make_cluster(num_nodes=7)
    self.createTestValue()
    value = self.rc_client.read(self.table, 'testKey')
    expect(value).equals(('testValue', 1))

    # find the host corresponding to the server with our table and 'testKey',
    # then kill its rc-server!
    locator = self.rc_client.testing_get_service_locator(self.table, 'testKey')
    host = ctu.get_host(locator)
    kill_result = self.node_containers[host].exec_run(kill_command)
    # If the kill command itself failed, no server went down and the read
    # below would succeed without exercising recovery at all, so fail here.
    # NOTE(review): assumes node_containers holds Docker SDK containers whose
    # exec_run() returns an ExecResult with an exit_code field -- confirm.
    expect(kill_result.exit_code).equals(0)

    # read the value again (without waiting for the server to recover). It
    # should come out to the same value
    value = self.rc_client.read(self.table, 'testKey')
    expect(value).equals(('testValue', 1))

def test_read_write(self):
self.make_cluster(num_nodes=3)
self.rc_client.create_table('test_table')
Expand All @@ -58,23 +76,11 @@ def test_two_writes(self):

expect(value).equals('Good weather')

@unittest.skip("trying stuff out")
def test_01_simple_recovery(self):
self.make_cluster(num_nodes=3) # num_nodes=8
self.createTestValue()
value = self.rc_client.read(self.table, 'testKey')
expect(value).equals(('testValue', 1))

# find the host corresponding to the server with our table and 'testKey',
# then kill it!
locator = self.rc_client.testing_get_service_locator(self.table, 'testKey')
host = ctu.get_host(locator)
self.node_containers[host].kill()
def test_01_simple_recovery_graceful_server_down(self):
    """Run the simple recovery scenario, stopping rc-server with SIGTERM."""
    sigterm_kill = 'killall -SIGTERM rc-server'
    self.simple_recovery(kill_command=sigterm_kill)

# read the value again (without waiting for the server to recover). It
# should come out to the same value
value = self.rc_client.read(self.table, 'testKey')
expect(value).equals(('testValue', 1))
def test_01_simple_recovery_forced_server_down(self):
    """Run the simple recovery scenario, stopping rc-server with SIGKILL."""
    sigkill_kill = 'killall -SIGKILL rc-server'
    self.simple_recovery(kill_command=sigkill_kill)

if __name__ == '__main__':
unittest.main()

0 comments on commit adb9caf

Please sign in to comment.