From 92b77d0ba8b91c96be10ebb9c2b8aa22806f7037 Mon Sep 17 00:00:00 2001 From: Mykola Yurchenko Date: Wed, 15 Nov 2017 20:48:46 -0500 Subject: [PATCH] Adds Load Balancer NF (#227) This adds a new NF to our collection, the Load Balancer NF. The Load Balancer NF is an L3, round-robin load balancer that distributes network traffic across multiple backend servers. This NF takes advantage of the ONVM NFLib Flow Table to maintain a list of network flows. Using the flow table and the source NIC port of a packet, the Load Balancer NF is able to decide how to route it: either load balancer to backend server or vice versa. If incoming traffic does not have a pre-existing entry in the flow table, one will be created and subsequent traffic in that flow will be routed accordingly. For detailed information about using the NF and how it works, please read the corresponding README in the NF directory. The Load Balancer NF performs advanced operation on the network traffic to determine how to route them. With this requirement, we add several new APIs to the OpenNetVM NFLib to aid NF development. New NFLib APIs: * `int onvm_pkt_parse_ip(char* ip_str, uint32_t* dest)` - Parses IP Addresses in dotted decimal form (192.168.1.1) to decimal form (3232235777) * `int onvm_pkt_parse_mac(char * mac_str, uint8_t* dest)` - Parses MAC Addresses in string form (11:22:33:44:55:66) to an array of six octets * `uint32 onvm_pkt_get_checksum_offload_flags(uint8_t port_id)` - Determine if the NIC port supports checksum offloading * `void onvm_pkt_set_checksums(struct rte_mbuf* pkt)` - Recalculate and set packet checksums * `static uint16_t calculate_tcpudp_cksum(const struct ipv4_hdr *ip, const void *l4_hdr, const uint32_t l3_len, uint8_t protocol)` - Calculates the TCP or UDP checksum based on the provided headers * `static uint16_t calculate_ip_cksum(const struct ipv4_hdr *ip, const uint32_t l3_len)` - Calculates the IP checksum based on the provided headers * `#define ONVM_PKT_GET_FLAGS(tcp, flags)` - Gets the TCP flags from the provided header * `#define ONVM_PKT_SET_FLAGS(tcp, flags)` - Sets the TCP flags of the provided header Commit log: * Initial commit for load_balancer NF - Layer 3 round-robin load balancer - Using onvm_flow_table to keep track of flows - Using tcp_helper for pkt checksum recalculation - Config values are currently hard coded in the .c file - Still has testing values and debug prints * Added basic config file parsing - Config file has Number of entries -- Each entry has ip and mac or backend server * Removed debug printing, added some comments * Added config filename parsing, more error checks * Better code structure, style fixes * Remove hard coded mac for client * Added client/server iface name args - Added parsing of the interface information, mathing interface to onvm ports id - Multiple name changes - This removes hard coded port values * Minor updates * README, config example * Clearing up flow entries after expiration timeout - This mimics the flow_tracker NF logic of terminatin old flows - Old flow termination happens when flow table gets full, should also be cleaned up on some timeout * Change flow_table returns, fixed mac iface id bug - Change return of table_lookup to int value, flow_info is now passed as an argument - Fixed mac iface mac comparison with onvm_mgr - Fixed wrong client/server port id refernces in packet_handler * Added arp table print * Minor printing changes * Moved to a no-modification client ip style - This allows the load balancer to handle multiple clients - Now instead of modifying the client ip to send it to the backend server we add a routing rule to the backend server - Remove hard coded port switching with !port * Documentation for the NF * Moved global variables to loadbalance struct * Moved checksum helper function to onvm_pkt_helper * Change year in Makefile * Requested changes: macro functions, readme changes * Removed unused pkt_copy func * Parse ip/mac functions, change macro functions --- examples/Makefile | 2 +- examples/load_balancer/.gitignore | 2 + examples/load_balancer/Makefile | 68 +++ examples/load_balancer/README.md | 48 ++ examples/load_balancer/go.sh | 34 ++ examples/load_balancer/load_balancer.c | 593 +++++++++++++++++++++++++ examples/load_balancer/server.conf | 3 + onvm/onvm_nflib/onvm_pkt_helper.c | 154 +++++++ onvm/onvm_nflib/onvm_pkt_helper.h | 52 +++ 9 files changed, 955 insertions(+), 1 deletion(-) create mode 100644 examples/load_balancer/.gitignore create mode 100644 examples/load_balancer/Makefile create mode 100644 examples/load_balancer/README.md create mode 100755 examples/load_balancer/go.sh create mode 100644 examples/load_balancer/load_balancer.c create mode 100644 examples/load_balancer/server.conf diff --git a/examples/Makefile b/examples/Makefile index 4199d2cf7..30eab1346 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -40,7 +40,7 @@ $(error "Please define RTE_SDK environment variable") endif # To add new examples, append the directory name to this variable -examples = bridge basic_monitor simple_forward speed_tester flow_table test_flow_dir aes_encrypt aes_decrypt flow_tracker +examples = bridge basic_monitor simple_forward speed_tester flow_table test_flow_dir aes_encrypt aes_decrypt flow_tracker load_balancer ifeq ($(NDPI_HOME),) $(warning "Skipping ndpi_stats NF as NDPI_HOME is not set") diff --git a/examples/load_balancer/.gitignore b/examples/load_balancer/.gitignore new file mode 100644 index 000000000..fdb076eb8 --- /dev/null +++ b/examples/load_balancer/.gitignore @@ -0,0 +1,2 @@ +load_balancer/ +build/ diff --git a/examples/load_balancer/Makefile b/examples/load_balancer/Makefile new file mode 100644 index 000000000..ee9d82d9a --- /dev/null +++ b/examples/load_balancer/Makefile @@ -0,0 +1,68 @@ +# openNetVM +# https://github.com/sdnfv/openNetVM +# +# BSD LICENSE +# +# Copyright(c) +# 2015-2017 George Washington University +# 2015-2017 University of California Riverside +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# The name of the author may not be used to endorse or promote +# products derived from this software without specific prior +# written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +RTE_TARGET ?= x86_64-native-linuxapp-gcc + +# Default target, can be overriden by command line or environment +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +APP = load_balancer + +# all source are stored in SRCS-y +SRCS-y := load_balancer.c + +# OpenNetVM path +ONVM= $(SRCDIR)/../../onvm + +#CFLAGS += $(USER_FLAGS) +CFLAGS += $(WERROR_FLAGS) -O3 $(USER_FLAGS) + +CFLAGS += -I$(ONVM)/onvm_nflib +CFLAGS += -I$(ONVM)/onvm_mgr +LDFLAGS += $(ONVM)/onvm_nflib/onvm_nflib/$(RTE_TARGET)/libonvm.a + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_main.o += -Wno-return-type +endif + +include $(RTE_SDK)/mk/rte.extapp.mk diff --git a/examples/load_balancer/README.md b/examples/load_balancer/README.md new file mode 100644 index 000000000..87bf44acd --- /dev/null +++ b/examples/load_balancer/README.md @@ -0,0 +1,48 @@ +Load Balancer NF +== + +This NF acts as a layer 3, round-robin load balancer. When a packet arrives the NF checks whether it is from an already existing flow. If not, it creates a new flow entry and assigns it to the destination backend server. The NF decides what to do with the packet based on which port it arrived at. It is also setup to clean the flow table whenever it fills up. + +App Specific Instuctions +-- +**Setting up dpdk interfaces** +This NF requires 2 DPDK interfaces to work, both can be setup using the `openNetVM/dpdk/tools/dpdk-setup-iface.sh` script. + +**Server Config** +The server config needs to have the total number of backend servers with their ip and mac address combination, an example config file `server.conf` is provided. + +**Server Configuration** +The backend servers need to be configured to forward traffic back to the load balancer, this can be done using ip routes on the server machine. +An example usage for LB server port at 10.0.0.37 with clients matching 11.0.0.0/24 using iface p2p1. +```sudo ip route add 11.0.0.0/24 via 10.0.0.37 dev p2p1``` + +**This NF should be run with the ARP NF** +The Load Balancer NF needs to respond to client and server ARP requests. As `onvm_mgr` currently doesn't resolve arp requests, an ARP NF with the LB NF as destination is used. + +An example usage of the ARP&LB NF, with the Load Balancer using dpdk0 - 10.0.0.37 and dpdk1 - 11.0.0.37 for client, server ports respecively. +``` +ARP NF +./go.sh 4 1 2 -s 10.0.0.37,11.0.0.37 + +LB NF +./go.sh 5 2 dpdk0 dpdk1 server.conf +``` + + +Compilation and Execution +-- +``` +make +./go.sh CORELIST SERVICE_ID CLIENT_IFACE SERVER_IFACE SERVER_CONFIG [PRINT_DELAY] + +OR + +sudo ./load_balancer/x86_64-native-linuxapp-gcc/forward -l CORELIST -n 3 --proc-type=secondary -- -r SERVICE_ID -- -c CLIENT_IFACE -s SERVER_IFACE -f SERVER_CONFIG [-p PRINT_DELAY] +``` + +App Specific Arguments +-- + - `CLIENT_IFACE` : name of the client interface + - `SERVER_IFACE` : name of the server interface + - `SERVER_CONFIG` : backend server config file + - `-p `: number of packets between each print, e.g. `-p 1` prints every packets. diff --git a/examples/load_balancer/go.sh b/examples/load_balancer/go.sh new file mode 100755 index 000000000..dd22fd30d --- /dev/null +++ b/examples/load_balancer/go.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +function usage { + echo "$0 CPU-LIST SERVICE-ID CLIENT_IFACE SERVER_IFACE SERVER_CONFIG [-p PRINT] [-n NF-ID]" + echo "$0 5 1 dpdk0 dpdk1 example.conf --> core 5, with Service ID 1, dpdk0 as Client interface, dpdk1 as Server interface, example.conf config" + echo "$0 3,7,9 1 server.conf 1000 --> cores 3,7, and 9, with Service ID 1, server.conf as backend conf, and Print Rate of 1000" + exit 1 +} + +SCRIPT=$(readlink -f "$0") +SCRIPTPATH=$(dirname "$SCRIPT") +cpu=$1 +service=$2 +client=$3 +server=$4 +config=$5 + +shift 5 + +if [ -z $config ] +then + usage +fi + +while getopts ":p:n:" opt; do + case $opt in + p) print="-p $OPTARG";; + n) instance="-n $OPTARG";; + \?) echo "Unknown option -$OPTARG" && usage + ;; + esac +done + +exec sudo $SCRIPTPATH/build/load_balancer -l $cpu -n 3 --proc-type=secondary -- -r $service $instance -- -c $client -s $server -f $config $print diff --git a/examples/load_balancer/load_balancer.c b/examples/load_balancer/load_balancer.c new file mode 100644 index 000000000..3c7c5aacd --- /dev/null +++ b/examples/load_balancer/load_balancer.c @@ -0,0 +1,593 @@ +/********************************************************************* + * openNetVM + * https://sdnfv.github.io + * + * BSD LICENSE + * + * Copyright(c) + * 2015-2017 George Washington University + * 2015-2017 University of California Riverside + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * load_balancer.c - an example Layer 3 round-robin load balancer. + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "onvm_nflib.h" +#include "onvm_pkt_helper.h" +#include "onvm_flow_table.h" + +#define NF_TAG "load_balancer" +#define TABLE_SIZE 65536 + +/* Struct for load balancer information */ +struct loadbalance { + struct onvm_ft *ft; + + /* backend server information */ + uint8_t server_count; + struct backend_server *server; + + /* for cleaning up connections */ + uint16_t num_stored; + uint64_t elapsed_cycles; + uint64_t last_cycles; + uint32_t expire_time; + + /* port and ip values */ + uint32_t ip_lb_server; + uint32_t ip_lb_client; + uint8_t server_port; + uint8_t client_port; + + /* config file, interface names */ + char * cfg_filename; + char * client_iface_name; + char * server_iface_name; +}; + +/* Struct for backend servers */ +struct backend_server { + uint8_t d_addr_bytes[ETHER_ADDR_LEN]; + uint32_t d_ip; +}; + +/* Struct for flow info */ +struct flow_info { + uint8_t dest; + uint8_t s_addr_bytes[ETHER_ADDR_LEN]; + uint64_t last_pkt_cycles; + int is_active; +}; + +/* Struct that contains information about this NF */ +struct onvm_nf_info *nf_info; + +struct loadbalance *lb; +/* number of package between each print */ +static uint32_t print_delay = 1000000; + +/* onvm struct for port info lookup */ +extern struct port_info *ports; + +/* + * Print a usage message + */ +static void +usage(const char *progname) { + printf("Usage: %s [EAL args] -- [NF_LIB args] -- client_iface server_iface server_config -p \n\n", progname); +} + +/* + * Parse the application arguments. + */ +static int +parse_app_args(int argc, char *argv[], const char *progname) { + int c; + + lb->cfg_filename = NULL; + lb->client_iface_name = NULL; + lb->server_iface_name = NULL; + + while ((c = getopt(argc, argv, "c:s:f:p:")) != -1) { + switch (c) { + case 'c': + lb->client_iface_name = strdup(optarg); + break; + case 's': + lb->server_iface_name = strdup(optarg); + break; + case 'f': + lb->cfg_filename = strdup(optarg); + break; + case 'p': + print_delay = strtoul(optarg, NULL, 10); + break; + case '?': + usage(progname); + if (optopt == 'd') + RTE_LOG(INFO, APP, "Option -%c requires an argument.\n", optopt); + else if (optopt == 'p') + RTE_LOG(INFO, APP, "Option -%c requires an argument.\n", optopt); + else if (isprint(optopt)) + RTE_LOG(INFO, APP, "Unknown option `-%c'.\n", optopt); + else + RTE_LOG(INFO, APP, "Unknown option character `\\x%x'.\n", optopt); + return -1; + default: + usage(progname); + return -1; + } + } + + if (!lb->cfg_filename) { + RTE_LOG(INFO, APP, "Load balancer NF requires a backend server config file.\n"); + return -1; + + if (!lb->client_iface_name) { + RTE_LOG(INFO, APP, "Load balancer NF requires a client interface name.\n"); + return -1; + } + if (!lb->server_iface_name) { + RTE_LOG(INFO, APP, "Load balancer NF requires a backend server interface name.\n"); + return -1; + } } + + return optind; +} + +/* + * This function parses the backend config. It takes the filename + * and fills up the backend_server array. This includes the mac and ip + * address of the backend servers + */ +static int +parse_backend_config(void) { + int ret, temp, i; + char ip[32]; + char mac[32]; + FILE * cfg; + + cfg = fopen(lb->cfg_filename, "r"); + if (cfg == NULL) { + rte_exit(EXIT_FAILURE, "Error openning server \'%s\' config\n", lb->cfg_filename); + } + ret = fscanf(cfg, "%*s %d", &temp); + if (temp <= 0) { + rte_exit(EXIT_FAILURE, "Error parsing config, need at least one server configurations\n"); + } + lb->server_count = temp; + + lb->server = (struct backend_server *)rte_malloc("backend server info", sizeof(struct backend_server) * lb->server_count, 0); + if (lb->server == NULL) { + rte_exit(EXIT_FAILURE, "Malloc failed, can't allocate server information\n"); + } + + for (i = 0; i < lb->server_count; i++) { + ret = fscanf(cfg, "%s %s", ip, mac); + if (ret != 2) { + rte_exit(EXIT_FAILURE, "Invalid backend config structure\n"); + } + + ret = onvm_pkt_parse_ip(ip, &lb->server[i].d_ip); + if (ret < 0) { + rte_exit(EXIT_FAILURE, "Error parsing config IP address #%d\n", i); + } + + ret =onvm_pkt_parse_mac(mac, lb->server[i].d_addr_bytes); + if (ret < 0) { + rte_exit(EXIT_FAILURE, "Error parsing config MAC address #%d\n", i); + } + } + + fclose(cfg); + printf("\nARP config:\n"); + for (i = 0; i < lb->server_count; i++) { + printf("%" PRIu8 ".%" PRIu8 ".%" PRIu8 ".%" PRIu8 " ", + lb->server[i].d_ip & 0xFF, (lb->server[i].d_ip >> 8) & 0xFF, (lb->server[i].d_ip >> 16) & 0xFF, (lb->server[i].d_ip >> 24) & 0xFF); + printf("%02x:%02x:%02x:%02x:%02x:%02x\n", + lb->server[i].d_addr_bytes[0], lb->server[i].d_addr_bytes[1], + lb->server[i].d_addr_bytes[2], lb->server[i].d_addr_bytes[3], + lb->server[i].d_addr_bytes[4], lb->server[i].d_addr_bytes[5]); + } + + return ret; +} + +/* + * This function displays stats. It uses ANSI terminal codes to clear + * screen when called. It is called from a single non-master + * thread in the server process, when the process is run with more + * than one lcore enabled. + */ +static void +do_stats_display(struct rte_mbuf* pkt) { + const char clr[] = { 27, '[', '2', 'J', '\0' }; + const char topLeft[] = { 27, '[', '1', ';', '1', 'H', '\0' }; + static uint64_t pkt_process = 0; + struct ipv4_hdr* ip; + + pkt_process += print_delay; + + /* Clear screen and move to top left */ + printf("%s%s", clr, topLeft); + + printf("PACKETS\n"); + printf("-----\n"); + printf("Port : %d\n", pkt->port); + printf("Size : %d\n", pkt->pkt_len); + printf("N° : %"PRIu64"\n", pkt_process); + printf("\n\n"); + + ip = onvm_pkt_ipv4_hdr(pkt); + if (ip != NULL) { + onvm_pkt_print(pkt); + } else { + printf("No IP4 header found\n"); + } +} + +/* + * Print flow information + */ +static void +print_flow_info(struct flow_info* f) { + printf("Flow INFO\n"); + printf("Destination server: %d\n", f->dest); + printf("Source mac %02x:%02x:%02x:%02x:%02x:%02x\n", + f->s_addr_bytes[0], f->s_addr_bytes[1], + f->s_addr_bytes[2], f->s_addr_bytes[3], + f->s_addr_bytes[4], f->s_addr_bytes[5]); +} + +/* + * Parse and assign load balancer server/client interface information + */ +static void +get_iface_inf(void) { + int fd, i; + struct ifreq ifr; + uint8_t client_addr_bytes[ETHER_ADDR_LEN]; + uint8_t server_addr_bytes[ETHER_ADDR_LEN]; + + fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + ifr.ifr_addr.sa_family = AF_INET; + + /* Parse server interface */ + strncpy(ifr.ifr_name, lb->server_iface_name, IFNAMSIZ-1); + + ioctl(fd, SIOCGIFADDR, &ifr); + lb->ip_lb_server = *(uint32_t *)(&((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr); + + ioctl(fd, SIOCGIFHWADDR, &ifr); + for (i = 0; i < ETHER_ADDR_LEN; i++) + server_addr_bytes[i] = ifr.ifr_hwaddr.sa_data[i]; + + /* Parse client interface */ + strncpy(ifr.ifr_name, lb->client_iface_name, IFNAMSIZ-1); + + ioctl(fd, SIOCGIFADDR, &ifr); + lb->ip_lb_client = *(uint32_t *)(&((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr); + + ioctl(fd, SIOCGIFHWADDR, &ifr); + for (i = 0; i < ETHER_ADDR_LEN; i++) + client_addr_bytes[i] = ifr.ifr_hwaddr.sa_data[i]; + + /* Compare the interfaces to onvm_mgr ports by hwaddr and assign port id accordingly */ + if (memcmp(&client_addr_bytes, &ports->mac[0], ETHER_ADDR_LEN) == 0) { + lb->client_port = ports->id[0]; + lb->server_port = ports->id[1]; + } else { + lb->client_port = ports->id[1]; + lb->server_port = ports->id[0]; + } + + close(fd); + + printf("\nLoad balancer interfaces:\n"); + printf("Client iface \'%s\' ID: %d, IP: %" PRIu32 " (%" PRIu8 ".%" PRIu8 ".%" PRIu8 ".%" PRIu8 "), ", + lb->client_iface_name, lb->client_port, lb->ip_lb_client, + lb->ip_lb_client & 0xFF, (lb->ip_lb_client >> 8) & 0xFF, (lb->ip_lb_client >> 16) & 0xFF, (lb->ip_lb_client >> 24) & 0xFF); + printf("MAC: %02x:%02x:%02x:%02x:%02x:%02x\n", + client_addr_bytes[0], client_addr_bytes[1], + client_addr_bytes[2], client_addr_bytes[3], + client_addr_bytes[4], client_addr_bytes[5]); + printf("Server iface \'%s\' ID: %d, IP: %" PRIu32 " (%" PRIu8 ".%" PRIu8 ".%" PRIu8 ".%" PRIu8 "), ", + lb->server_iface_name, lb->server_port, lb->ip_lb_server, + lb->ip_lb_server & 0xFF, (lb->ip_lb_server >> 8) & 0xFF, (lb->ip_lb_server >> 16) & 0xFF, (lb->ip_lb_server >> 24) & 0xFF); + printf("MAC: %02x:%02x:%02x:%02x:%02x:%02x\n", + server_addr_bytes[0], server_addr_bytes[1], + server_addr_bytes[2], server_addr_bytes[3], + server_addr_bytes[4], server_addr_bytes[5]); +} + +/* + * Updates flow info to be "active" or "expired" + */ +static int +update_status(uint64_t elapsed_cycles, struct flow_info *data) { + if (unlikely(data == NULL)) { + return -1; + } + if ((elapsed_cycles - data->last_pkt_cycles) / rte_get_timer_hz() >= lb->expire_time) { + data->is_active = 0; + } else { + data->is_active = 1; + } + + return 0; +} + +/* + * Clears expired entries from the flow table + */ +static int +clear_entries(void) { + if (unlikely(lb == NULL)) { + return -1; + } + + printf("Clearing expired entries\n"); + struct flow_info *data = NULL; + struct onvm_ft_ipv4_5tuple *key = NULL; + uint32_t next = 0; + int ret = 0; + + while (onvm_ft_iterate(lb->ft, (const void **)&key, (void **)&data, &next) > -1) { + if (update_status(lb->elapsed_cycles, data) < 0) { + return -1; + } + + if (!data->is_active) { + ret = onvm_ft_remove_key(lb->ft, key); + lb->num_stored--; + if (ret < 0) { + printf("Key should have been removed, but was not\n"); + lb->num_stored++; + } + } + } + + return 0; +} + +/* + * Adds an entry to the flow table. It first checks if the table is full, and + * if so, it calls clear_entries() to free up space. + */ +static int +table_add_entry(struct onvm_ft_ipv4_5tuple* key, struct flow_info **flow) { + struct flow_info *data = NULL; + + if (unlikely(key == NULL || lb == NULL)) { + return -1; + } + + if (TABLE_SIZE - 1 - lb->num_stored == 0) { + int ret = clear_entries(); + if (ret < 0) { + return -1; + } + } + + int tbl_index = onvm_ft_add_key(lb->ft, key, (char **)&data); + if (tbl_index < 0) { + return -1; + } + + lb->num_stored++; + data->dest = lb->num_stored % lb->server_count; + data->last_pkt_cycles = lb->elapsed_cycles; + data->is_active = 0; + + *flow = data; + + return 0; +} + +/* + * Looks up a packet hash to see if there is a matching key in the table. + * If it finds one, it updates the metadata associated with the key entry, + * and if it doesn't, it calls table_add_entry() to add it to the table. + */ +static int +table_lookup_entry(struct rte_mbuf* pkt, struct flow_info **flow) { + struct flow_info *data = NULL; + struct onvm_ft_ipv4_5tuple key; + + if (unlikely(pkt == NULL || lb == NULL || flow == NULL)) { + return -1; + } + + int ret = onvm_ft_fill_key_symmetric(&key, pkt); + if (ret < 0) + return -1; + + int tbl_index = onvm_ft_lookup_key(lb->ft, &key, (char **)&data); + if (tbl_index == -ENOENT) { + return table_add_entry(&key, flow); + } else if (tbl_index < 0) { + printf("Some other error occurred with the packet hashing\n"); + return -1; + } else { + data->last_pkt_cycles = lb->elapsed_cycles; + *flow = data; + return 0; + } +} + +static int +callback_handler(void) { + lb->elapsed_cycles = rte_get_tsc_cycles(); + + if ((lb->elapsed_cycles - lb->last_cycles) / rte_get_timer_hz() > lb->expire_time) { + lb->last_cycles = lb->elapsed_cycles; + } + + return 0; +} + +static int +packet_handler(struct rte_mbuf* pkt, struct onvm_pkt_meta* meta) { + static uint32_t counter = 0; + struct ipv4_hdr* ip; + struct ether_hdr *ehdr; + struct flow_info *flow_info; + int i, ret; + + ehdr = onvm_pkt_ether_hdr(pkt); + ip = onvm_pkt_ipv4_hdr(pkt); + + /* Ignore packets without ip header, also ignore packets with invalid ip */ + if (ip == NULL || ip->src_addr == 0 || ip->dst_addr == 0) { + meta->action = ONVM_NF_ACTION_DROP; + meta->destination = 0; + return 0; + } + + /* + * Before hashing remove the Load Balancer ip from the pkt so that both + * connections from client -> lbr and lbr <- server + * will have the same hash + */ + if (pkt->port == lb->client_port) { + ip->dst_addr = 0; + } else { + ip->src_addr = 0; + } + + /* Get the packet flow entry */ + ret = table_lookup_entry(pkt, &flow_info); + if (ret == -1) { + meta->action = ONVM_NF_ACTION_DROP; + meta->destination = 0; + return 0; + } + + /* If the flow entry is new, save the client information */ + if (flow_info->is_active == 0) { + flow_info->is_active = 1; + for (i = 0; i < ETHER_ADDR_LEN; i++) { + flow_info->s_addr_bytes[i] = ehdr->s_addr.addr_bytes[i]; + } + } + + if (pkt->port == lb->server_port) { + rte_eth_macaddr_get(lb->client_port, &ehdr->s_addr); + for (i = 0; i < ETHER_ADDR_LEN; i++) { + ehdr->d_addr.addr_bytes[i] = flow_info->s_addr_bytes[i]; + } + + ip->src_addr = lb->ip_lb_client; + meta->destination = lb->client_port; + } else { + for (i = 0; i < ETHER_ADDR_LEN; i++) { + ehdr->d_addr.addr_bytes[i] = lb->server[flow_info->dest].d_addr_bytes[i]; + } + + ip->dst_addr = lb->server[flow_info->dest].d_ip; + meta->destination = lb->server_port; + } + + /* Changing the pkt ip header so we want to recalculate pkt checksums */ + onvm_pkt_set_checksums(pkt); + + meta->action = ONVM_NF_ACTION_OUT; + + if (++counter == print_delay) { + do_stats_display(pkt); + print_flow_info(flow_info); + counter = 0; + } + + return 0; +} + +int main(int argc, char *argv[]) { + int arg_offset; + const char *progname = argv[0]; + + if ((arg_offset = onvm_nflib_init(argc, argv, NF_TAG)) < 0) + return -1; + argc -= arg_offset; + argv += arg_offset; + + lb = rte_calloc("state", 1, sizeof(struct loadbalance), 0); + if (lb == NULL) { + onvm_nflib_stop(); + rte_exit(EXIT_FAILURE, "Unable to initialize NF lb struct"); + } + + if (parse_app_args(argc, argv, progname) < 0) + rte_exit(EXIT_FAILURE, "Invalid command-line arguments\n"); + + lb->ft = onvm_ft_create(TABLE_SIZE, sizeof(struct flow_info)); + if (lb->ft == NULL) { + onvm_nflib_stop(); + rte_exit(EXIT_FAILURE, "Unable to create flow table"); + } + + get_iface_inf(); + parse_backend_config(); + + lb->expire_time = 32; + lb->elapsed_cycles = rte_get_tsc_cycles(); + + onvm_nflib_run_callback(nf_info, &packet_handler, &callback_handler); + printf("If we reach here, program is ending\n"); + + return 0; +} diff --git a/examples/load_balancer/server.conf b/examples/load_balancer/server.conf new file mode 100644 index 000000000..c3ed82a38 --- /dev/null +++ b/examples/load_balancer/server.conf @@ -0,0 +1,3 @@ +LIST_SIZE 2 +10.0.0.17 90:e2:ba:5e:73:20 +10.0.0.36 00:1b:21:85:6a:d4 diff --git a/onvm/onvm_nflib/onvm_pkt_helper.c b/onvm/onvm_nflib/onvm_pkt_helper.c index af55a4a71..a73114429 100644 --- a/onvm/onvm_nflib/onvm_pkt_helper.c +++ b/onvm/onvm_nflib/onvm_pkt_helper.c @@ -52,6 +52,9 @@ #include #include +#include +#include + int onvm_pkt_set_mac_addr(struct rte_mbuf* pkt, unsigned src_port_id, unsigned dst_port_id, struct port_info *ports) { struct ether_hdr *eth; @@ -338,3 +341,154 @@ void onvm_pkt_print_ether(struct ether_hdr* hdr) { } printf("Type: %s\n", type); } + +int +onvm_pkt_parse_ip(char *ip_str, uint32_t *dest) { + int ret; + int ip[4]; + + if (ip_str == NULL || dest == NULL) { + return -1; + } + + ret = sscanf(ip_str, "%u.%u.%u.%u", &ip[3], &ip[2], &ip[1], &ip[0]); + if (ret != 4) { + return -1; + } + *dest = IPv4(ip[0], ip[1], ip[2], ip[3]); + return 0; +} + +int +onvm_pkt_parse_mac(char * mac_str, uint8_t* dest) { + int ret, i; + int mac[ETHER_ADDR_LEN]; + + if (mac_str == NULL || dest == NULL) { + return -1; + } + + ret = sscanf(mac_str, "%x:%x:%x:%x:%x:%x", &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]); + if (ret != ETHER_ADDR_LEN) { + return -1; + } + + for (i = 0; i < ETHER_ADDR_LEN; i++){ + dest[i] = mac[i]; + } + return 0; +} + +uint32_t +onvm_pkt_get_checksum_offload_flags(uint8_t port_id) { + struct rte_eth_dev_info dev_info; + uint32_t hw_offload_flags = 0; + + rte_eth_dev_info_get(port_id, &dev_info); + + if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM) { + hw_offload_flags |= SUPPORTS_IPV4_CHECKSUM_OFFLOAD; + } + if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) { + hw_offload_flags |= SUPPORTS_TCP_CHECKSUM_OFFLOAD; + } + if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) { + hw_offload_flags |= SUPPORTS_UDP_CHECKSUM_OFFLOAD; + } + return hw_offload_flags; +} + +/** + * Calculates TCP or UDP checksum. + * This is the same implementation as rte_ipv4_udptcp_cksum(), + * except that this implementation can process packets with IP options. + */ +static uint16_t +calculate_tcpudp_cksum(const struct ipv4_hdr *ip, const void *l4_hdr, const uint32_t l3_len, uint8_t protocol) { + uint32_t cksum = 0; + uint32_t l4_len = ip->total_length - l3_len; + + /* pseudo header checksum */ + struct { + uint32_t saddr; + uint32_t daddr; + uint8_t reserved; /* all zeros */ + uint8_t protocol; + uint16_t total_length; /* the length of the TCP/UDP header and data */ + } __attribute__((__packed__)) ph; + + ph.saddr = ip->src_addr; + ph.daddr = ip->dst_addr; + ph.reserved = 0; + ph.protocol = protocol; + ph.total_length = (uint16_t)l4_len; + + cksum += rte_raw_cksum(&ph, sizeof(ph)); + + /* packet checksum */ + cksum += rte_raw_cksum(l4_hdr, l4_len); + + while (cksum & 0xffff0000) { + cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff); + } + return ~cksum; +} + +/** + * Calculates IP checksum. + * This is the same implementation as rte_ipv4_cksum(), + * exception that this implementation can process packets with IP options. + */ +static uint16_t +calculate_ip_cksum(const struct ipv4_hdr *ip, const uint32_t l3_len) { + uint16_t cksum = rte_raw_cksum(ip, l3_len); + return (cksum == 0xffff) ? cksum : ~cksum; +} + +void +onvm_pkt_set_checksums(struct rte_mbuf *pkt) { + uint32_t hw_cksum_support = onvm_pkt_get_checksum_offload_flags(pkt->port); + struct ipv4_hdr *ip = onvm_pkt_ipv4_hdr(pkt); + struct tcp_hdr *tcp = onvm_pkt_tcp_hdr(pkt); + struct udp_hdr *udp = onvm_pkt_udp_hdr(pkt); + + if (ip != NULL) { + ip->hdr_checksum = 0; + pkt->l2_len = sizeof(struct ether_hdr); + pkt->l3_len = (ip->version_ihl & 0b1111) * 4; + pkt->ol_flags |= PKT_TX_IPV4; + + if (tcp != NULL) { + tcp->cksum = 0; + pkt->l4_len = (tcp->data_off >> 4) & 0b1111; + + if (hw_cksum_support & SUPPORTS_TCP_CHECKSUM_OFFLOAD) { + tcp->cksum = rte_ipv4_phdr_cksum(ip, pkt->ol_flags); + pkt->ol_flags |= PKT_TX_TCP_CKSUM; + } else { + /* software TCP checksumming */ + tcp->cksum = calculate_tcpudp_cksum(ip, tcp, pkt->l3_len, IP_PROTOCOL_TCP); + } + } + + if (udp != NULL) { + udp->dgram_cksum = 0; + pkt->l4_len = 8; + + if (hw_cksum_support & SUPPORTS_UDP_CHECKSUM_OFFLOAD) { + udp->dgram_cksum = rte_ipv4_phdr_cksum(ip, pkt->ol_flags); + pkt->ol_flags |= PKT_TX_UDP_CKSUM; + } else { + /* software UDP checksumming */ + udp->dgram_cksum = calculate_tcpudp_cksum(ip, udp, pkt->l3_len, IP_PROTOCOL_UDP); + } + } + + if (hw_cksum_support & SUPPORTS_IPV4_CHECKSUM_OFFLOAD) { + pkt->ol_flags |= PKT_TX_IP_CKSUM; + } else { + /* software IP checksumming */ + ip->hdr_checksum = calculate_ip_cksum(ip, pkt->l3_len); + } + } +} diff --git a/onvm/onvm_nflib/onvm_pkt_helper.h b/onvm/onvm_nflib/onvm_pkt_helper.h index b765938d1..25691a0ea 100644 --- a/onvm/onvm_nflib/onvm_pkt_helper.h +++ b/onvm/onvm_nflib/onvm_pkt_helper.h @@ -41,6 +41,9 @@ #ifndef _ONVM_PKT_HELPER_H_ #define _ONVM_PKT_HELPER_H_ +#include +#include + struct port_info; struct rte_mbuf; struct tcp_hdr; @@ -50,6 +53,33 @@ struct ipv4_hdr; #define IP_PROTOCOL_TCP 6 #define IP_PROTOCOL_UDP 17 +#define TCP_FLAG_FIN (1<<0) +#define TCP_FLAG_SYN (1<<1) +#define TCP_FLAG_RST (1<<2) +#define TCP_FLAG_PSH (1<<3) +#define TCP_FLAG_ACK (1<<4) +#define TCP_FLAG_URG (1<<5) +#define TCP_FLAG_ECE (1<<6) +#define TCP_FLAG_CWR (1<<7) +#define TCP_FLAG_NS (1<<8) + +#define SUPPORTS_IPV4_CHECKSUM_OFFLOAD (1<<0) +#define SUPPORTS_TCP_CHECKSUM_OFFLOAD (1<<1) +#define SUPPORTS_UDP_CHECKSUM_OFFLOAD (1<<2) + +/* Returns the bitflags in the tcp header */ +#define ONVM_PKT_GET_FLAGS(tcp, flags) \ + do { \ + (flags) = (((tcp)->data_off << 8) | (tcp)->tcp_flags) & 0b111111111; \ + } while (0) + +/* Sets the bitflags in the tcp header */ +#define ONVM_PKT_SET_FLAGS(tcp, flags) \ + do { \ + (tcp)->tcp_flags = (flags) & 0xFF; \ + (tcp)->data_off |= ((flags) >> 8) & 0x1; \ + } while (0) + /** * Assign the source and destination MAC address of the packet to the specified * source and destination port ID. @@ -114,4 +144,26 @@ onvm_pkt_print_ipv4(struct ipv4_hdr* hdr); void onvm_pkt_print_ether(struct ether_hdr* hdr); +/** + * Parsing ip addr of form X.X.X.X into decimal form + */ +int +onvm_pkt_parse_ip(char * ip_str, uint32_t* dest); + +/** + * Parsing mac addr of form xx:xx:xx:xx:xx:xx into dest array + */ +int +onvm_pkt_parse_mac(char * mac_str, uint8_t* dest); + +/** + * Packet checksum calculation routines + */ + +uint32_t +onvm_pkt_get_checksum_offload_flags(uint8_t port_id); + +void +onvm_pkt_set_checksums(struct rte_mbuf* pkt); + #endif // _ONVM_PKT_HELPER_H_"