forked from Qortal/Brooklyn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
505 lines
13 KiB
505 lines
13 KiB
// SPDX-License-Identifier: GPL-2.0 |
|
/* Copyright (c) 2019 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * Example program for Host Bandwidth Management
 *
 * This program loads a cgroup skb BPF program to enforce cgroup output
 * (egress) or input (ingress) bandwidth limits.
 *
 * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
 *   Where:
 *    -d         Print BPF trace debug buffer
 *    -l         Also limit flows doing loopback
 *    -n <#>     To create cgroup "/hbm#" and attach prog
 *               Default is /hbm1
 *    --no_cn    Do not return cn notifications
 *    -r <rate>  Rate limit in Mbps
 *    -s         Get HBM stats (marked, dropped, etc.)
 *    -t <time>  Exit after specified seconds (default is 1)
 *    -w         Work conserving flag. cgroup can increase its bandwidth
 *               beyond the rate limit specified while there is available
 *               bandwidth. Current implementation assumes there is only
 *               one NIC (eth0), but can be extended to support multiple
 *               NICs. Currently only supported for egress.
 *    -h         Print this info
 *    prog       BPF program file name. Name defaults to hbm_out_kern.o
 */
|
|
|
#define _GNU_SOURCE |
|
|
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
#include <assert.h> |
|
#include <sys/resource.h> |
|
#include <sys/time.h> |
|
#include <unistd.h> |
|
#include <errno.h> |
|
#include <fcntl.h> |
|
#include <linux/unistd.h> |
|
#include <linux/compiler.h> |
|
|
|
#include <linux/bpf.h> |
|
#include <bpf/bpf.h> |
|
#include <getopt.h> |
|
|
|
#include "bpf_rlimit.h" |
|
#include "cgroup_helpers.h" |
|
#include "hbm.h" |
|
#include "bpf_util.h" |
|
#include <bpf/libbpf.h> |
|
|
|
/* Directory of the kernel tracing files; "trace_pipe" is opened below it. */
#define DEBUGFS "/sys/kernel/debug/tracing/"

/* Helpers defined further down in this file. */
static void do_error(char *msg, bool errno_flag);
static void read_trace_pipe2(void);
static void Usage(void);

/* Rate settings. */
int minRate = 1000;		/* cgroup rate limit in Mbps */
int rate = 1000;		/* can grow if rate conserving is enabled */

/* Run time in seconds before the stats are collected. */
int dur = 1;

/* Direction: true selects egress; false switches the attach type to ingress. */
bool outFlag = true;

/* Behaviour switches, all off unless set from the command line. */
bool debugFlag;			/* dump the BPF trace pipe before exiting */
bool stats_flag;		/* have the BPF program collect statistics */
bool loopback_flag;		/* also limit loopback flows */
bool no_cn_flag;		/* set by --no_cn */
bool edt_flag;			/* set by --edt */
bool work_conserving_flag;	/* let the rate float above minRate */

/* State shared between prog_load() and run_bpf_prog(). */
static struct bpf_object *obj;
static struct bpf_program *bpf_prog;
static int queue_stats_fd;
|
|
|
static void read_trace_pipe2(void) |
|
{ |
|
int trace_fd; |
|
FILE *outf; |
|
char *outFname = "hbm_out.log"; |
|
|
|
trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); |
|
if (trace_fd < 0) { |
|
printf("Error opening trace_pipe\n"); |
|
return; |
|
} |
|
|
|
// Future support of ingress |
|
// if (!outFlag) |
|
// outFname = "hbm_in.log"; |
|
outf = fopen(outFname, "w"); |
|
|
|
if (outf == NULL) |
|
printf("Error creating %s\n", outFname); |
|
|
|
while (1) { |
|
static char buf[4097]; |
|
ssize_t sz; |
|
|
|
sz = read(trace_fd, buf, sizeof(buf) - 1); |
|
if (sz > 0) { |
|
buf[sz] = 0; |
|
puts(buf); |
|
if (outf != NULL) { |
|
fprintf(outf, "%s\n", buf); |
|
fflush(outf); |
|
} |
|
} |
|
} |
|
} |
|
|
|
/*
 * Report a fatal error on stdout and terminate the process with status 1.
 *
 * msg:        text printed after the "ERROR: " prefix.
 * errno_flag: when true the current errno value is appended to the message.
 */
static void do_error(char *msg, bool errno_flag)
{
	if (!errno_flag) {
		printf("ERROR: %s\n", msg);
		exit(1);
	}

	printf("ERROR: %s, errno: %d\n", msg, errno);
	exit(1);
}
|
|
|
static int prog_load(char *prog) |
|
{ |
|
obj = bpf_object__open_file(prog, NULL); |
|
if (libbpf_get_error(obj)) { |
|
printf("ERROR: opening BPF object file failed\n"); |
|
return 1; |
|
} |
|
|
|
/* load BPF program */ |
|
if (bpf_object__load(obj)) { |
|
printf("ERROR: loading BPF object file failed\n"); |
|
goto err; |
|
} |
|
|
|
bpf_prog = bpf_object__find_program_by_title(obj, "cgroup_skb/egress"); |
|
if (!bpf_prog) { |
|
printf("ERROR: finding a prog in obj file failed\n"); |
|
goto err; |
|
} |
|
|
|
queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats"); |
|
if (queue_stats_fd < 0) { |
|
printf("ERROR: finding a map in obj file failed\n"); |
|
goto err; |
|
} |
|
|
|
return 0; |
|
|
|
err: |
|
bpf_object__close(obj); |
|
return 1; |
|
} |
|
|
|
static int run_bpf_prog(char *prog, int cg_id) |
|
{ |
|
struct hbm_queue_stats qstats = {0}; |
|
char cg_dir[100], cg_pin_path[100]; |
|
struct bpf_link *link = NULL; |
|
int key = 0; |
|
int cg1 = 0; |
|
int rc = 0; |
|
|
|
sprintf(cg_dir, "/hbm%d", cg_id); |
|
rc = prog_load(prog); |
|
if (rc != 0) |
|
return rc; |
|
|
|
if (setup_cgroup_environment()) { |
|
printf("ERROR: setting cgroup environment\n"); |
|
goto err; |
|
} |
|
cg1 = create_and_get_cgroup(cg_dir); |
|
if (!cg1) { |
|
printf("ERROR: create_and_get_cgroup\n"); |
|
goto err; |
|
} |
|
if (join_cgroup(cg_dir)) { |
|
printf("ERROR: join_cgroup\n"); |
|
goto err; |
|
} |
|
|
|
qstats.rate = rate; |
|
qstats.stats = stats_flag ? 1 : 0; |
|
qstats.loopback = loopback_flag ? 1 : 0; |
|
qstats.no_cn = no_cn_flag ? 1 : 0; |
|
if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { |
|
printf("ERROR: Could not update map element\n"); |
|
goto err; |
|
} |
|
|
|
if (!outFlag) |
|
bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); |
|
|
|
link = bpf_program__attach_cgroup(bpf_prog, cg1); |
|
if (libbpf_get_error(link)) { |
|
fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); |
|
goto err; |
|
} |
|
|
|
sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); |
|
rc = bpf_link__pin(link, cg_pin_path); |
|
if (rc < 0) { |
|
printf("ERROR: bpf_link__pin failed: %d\n", rc); |
|
goto err; |
|
} |
|
|
|
if (work_conserving_flag) { |
|
struct timeval t0, t_last, t_new; |
|
FILE *fin; |
|
unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; |
|
signed long long last_cg_tx_bytes, new_cg_tx_bytes; |
|
signed long long delta_time, delta_bytes, delta_rate; |
|
int delta_ms; |
|
#define DELTA_RATE_CHECK 10000 /* in us */ |
|
#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ |
|
|
|
bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); |
|
if (gettimeofday(&t0, NULL) < 0) |
|
do_error("gettimeofday failed", true); |
|
t_last = t0; |
|
fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); |
|
if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) |
|
do_error("fscanf fails", false); |
|
fclose(fin); |
|
last_cg_tx_bytes = qstats.bytes_total; |
|
while (true) { |
|
usleep(DELTA_RATE_CHECK); |
|
if (gettimeofday(&t_new, NULL) < 0) |
|
do_error("gettimeofday failed", true); |
|
delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + |
|
(t_new.tv_usec - t0.tv_usec)/1000; |
|
if (delta_ms > dur * 1000) |
|
break; |
|
delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + |
|
(t_new.tv_usec - t_last.tv_usec); |
|
if (delta_time == 0) |
|
continue; |
|
t_last = t_new; |
|
fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", |
|
"r"); |
|
if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) |
|
do_error("fscanf fails", false); |
|
fclose(fin); |
|
printf(" new_eth_tx_bytes:%llu\n", |
|
new_eth_tx_bytes); |
|
bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); |
|
new_cg_tx_bytes = qstats.bytes_total; |
|
delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; |
|
last_eth_tx_bytes = new_eth_tx_bytes; |
|
delta_rate = (delta_bytes * 8000000) / delta_time; |
|
printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", |
|
delta_ms, delta_rate/1000000000.0, |
|
rate/1000.0); |
|
if (delta_rate < RATE_THRESHOLD) { |
|
/* can increase cgroup rate limit, but first |
|
* check if we are using the current limit. |
|
* Currently increasing by 6.25%, unknown |
|
* if that is the optimal rate. |
|
*/ |
|
int rate_diff100; |
|
|
|
delta_bytes = new_cg_tx_bytes - |
|
last_cg_tx_bytes; |
|
last_cg_tx_bytes = new_cg_tx_bytes; |
|
delta_rate = (delta_bytes * 8000000) / |
|
delta_time; |
|
printf(" rate:%.3fGbps", |
|
delta_rate/1000000000.0); |
|
rate_diff100 = (((long long)rate)*1000000 - |
|
delta_rate) * 100 / |
|
(((long long) rate) * 1000000); |
|
printf(" rdiff:%d", rate_diff100); |
|
if (rate_diff100 <= 3) { |
|
rate += (rate >> 4); |
|
if (rate > RATE_THRESHOLD / 1000000) |
|
rate = RATE_THRESHOLD / 1000000; |
|
qstats.rate = rate; |
|
printf(" INC\n"); |
|
} else { |
|
printf("\n"); |
|
} |
|
} else { |
|
/* Need to decrease cgroup rate limit. |
|
* Currently decreasing by 12.5%, unknown |
|
* if that is optimal |
|
*/ |
|
printf(" DEC\n"); |
|
rate -= (rate >> 3); |
|
if (rate < minRate) |
|
rate = minRate; |
|
qstats.rate = rate; |
|
} |
|
if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) |
|
do_error("update map element fails", false); |
|
} |
|
} else { |
|
sleep(dur); |
|
} |
|
// Get stats! |
|
if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { |
|
char fname[100]; |
|
FILE *fout; |
|
|
|
if (!outFlag) |
|
sprintf(fname, "hbm.%d.in", cg_id); |
|
else |
|
sprintf(fname, "hbm.%d.out", cg_id); |
|
fout = fopen(fname, "w"); |
|
fprintf(fout, "id:%d\n", cg_id); |
|
fprintf(fout, "ERROR: Could not lookup queue_stats\n"); |
|
} else if (stats_flag && qstats.lastPacketTime > |
|
qstats.firstPacketTime) { |
|
long long delta_us = (qstats.lastPacketTime - |
|
qstats.firstPacketTime)/1000; |
|
unsigned int rate_mbps = ((qstats.bytes_total - |
|
qstats.bytes_dropped) * 8 / |
|
delta_us); |
|
double percent_pkts, percent_bytes; |
|
char fname[100]; |
|
FILE *fout; |
|
int k; |
|
static const char *returnValNames[] = { |
|
"DROP_PKT", |
|
"ALLOW_PKT", |
|
"DROP_PKT_CWR", |
|
"ALLOW_PKT_CWR" |
|
}; |
|
#define RET_VAL_COUNT 4 |
|
|
|
// Future support of ingress |
|
// if (!outFlag) |
|
// sprintf(fname, "hbm.%d.in", cg_id); |
|
// else |
|
sprintf(fname, "hbm.%d.out", cg_id); |
|
fout = fopen(fname, "w"); |
|
fprintf(fout, "id:%d\n", cg_id); |
|
fprintf(fout, "rate_mbps:%d\n", rate_mbps); |
|
fprintf(fout, "duration:%.1f secs\n", |
|
(qstats.lastPacketTime - qstats.firstPacketTime) / |
|
1000000000.0); |
|
fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); |
|
fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / |
|
1000000)); |
|
fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); |
|
fprintf(fout, "bytes_dropped_MB:%d\n", |
|
(int)(qstats.bytes_dropped / |
|
1000000)); |
|
// Marked Pkts and Bytes |
|
percent_pkts = (qstats.pkts_marked * 100.0) / |
|
(qstats.pkts_total + 1); |
|
percent_bytes = (qstats.bytes_marked * 100.0) / |
|
(qstats.bytes_total + 1); |
|
fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); |
|
fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); |
|
|
|
// Dropped Pkts and Bytes |
|
percent_pkts = (qstats.pkts_dropped * 100.0) / |
|
(qstats.pkts_total + 1); |
|
percent_bytes = (qstats.bytes_dropped * 100.0) / |
|
(qstats.bytes_total + 1); |
|
fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); |
|
fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); |
|
|
|
// ECN CE markings |
|
percent_pkts = (qstats.pkts_ecn_ce * 100.0) / |
|
(qstats.pkts_total + 1); |
|
fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, |
|
(int)qstats.pkts_ecn_ce); |
|
|
|
// Average cwnd |
|
fprintf(fout, "avg cwnd:%d\n", |
|
(int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); |
|
// Average rtt |
|
fprintf(fout, "avg rtt:%d\n", |
|
(int)(qstats.sum_rtt / (qstats.pkts_total + 1))); |
|
// Average credit |
|
if (edt_flag) |
|
fprintf(fout, "avg credit_ms:%.03f\n", |
|
(qstats.sum_credit / |
|
(qstats.pkts_total + 1.0)) / 1000000.0); |
|
else |
|
fprintf(fout, "avg credit:%d\n", |
|
(int)(qstats.sum_credit / |
|
(1500 * ((int)qstats.pkts_total ) + 1))); |
|
|
|
// Return values stats |
|
for (k = 0; k < RET_VAL_COUNT; k++) { |
|
percent_pkts = (qstats.returnValCount[k] * 100.0) / |
|
(qstats.pkts_total + 1); |
|
fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], |
|
percent_pkts, (int)qstats.returnValCount[k]); |
|
} |
|
fclose(fout); |
|
} |
|
|
|
if (debugFlag) |
|
read_trace_pipe2(); |
|
goto cleanup; |
|
|
|
err: |
|
rc = 1; |
|
|
|
cleanup: |
|
bpf_link__destroy(link); |
|
bpf_object__close(obj); |
|
|
|
if (cg1 != -1) |
|
close(cg1); |
|
|
|
if (rc != 0) |
|
cleanup_cgroup_environment(); |
|
return rc; |
|
} |
|
|
|
/*
 * Print the command line help on stdout.
 *
 * The text states the real default run time (dur is initialised to 1
 * second) and the synopsis lists --edt alongside the other options.
 */
static void Usage(void)
{
	printf("This program loads a cgroup skb BPF program to enforce\n"
	       "cgroup output (egress) bandwidth limits.\n\n"
	       "USAGE: hbm [-o] [-d] [--edt] [-l] [-n <id>] [--no_cn]\n"
	       " [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]\n"
	       " Where:\n"
	       " -o indicates egress direction (default)\n"
	       " -d print BPF trace debug buffer\n"
	       " --edt use fq's Earliest Departure Time\n"
	       " -l also limit flows using loopback\n"
	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
	       " Default is /hbm1\n"
	       " --no_cn disable CN notifications\n"
	       " -r <rate> Rate in Mbps\n"
	       " -s Update HBM stats\n"
	       " -t <time> Exit after specified seconds (default is 1)\n"
	       " -w Work conserving flag. cgroup can increase\n"
	       " bandwidth beyond the rate limit specified\n"
	       " while there is available bandwidth. Current\n"
	       " implementation assumes there is only eth0\n"
	       " but can be extended to support multiple NICs\n"
	       " -h print this info\n"
	       " prog BPF program file name. Name defaults to\n"
	       " hbm_out_kern.o\n");
}
|
|
|
int main(int argc, char **argv) |
|
{ |
|
char *prog = "hbm_out_kern.o"; |
|
int k; |
|
int cg_id = 1; |
|
char *optstring = "iodln:r:st:wh"; |
|
struct option loptions[] = { |
|
{"no_cn", 0, NULL, 1}, |
|
{"edt", 0, NULL, 2}, |
|
{NULL, 0, NULL, 0} |
|
}; |
|
|
|
while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { |
|
switch (k) { |
|
case 1: |
|
no_cn_flag = true; |
|
break; |
|
case 2: |
|
prog = "hbm_edt_kern.o"; |
|
edt_flag = true; |
|
break; |
|
case'o': |
|
break; |
|
case 'd': |
|
debugFlag = true; |
|
break; |
|
case 'l': |
|
loopback_flag = true; |
|
break; |
|
case 'n': |
|
cg_id = atoi(optarg); |
|
break; |
|
case 'r': |
|
minRate = atoi(optarg) * 1.024; |
|
rate = minRate; |
|
break; |
|
case 's': |
|
stats_flag = true; |
|
break; |
|
case 't': |
|
dur = atoi(optarg); |
|
break; |
|
case 'w': |
|
work_conserving_flag = true; |
|
break; |
|
case '?': |
|
if (optopt == 'n' || optopt == 'r' || optopt == 't') |
|
fprintf(stderr, |
|
"Option -%c requires an argument.\n\n", |
|
optopt); |
|
case 'h': |
|
__fallthrough; |
|
default: |
|
Usage(); |
|
return 0; |
|
} |
|
} |
|
|
|
if (optind < argc) |
|
prog = argv[optind]; |
|
printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); |
|
|
|
return run_bpf_prog(prog, cg_id); |
|
}
|
|
|