mTCP, io_engine, and sample applications

master
EunYoung Jeong 2014-04-02 22:23:08 +09:00
parent 209e31cc00
commit d9af90df66
109 changed files with 49070 additions and 0 deletions

43
apps/example/Makefile Normal file
View File

@ -0,0 +1,43 @@
# Build rules for the mTCP example applications.
CC = gcc
#CFLAGS = -Wall -g -DINFO -DDBGERR
CFLAGS = -DNDEBUG -O3 -DINFO -DDBGERR
# Every binary that `clean` has to remove. eppipe is built by `all`, so it
# must be listed here as well; eprelay is kept so stale binaries are removed.
TARGET = epserver epwget eprelay eppipe
UTIL_FLD = ../../util
UTIL_INC = ${UTIL_FLD}/include
EPSERVER_OBJS = epserver.o ${UTIL_FLD}/http_parsing.o ${UTIL_FLD}/tdate_parse.o
EPWGET_OBJS = epwget.o ${UTIL_FLD}/rss.o ${UTIL_FLD}/http_parsing.o ${UTIL_FLD}/tdate_parse.o
EPRELAY_OBJS = eprelay.o ${UTIL_FLD}/rss.o ${UTIL_FLD}/http_parsing.o \
	${UTIL_FLD}/tdate_parse.o ${UTIL_FLD}/ring_buffer.o
EPPIPE_OBJS = eppipe.o
MTCP_FLD = ../../mtcp/lib
MTCP_INC = ../../mtcp/include
MTCP_LIB = ${MTCP_FLD}/libmtcp.a
PS_FLD = ../../../io_engine/io_engine-2.0.38.2/lib
PS_INC = ../../../io_engine/io_engine-2.0.38.2/include
INC = -I./include/ -I${UTIL_INC} -I${MTCP_INC} -I${PS_INC}
LIBS = -lnuma -lmtcp -lps -lpthread -lrt
LIB = -L${PS_FLD} -L${MTCP_FLD}

# `all` and `clean` are commands, not files.
.PHONY: all clean

all: epserver epwget eppipe

%.o: %.c
	${CC} -c ${CFLAGS} ${INC} -o $@ $<

epserver: ${EPSERVER_OBJS} ${MTCP_LIB}
	${CC} -o epserver ${EPSERVER_OBJS} ${LIB} ${LIBS}

epwget: ${EPWGET_OBJS} ${MTCP_LIB}
	${CC} -o epwget ${EPWGET_OBJS} ${LIB} ${LIBS}

eppipe: ${EPPIPE_OBJS} ${MTCP_LIB}
	${CC} -o eppipe ${EPPIPE_OBJS} ${LIB} ${LIBS}

clean:
	rm -f *~ *.o ${TARGET}

37
apps/example/README Normal file
View File

@ -0,0 +1,37 @@
========================================================================
USAGE OF EXAMPLE APPLICATIONS
========================================================================
epserver: a simple mtcp-epoll-based web server
usage: ./epserver www_home [-N #cores]
ex) ./epserver /home/notav/www -N 8
options:
www_home: the directory to serve. The number of files is limited to
MAX_FILES in epserver.c:36
-N: number of CPU cores to use. default: all existing cores
========================================================================
epwget: simple mtcp-epoll-based http request generator
usage: ./epwget URL #requests [-N #cores] [-c concurrency]
ex) ./epwget 10.0.0.43/example.txt 10000000 -N 8 -c 8000
options:
URL: url of the content to download.
#requests: number of requests to generate
-N: number of CPU cores to use. default: min(# cores, # requests)
-c: number of maximum concurrent connections. default: 100
notes:
- epwget can use a range of IP addresses when it needs more concurrent
connections than a single IP address allows. You can set the range in
epwget.c:33.
- epwget overrides some of the settings in epwget.conf and uses
mtcp_setconf() internally to apply the input arguments to the
configuration.
========================================================================
Contact: mtcp at list.ndsl.kaist.edu
April 2, 2014.
EunYoung Jeong <notav at ndsl.kaist.edu>

656
apps/example/epserver.c Normal file
View File

@ -0,0 +1,656 @@
#define _LARGEFILE64_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <dirent.h>
#include <string.h>
#include <time.h>
#include <pthread.h>
#include <signal.h>
#include <mtcp_api.h>
#include <mtcp_epoll.h>
#include "http_parsing.h"
#include "debug.h"
/* Number of per-thread server_vars slots; accepted socket ids must be below this. */
#define MAX_FLOW_NUM (10000)
/* Not referenced in the visible code; presumably mirrors rcvbuf in epserver.conf -- TODO confirm. */
#define RCVBUF_SIZE (2*1024)
/* Largest chunk of file data passed to a single mtcp_write() call. */
#define SNDBUF_SIZE (8*1024)
/* Size passed to mtcp_epoll_create() and length of the per-thread event array. */
#define MAX_EVENTS (MAX_FLOW_NUM * 3)
/* Size of the request, read and response header buffers. */
#define HTTP_HEADER_LEN 1024
/* Size of the buffer receiving the URL extracted from a request. */
#define URL_LEN 128
/* Length of the per-core arrays (app_thread, done, cores). */
#define MAX_CPUS 16
/* Maximum number of files preloaded into the in-memory cache. */
#define MAX_FILES 30
/* NOTE: both macros evaluate their arguments more than once. */
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
/* Fallback boolean / error constants when no header defined them already. */
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif
#ifndef ERROR
#define ERROR (-1)
#endif
/* When non-zero, InitializeServerThread() pins threads to core + num_cores / 2. */
#define HT_SUPPORT FALSE
/*----------------------------------------------------------------------------*/
/* One file loaded into memory by main() and served from there. */
struct file_cache
{
/* directory entry name */
char name[128];
/* "<www_main>/<name>"; compared against the path built from the request URL */
char fullname[256];
/* size of the file in bytes */
uint64_t size;
/* malloc'ed buffer holding the whole file */
char *file;
};
/*----------------------------------------------------------------------------*/
/* Per-connection state, stored in thread_context.svars and indexed by socket id. */
struct server_vars
{
/* bytes of the HTTP request received so far */
char request[HTTP_HEADER_LEN];
/* number of valid bytes in request[] */
int recv_len;
/* value returned by find_http_header() for request[] */
int request_len;
/* total_sent counts body bytes written; total_read is only reset in the visible code */
long int total_read, total_sent;
/* set once the whole response body has been written */
uint8_t done;
/* set once the response header has been written */
uint8_t rspheader_sent;
/* TRUE when the request carried "Connection: Keep-Alive" */
uint8_t keep_alive;
int fidx; // index into fcache[] of the file being served
char fname[128]; // path built from www_main and the request URL
long int fsize; // size in bytes of the file being served
};
/*----------------------------------------------------------------------------*/
/* State owned by one server thread. */
struct thread_context
{
/* mTCP context returned by mtcp_create_context() */
mctx_t mctx;
/* epoll descriptor returned by mtcp_epoll_create() */
int ep;
/* array of MAX_FLOW_NUM per-connection states, indexed by socket id */
struct server_vars *svars;
};
/*----------------------------------------------------------------------------*/
/* CPU count reported by GetNumCPUs() */
static int num_cores;
/* number of server threads to start; defaults to num_cores, overridden by -N */
static int core_limit;
/* server thread handles, indexed by core */
static pthread_t app_thread[MAX_CPUS];
/* per-core stop flags, set by SignalHandler() and polled by RunServerThread() */
static int done[MAX_CPUS];
/*----------------------------------------------------------------------------*/
/* directory being served (argv[1]) */
const char *www_main;
/* files preloaded from www_main */
static struct file_cache fcache[MAX_FILES];
/* number of valid entries in fcache[] */
static int nfiles;
/*----------------------------------------------------------------------------*/
/* count of completed responses; incremented by every thread without synchronization */
static int finished;
/*----------------------------------------------------------------------------*/
/*
 * Map an HTTP status code to its reason phrase.
 * Returns "OK" for 200, "Not Found" for 404 and NULL for every other code.
 */
static char *
StatusCodeToString(int scode)
{
	if (scode == 200)
		return "OK";

	if (scode == 404)
		return "Not Found";

	return NULL;
}
/*----------------------------------------------------------------------------*/
/*
 * Reset the per-request progress counters and flags of a connection.
 * request[], fidx, fname and fsize are left untouched.
 */
void
CleanServerVariable(struct server_vars *sv)
{
	sv->recv_len = sv->request_len = 0;
	sv->total_read = sv->total_sent = 0;
	sv->done = sv->rspheader_sent = sv->keep_alive = 0;
}
/*----------------------------------------------------------------------------*/
void
CloseConnection(struct thread_context *ctx, int sockid, struct server_vars *sv)
{
mtcp_epoll_ctl(ctx->mctx, ctx->ep, MTCP_EPOLL_CTL_DEL, sockid, NULL);
mtcp_close(ctx->mctx, sockid);
}
/*----------------------------------------------------------------------------*/
/*
 * Write as much of the cached file as mtcp_write() accepts, in chunks of at
 * most SNDBUF_SIZE bytes. When every byte has been written, either re-arm the
 * socket for the next request (keep-alive) or close it.
 *
 * Returns the number of bytes written by this call, or 0 when the response
 * is already done or its header has not been sent yet.
 */
static int
SendUntilAvailable(struct thread_context *ctx, int sockid, struct server_vars *sv)
{
	struct mtcp_epoll_event ev;
	int written;
	int chunk;
	int total = 0;

	if (sv->done || !sv->rspheader_sent) {
		return 0;
	}

	for (;;) {
		chunk = MIN(SNDBUF_SIZE, sv->fsize - sv->total_sent);
		if (chunk <= 0) {
			break;
		}

		written = mtcp_write(ctx->mctx, sockid,
				fcache[sv->fidx].file + sv->total_sent, chunk);
		if (written < 0) {
			TRACE_APP("Connection closed with client.\n");
			break;
		}
		TRACE_APP("Socket %d: mtcp_write try: %d, ret: %d\n", sockid, chunk, written);

		total += written;
		sv->total_sent += written;

		/* nothing was accepted: stop and wait for the next writable event */
		if (written == 0) {
			break;
		}
	}

	/* body not finished yet */
	if (sv->total_sent < fcache[sv->fidx].size) {
		return total;
	}

	sv->done = TRUE;
	finished++;

	if (!sv->keep_alive) {
		/* one-shot connection: we are done with it */
		CloseConnection(ctx, sockid, sv);
		return total;
	}

	/* keep-alive connection: wait for the next incoming request */
	ev.events = MTCP_EPOLLIN;
	ev.data.sockid = sockid;
	mtcp_epoll_ctl(ctx->mctx, ctx->ep, MTCP_EPOLL_CTL_MOD, sockid, &ev);
	CleanServerVariable(sv);

	return total;
}
/*----------------------------------------------------------------------------*/
/*
 * Read request bytes from a socket and, once a complete HTTP header has
 * arrived, send the response header and start sending the file body.
 *
 * Returns the value of mtcp_read(): > 0 bytes read, 0 when the peer closed,
 * < 0 on error (errno set by mtcp_read). The connection may already have been
 * closed by this function when a positive value is returned (oversized
 * request, failed header write, finished non-keep-alive response).
 */
static int
HandleReadEvent(struct thread_context *ctx, int sockid, struct server_vars *sv)
{
	struct mtcp_epoll_event ev;
	char buf[HTTP_HEADER_LEN];
	char url[URL_LEN];
	char response[HTTP_HEADER_LEN];
	int scode;						// status code
	time_t t_now;
	struct tm tm_now;
	char t_str[128];
	char keepalive_str[128];
	int rd;
	int room;
	int copy_len;
	int i;
	int len;
	int sent;

	/* HTTP request handling */
	rd = mtcp_read(ctx->mctx, sockid, buf, HTTP_HEADER_LEN);
	if (rd <= 0) {
		return rd;
	}

	/*
	 * Append only what fits and advance recv_len by the copied amount, so
	 * recv_len can never run past the buffer. One byte is reserved so the
	 * request can be NUL-terminated for the string helpers below.
	 */
	room = (HTTP_HEADER_LEN - 1) - sv->recv_len;
	copy_len = MIN(rd, room);
	memcpy(sv->request + sv->recv_len, buf, copy_len);
	sv->recv_len += copy_len;
	sv->request[sv->recv_len] = '\0';

	sv->request_len = find_http_header(sv->request, sv->recv_len);
	if (sv->request_len <= 0) {
		TRACE_ERROR("Socket %d: Failed to parse HTTP request header.\n"
				"read bytes: %d, recv_len: %d, "
				"request_len: %d, strlen: %ld, request: \n%s\n",
				sockid, rd, sv->recv_len,
				sv->request_len, (long)strlen(sv->request), sv->request);
		if (sv->recv_len >= HTTP_HEADER_LEN - 1) {
			/* buffer is full and still no header: it can never complete */
			CloseConnection(ctx, sockid, sv);
		}
		return rd;
	}

	http_get_url(sv->request, sv->request_len, url, URL_LEN);
	TRACE_APP("Socket %d URL: %s\n", sockid, url);
	/* bounded: a path that does not fit is truncated and simply misses the cache */
	snprintf(sv->fname, sizeof(sv->fname), "%s%s", www_main, url);
	TRACE_APP("Socket %d File name: %s\n", sockid, sv->fname);

	sv->keep_alive = FALSE;
	if (http_header_str_val(sv->request, "Connection: ",
				strlen("Connection: "), keepalive_str, 128)) {
		if (strstr(keepalive_str, "Keep-Alive")) {
			sv->keep_alive = TRUE;
		} else if (strstr(keepalive_str, "Close")) {
			sv->keep_alive = FALSE;
		}
	}

	/* Find file in cache; start from "no body" so a miss never reuses stale state */
	scode = 404;
	sv->fsize = 0;
	for (i = 0; i < nfiles; i++) {
		if (strcmp(sv->fname, fcache[i].fullname) == 0) {
			sv->fsize = fcache[i].size;
			sv->fidx = i;
			scode = 200;
			break;
		}
	}
	TRACE_APP("Socket %d File size: %ld (%ldMB)\n",
			sockid, sv->fsize, sv->fsize / 1024 / 1024);

	/* Response header handling; gmtime_r because several threads run this code */
	time(&t_now);
	if (gmtime_r(&t_now, &tm_now) == NULL ||
			strftime(t_str, sizeof(t_str), "%a, %d %b %Y %X GMT", &tm_now) == 0) {
		t_str[0] = '\0';
	}
	if (sv->keep_alive)
		snprintf(keepalive_str, sizeof(keepalive_str), "Keep-Alive");
	else
		snprintf(keepalive_str, sizeof(keepalive_str), "Close");

	snprintf(response, sizeof(response), "HTTP/1.1 %d %s\r\n"
			"Date: %s\r\n"
			"Server: Webserver on Middlebox TCP (Ubuntu)\r\n"
			"Content-Length: %ld\r\n"
			"Connection: %s\r\n\r\n",
			scode, StatusCodeToString(scode), t_str, sv->fsize, keepalive_str);
	len = strlen(response);
	TRACE_APP("Socket %d HTTP Response: \n%s", sockid, response);

	sent = mtcp_write(ctx->mctx, sockid, response, len);
	TRACE_APP("Socket %d Sent response header: try: %d, sent: %d\n",
			sockid, len, sent);
	if (sent != len) {
		/* a truncated header would corrupt the stream; give up on the connection */
		TRACE_ERROR("Socket %d: Failed to send response header. "
				"try: %d, sent: %d\n", sockid, len, sent);
		CloseConnection(ctx, sockid, sv);
		return rd;
	}

	if (scode != 200) {
		/* no body follows: the response is already complete */
		finished++;
		if (sv->keep_alive) {
			/* socket is still registered for MTCP_EPOLLIN only */
			CleanServerVariable(sv);
		} else {
			CloseConnection(ctx, sockid, sv);
		}
		return rd;
	}

	sv->rspheader_sent = TRUE;

	ev.events = MTCP_EPOLLIN | MTCP_EPOLLOUT;
	ev.data.sockid = sockid;
	mtcp_epoll_ctl(ctx->mctx, ctx->ep, MTCP_EPOLL_CTL_MOD, sockid, &ev);

	SendUntilAvailable(ctx, sockid, sv);

	return rd;
}
/*----------------------------------------------------------------------------*/
int
AcceptConnection(struct thread_context *ctx, int listener)
{
mctx_t mctx = ctx->mctx;
struct server_vars *sv;
struct mtcp_epoll_event ev;
int c;
c = mtcp_accept(mctx, listener, NULL, NULL);
if (c >= 0) {
if (c >= MAX_FLOW_NUM) {
TRACE_ERROR("Invalid socket id %d.\n", c);
return -1;
}
sv = &ctx->svars[c];
CleanServerVariable(sv);
TRACE_APP("New connection %d accepted.\n", c);
ev.events = MTCP_EPOLLIN;
ev.data.sockid = c;
mtcp_setsock_nonblock(ctx->mctx, c);
mtcp_epoll_ctl(mctx, ctx->ep, MTCP_EPOLL_CTL_ADD, c, &ev);
TRACE_APP("Socket %d registered.\n", c);
} else {
if (errno != EAGAIN) {
TRACE_ERROR("mtcp_accept() error %s\n",
strerror(errno));
}
}
return c;
}
/*----------------------------------------------------------------------------*/
/*
 * Pin the calling thread to a core and build its thread context: an mTCP
 * context, an epoll descriptor and the per-connection state table.
 *
 * Returns the new context, or NULL on failure; on failure everything
 * allocated so far is released.
 */
struct thread_context *
InitializeServerThread(int core)
{
	struct thread_context *ctx;

	/* affinitize application thread to a CPU core */
#if HT_SUPPORT
	mtcp_core_affinitize(core + (num_cores / 2));
#else
	mtcp_core_affinitize(core);
#endif /* HT_SUPPORT */

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		TRACE_ERROR("Failed to create thread context!\n");
		return NULL;
	}

	/* create mtcp context: this will spawn an mtcp thread */
	ctx->mctx = mtcp_create_context(core);
	if (!ctx->mctx) {
		TRACE_ERROR("Failed to create mtcp context!\n");
		goto fail_ctx;
	}

	/* create epoll descriptor */
	ctx->ep = mtcp_epoll_create(ctx->mctx, MAX_EVENTS);
	if (ctx->ep < 0) {
		TRACE_ERROR("Failed to create epoll descriptor!\n");
		goto fail_mctx;
	}

	/* allocate memory for server variables */
	ctx->svars = calloc(MAX_FLOW_NUM, sizeof(*ctx->svars));
	if (!ctx->svars) {
		TRACE_ERROR("Failed to create server_vars struct!\n");
		goto fail_mctx;
	}

	return ctx;

fail_mctx:
	mtcp_destroy_context(ctx->mctx);
fail_ctx:
	free(ctx);
	return NULL;
}
/*----------------------------------------------------------------------------*/
/*
 * Create a non-blocking listening socket bound to port 80 on any address and
 * register it with the thread's epoll descriptor for read events.
 *
 * Returns the listener socket id, or -1 on failure; on failure the socket is
 * closed again.
 */
int
CreateListeningSocket(struct thread_context *ctx)
{
	int listener;
	struct mtcp_epoll_event ev;
	struct sockaddr_in saddr;
	int ret;

	/* create socket and set it as nonblocking */
	listener = mtcp_socket(ctx->mctx, AF_INET, SOCK_STREAM, 0);
	if (listener < 0) {
		TRACE_ERROR("Failed to create listening socket!\n");
		return -1;
	}
	ret = mtcp_setsock_nonblock(ctx->mctx, listener);
	if (ret < 0) {
		TRACE_ERROR("Failed to set socket in nonblocking mode.\n");
		goto fail;
	}

	/* bind to port 80; zero the struct first so no field is left uninitialized */
	memset(&saddr, 0, sizeof(saddr));
	saddr.sin_family = AF_INET;
	saddr.sin_addr.s_addr = INADDR_ANY;
	saddr.sin_port = htons(80);
	ret = mtcp_bind(ctx->mctx, listener,
			(struct sockaddr *)&saddr, sizeof(struct sockaddr_in));
	if (ret < 0) {
		TRACE_ERROR("Failed to bind to the listening socket!\n");
		goto fail;
	}

	/* listen (backlog: 4K) */
	ret = mtcp_listen(ctx->mctx, listener, 4096);
	if (ret < 0) {
		TRACE_ERROR("mtcp_listen() failed!\n");
		goto fail;
	}

	/* wait for incoming accept events */
	ev.events = MTCP_EPOLLIN;
	ev.data.sockid = listener;
	ret = mtcp_epoll_ctl(ctx->mctx, ctx->ep, MTCP_EPOLL_CTL_ADD, listener, &ev);
	if (ret < 0) {
		TRACE_ERROR("Failed to register the listening socket!\n");
		goto fail;
	}

	return listener;

fail:
	mtcp_close(ctx->mctx, listener);
	return -1;
}
/*----------------------------------------------------------------------------*/
void *
RunServerThread(void *arg)
{
int core = *(int *)arg;
struct thread_context *ctx;
mctx_t mctx;
int listener;
int ep;
struct mtcp_epoll_event *events;
int nevents;
int i, ret;
int do_accept;
/* initialization */
ctx = InitializeServerThread(core);
if (!ctx) {
TRACE_ERROR("Failed to initialize server thread.\n");
exit(-1);
}
mctx = ctx->mctx;
ep = ctx->ep;
events = (struct mtcp_epoll_event *)
calloc(MAX_EVENTS, sizeof(struct mtcp_epoll_event));
if (!events) {
TRACE_ERROR("Failed to create event struct!\n");
exit(-1);
}
listener = CreateListeningSocket(ctx);
if (listener < 0) {
TRACE_ERROR("Failed to create listening socket.\n");
exit(-1);
}
while (!done[core]) {
nevents = mtcp_epoll_wait(mctx, ep, events, MAX_EVENTS, -1);
if (nevents < 0) {
if (errno != EINTR)
perror("mtcp_epoll_wait");
break;
}
do_accept = FALSE;
for (i = 0; i < nevents; i++) {
if (events[i].data.sockid == listener) {
/* if the event is for the listener, accept connection */
do_accept = TRUE;
} else if (events[i].events & MTCP_EPOLLERR) {
int err;
socklen_t len = sizeof(err);
/* error on the connection */
TRACE_APP("[CPU %d] Error on socket %d\n",
core, events[i].data.sockid);
if (mtcp_getsockopt(mctx, events[i].data.sockid,
SOL_SOCKET, SO_ERROR, (void *)&err, &len) == 0) {
if (err != ETIMEDOUT) {
fprintf(stderr, "Error on socket %d: %s\n",
events[i].data.sockid, strerror(err));
}
} else {
perror("mtcp_getsockopt");
}
CloseConnection(ctx, events[i].data.sockid,
&ctx->svars[events[i].data.sockid]);
} else if (events[i].events & MTCP_EPOLLIN) {
ret = HandleReadEvent(ctx, events[i].data.sockid,
&ctx->svars[events[i].data.sockid]);
if (ret == 0) {
/* connection closed by remote host */
CloseConnection(ctx, events[i].data.sockid,
&ctx->svars[events[i].data.sockid]);
} else if (ret < 0) {
/* if not EAGAIN, it's an error */
if (errno != EAGAIN) {
CloseConnection(ctx, events[i].data.sockid,
&ctx->svars[events[i].data.sockid]);
}
}
} else if (events[i].events & MTCP_EPOLLOUT) {
struct server_vars *sv = &ctx->svars[events[i].data.sockid];
if (sv->rspheader_sent) {
SendUntilAvailable(ctx, events[i].data.sockid, sv);
} else {
TRACE_APP("Socket %d: Response header not sent yet.\n",
events[i].data.sockid);
}
} else {
assert(0);
}
}
/* if do_accept flag is set, accept connections */
if (do_accept) {
while (1) {
ret = AcceptConnection(ctx, listener);
if (ret < 0)
break;
}
}
}
/* destroy mtcp context: this will kill the mtcp thread */
mtcp_destroy_context(mctx);
pthread_exit(NULL);
return NULL;
}
/*----------------------------------------------------------------------------*/
/*
 * Signal callback registered through mtcp_register_signal().
 * Marks the calling server thread as done and forwards the signal to every
 * other server thread that is not marked done yet.
 */
void
SignalHandler(int signum)
{
	pthread_t self = pthread_self();
	int core;

	for (core = 0; core < core_limit; core++) {
		if (app_thread[core] == self) {
			/* this is us: ask our own event loop to stop */
			done[core] = TRUE;
			continue;
		}

		if (!done[core]) {
			pthread_kill(app_thread[core], signum);
		}
	}
}
/*----------------------------------------------------------------------------*/
int
main(int argc, char **argv)
{
DIR *dir;
struct dirent *ent;
int fd;
int ret;
uint64_t total_read;
int cores[MAX_CPUS];
int i;
num_cores = GetNumCPUs();
core_limit = num_cores;
if (argc < 2) {
TRACE_ERROR("$%s directory_to_service\n", argv[0]);
return FALSE;
}
/* open the directory to serve */
www_main = argv[1];
dir = opendir(www_main);
if (!dir) {
TRACE_ERROR("Failed to open %s.\n", www_main);
perror("opendir");
return FALSE;
}
for (i = 0; i < argc - 1; i++) {
if (strcmp(argv[i], "-N") == 0) {
core_limit = atoi(argv[i + 1]);
if (core_limit > num_cores) {
TRACE_CONFIG("CPU limit should be smaller than the "
"number of CPUS: %d\n", num_cores);
return FALSE;
}
}
}
nfiles = 0;
while ((ent = readdir(dir)) != NULL) {
if (strcmp(ent->d_name, ".") == 0)
continue;
else if (strcmp(ent->d_name, "..") == 0)
continue;
strcpy(fcache[nfiles].name, ent->d_name);
sprintf(fcache[nfiles].fullname, "%s/%s", www_main, ent->d_name);
fd = open(fcache[nfiles].fullname, O_RDONLY);
if (fd < 0) {
perror("open");
continue;
} else {
fcache[nfiles].size = lseek64(fd, 0, SEEK_END);
lseek64(fd, 0, SEEK_SET);
}
fcache[nfiles].file = (char *)malloc(fcache[nfiles].size);
if (!fcache[nfiles].file) {
TRACE_ERROR("Failed to allocate memory for file %s\n",
fcache[nfiles].name);
perror("malloc");
continue;
}
TRACE_INFO("Reading %s (%lu bytes)\n",
fcache[nfiles].name, fcache[nfiles].size);
total_read = 0;
while (1) {
ret = read(fd, fcache[nfiles].file + total_read,
fcache[nfiles].size - total_read);
if (ret < 0) {
break;
} else if (ret == 0) {
break;
}
total_read += ret;
}
if (total_read < fcache[nfiles].size) {
free(fcache[nfiles].file);
continue;
}
close(fd);
nfiles++;
if (nfiles >= MAX_FILES)
break;
}
finished = 0;
/* initialize mtcp */
ret = mtcp_init("epserver.conf");
if (ret) {
TRACE_ERROR("Failed to initialize mtcp\n");
exit(EXIT_FAILURE);
}
/* register signal handler to mtcp */
mtcp_register_signal(SIGINT, SignalHandler);
TRACE_INFO("Application initialization finished.\n");
for (i = 0; i < core_limit; i++) {
cores[i] = i;
done[i] = FALSE;
if (pthread_create(&app_thread[i],
NULL, RunServerThread, (void *)&cores[i])) {
perror("pthread_create");
TRACE_ERROR("Failed to create server thread.\n");
exit(-1);
}
}
for (i = 0; i < core_limit; i++) {
pthread_join(app_thread[i], NULL);
}
mtcp_destroy();
closedir(dir);
return 0;
}

View File

@ -0,0 +1,25 @@
############### mtcp configuration file ###############
# maximum concurrency per core
max_concurrency = 10000
# maximum number of socket buffers per core
# (set this to a small value if there are many idle connections)
max_num_buffers = 10000
# receive buffer size of sockets
rcvbuf = 2048
# send buffer size of sockets
sndbuf = 8192
# tcp timeout in seconds
# (tcp_timeout = -1 disables the timeout check)
tcp_timeout = 30
# tcp timewait in seconds
tcp_timewait = 0
# interface to print stats for
stat_print = xge0
#stat_print = xge1

812
apps/example/epwget.c Normal file
View File

@ -0,0 +1,812 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/queue.h>
#include <assert.h>
#include <mtcp_api.h>
#include <mtcp_epoll.h>
#include "rss.h"
#include "http_parsing.h"
#include "debug.h"
/* Length of the per-core arrays (app_thread, g_mctx, done, flows, g_ctx, g_stat). */
#define MAX_CPUS 16
/* Capacity (excluding the terminator) of the url buffer. */
#define MAX_URL_LEN 128
/* Capacity (excluding the terminator) of the output file name buffers. */
#define MAX_FILE_LEN 128
/* Size of the request buffer and of the buffer holding a response header. */
#define HTTP_HEADER_LEN 1024
/* Passed to mtcp_init_rss() together with saddr. */
#define IP_RANGE 1
/* Capacity (excluding the terminator) of the host buffer. */
#define MAX_IP_STR_LEN 16
/* Size of the buffer used for each mtcp_read() call. */
#define BUF_SIZE (8*1024)
/* Not referenced in the visible code. */
#define CALC_MD5SUM FALSE
/* Convert a struct timeval (passed as an lvalue, not a pointer) to ms / us. */
#define TIMEVAL_TO_MSEC(t) ((t.tv_sec * 1000) + (t.tv_usec / 1000))
#define TIMEVAL_TO_USEC(t) ((t.tv_sec * 1000000) + (t.tv_usec))
/* True when a - b, reinterpreted as a signed 64-bit value, is positive. */
#define TS_GT(a,b) ((int64_t)((a)-(b)) > 0)
/* NOTE: both macros evaluate their arguments more than once. */
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
/* Fallback boolean / error constants when no header defined them already. */
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif
#ifndef ERROR
#define ERROR (-1)
#endif
/*----------------------------------------------------------------------------*/
/* application thread handles, indexed by core */
static pthread_t app_thread[MAX_CPUS];
/* mTCP context of each core, recorded by CreateContext() */
static mctx_t g_mctx[MAX_CPUS];
/* per-core stop flags polled by the main loop of each thread */
static int done[MAX_CPUS];
/*----------------------------------------------------------------------------*/
static int num_cores;
/* number of cores in use; PrintStats() iterates over this many entries */
static int core_limit;
/*----------------------------------------------------------------------------*/
/* when TRUE, every response body is written to "<outfile>.<flow number>" */
static int fio = FALSE;
static char outfile[MAX_FILE_LEN + 1];
/*----------------------------------------------------------------------------*/
/* value of the Host header and the path requested by SendHTTPRequest() */
static char host[MAX_IP_STR_LEN + 1];
static char url[MAX_URL_LEN + 1];
/* destination address and port; assigned to sockaddr_in fields without conversion */
static in_addr_t daddr;
static in_port_t dport;
/* address handed to mtcp_init_rss() */
static in_addr_t saddr;
/*----------------------------------------------------------------------------*/
static int total_flows;
/* number of flows each core has to complete */
static int flows[MAX_CPUS];
/* suffix counter for output file names; incremented without synchronization */
static int flowcnt = 0;
/* maximum number of pending connections per thread */
static int concurrency;
/* length of each thread's wvars table; the event array holds max_fds * 3 entries */
static int max_fds;
/* size of the first completed response; later responses are compared against it */
static int response_size = 0;
/*----------------------------------------------------------------------------*/
/* Per-thread counters; PrintStats() sums them over all cores and then zeroes them. */
struct wget_stat
{
/* number of mtcp_epoll_wait() calls */
uint64_t waits;
/* number of events returned by mtcp_epoll_wait() */
uint64_t events;
/* number of connections started */
uint64_t connects;
/* bytes returned by mtcp_read() */
uint64_t reads;
/* bytes accepted by mtcp_write() */
uint64_t writes;
/* number of completed downloads */
uint64_t completes;
/* number of failed or incomplete downloads */
uint64_t errors;
/* not updated in the visible code */
uint64_t timedout;
/* sum and maximum of the per-download response times, in microseconds */
uint64_t sum_resp_time;
uint64_t max_resp_time;
};
/*----------------------------------------------------------------------------*/
/* State owned by one epwget thread. */
struct thread_context
{
/* core this thread runs on */
int core;
/* mTCP context returned by mtcp_create_context() */
mctx_t mctx;
/* epoll descriptor of this thread */
int ep;
/* per-connection state table with max_fds entries, indexed by socket id */
struct wget_vars *wvars;
/* number of connections this thread has to start in total */
int target;
/* connections started so far */
int started;
/* connections closed after a read or parse error */
int errors;
/* connections closed by the server before the response was complete */
int incompletes;
/* connections closed so far */
int done;
/* connections currently open */
int pending;
/* counters reported by PrintStats() */
struct wget_stat stat;
};
/* NOTE: thread_context_t is a pointer type. */
typedef struct thread_context* thread_context_t;
/*----------------------------------------------------------------------------*/
/* Per-connection state; zeroed by CreateConnection() for each new socket. */
struct wget_vars
{
/* set once the HTTP request has been written */
int request_sent;
/* start of the response, used to locate and parse the header */
char response[HTTP_HEADER_LEN];
/* number of valid bytes in response[] */
int resp_len;
/* TRUE once the response header has been parsed */
int headerset;
/* header length reported by find_http_header() */
uint32_t header_len;
/* value parsed from the response header via CONTENT_LENGTH_HDR */
uint64_t file_len;
/* bytes received so far, header included */
uint64_t recv;
/* bytes written to the output file */
uint64_t write;
/* time the request was sent / the download completed */
struct timeval t_start;
struct timeval t_end;
/* output file descriptor, opened only when fio is set */
int fd;
};
/*----------------------------------------------------------------------------*/
/* Per-core thread context and its stat block; entries stay NULL until the thread sets them. */
static struct thread_context *g_ctx[MAX_CPUS];
static struct wget_stat *g_stat[MAX_CPUS];
/*----------------------------------------------------------------------------*/
thread_context_t
CreateContext(int core)
{
thread_context_t ctx;
ctx = (thread_context_t)calloc(1, sizeof(struct thread_context));
if (!ctx) {
perror("malloc");
TRACE_ERROR("Failed to allocate memory for thread context.\n");
return NULL;
}
ctx->core = core;
ctx->mctx = mtcp_create_context(core);
if (!ctx->mctx) {
TRACE_ERROR("Failed to create mtcp context.\n");
return NULL;
}
g_mctx[core] = ctx->mctx;
return ctx;
}
/*----------------------------------------------------------------------------*/
/*
 * Destroy the mTCP context of a thread context and free the context itself.
 * ctx->wvars is not released here.
 */
void
DestroyContext(thread_context_t ctx)
{
	mctx_t owned = ctx->mctx;

	mtcp_destroy_context(owned);
	free(ctx);
}
/*----------------------------------------------------------------------------*/
/*
 * Open a non-blocking connection to daddr:dport, reset the per-connection
 * state of the new socket and register it for MTCP_EPOLLOUT.
 *
 * Returns the socket id, or -1 when no socket could be created, its id does
 * not fit the wvars table, or the connect failed. Exits the process when the
 * socket cannot be made non-blocking.
 */
inline int
CreateConnection(thread_context_t ctx)
{
	mctx_t mctx = ctx->mctx;
	struct mtcp_epoll_event ev;
	struct sockaddr_in addr;
	int sockid;
	int ret;

	sockid = mtcp_socket(mctx, AF_INET, SOCK_STREAM, 0);
	if (sockid < 0) {
		TRACE_INFO("Failed to create socket!\n");
		return -1;
	}
	/* wvars has max_fds entries; never index past it */
	if (sockid >= max_fds) {
		TRACE_ERROR("Invalid socket id %d.\n", sockid);
		mtcp_close(mctx, sockid);
		return -1;
	}
	memset(&ctx->wvars[sockid], 0, sizeof(struct wget_vars));

	ret = mtcp_setsock_nonblock(mctx, sockid);
	if (ret < 0) {
		TRACE_ERROR("Failed to set socket in nonblocking mode.\n");
		exit(-1);
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = daddr;
	addr.sin_port = dport;

	ret = mtcp_connect(mctx, sockid,
			(struct sockaddr *)&addr, sizeof(struct sockaddr_in));
	if (ret < 0) {
		/* EINPROGRESS is the normal outcome for a non-blocking connect */
		if (errno != EINPROGRESS) {
			perror("mtcp_connect");
			mtcp_close(mctx, sockid);
			return -1;
		}
	}

	ctx->started++;
	ctx->pending++;
	ctx->stat.connects++;

	ev.events = MTCP_EPOLLOUT;
	ev.data.sockid = sockid;
	mtcp_epoll_ctl(mctx, ctx->ep, MTCP_EPOLL_CTL_ADD, sockid, &ev);

	return sockid;
}
/*----------------------------------------------------------------------------*/
/*
 * Unregister and close a socket, update the pending/done counters and start
 * replacement connections while the thread is below its concurrency limit
 * and has not started all of its target flows. A failed replacement marks
 * the thread as done.
 */
inline void
CloseConnection(thread_context_t ctx, int sockid)
{
	mtcp_epoll_ctl(ctx->mctx, ctx->ep, MTCP_EPOLL_CTL_DEL, sockid, NULL);
	mtcp_close(ctx->mctx, sockid);

	ctx->pending--;
	ctx->done++;
	assert(ctx->pending >= 0);

	for (;;) {
		if (ctx->pending >= concurrency || ctx->started >= ctx->target) {
			break;
		}
		if (CreateConnection(ctx) < 0) {
			done[ctx->core] = TRUE;
			break;
		}
	}
}
/*----------------------------------------------------------------------------*/
/*
 * Send the HTTP GET request for the global url/host on a connected socket,
 * switch the socket to read events, record the start time and, when file
 * output is enabled, open the per-flow output file.
 *
 * Always returns 0. Exits the process when the output file cannot be opened.
 */
static inline int
SendHTTPRequest(thread_context_t ctx, int sockid, struct wget_vars *wv)
{
	char request[HTTP_HEADER_LEN];
	struct mtcp_epoll_event ev;
	int wr;
	int len;

	wv->headerset = FALSE;
	wv->recv = 0;
	wv->header_len = wv->file_len = 0;

	snprintf(request, HTTP_HEADER_LEN, "GET %s HTTP/1.0\r\n"
			"User-Agent: Wget/1.12 (linux-gnu)\r\n"
			"Accept: */*\r\n"
			"Host: %s\r\n"
//			"Connection: Keep-Alive\r\n\r\n",
			"Connection: Close\r\n\r\n",
			url, host);
	len = strlen(request);

	wr = mtcp_write(ctx->mctx, sockid, request, len);
	if (wr < len) {
		TRACE_ERROR("Socket %d: Sending HTTP request failed. "
				"try: %d, sent: %d\n", sockid, len, wr);
	}
	/* a negative result must not be added to the unsigned byte counter */
	if (wr > 0) {
		ctx->stat.writes += wr;
	}
	TRACE_APP("Socket %d HTTP Request of %d bytes. sent.\n", sockid, wr);
	wv->request_sent = TRUE;

	ev.events = MTCP_EPOLLIN;
	ev.data.sockid = sockid;
	mtcp_epoll_ctl(ctx->mctx, ctx->ep, MTCP_EPOLL_CTL_MOD, sockid, &ev);

	gettimeofday(&wv->t_start, NULL);

	if (fio) {
		char fname[MAX_FILE_LEN + 1];

		snprintf(fname, MAX_FILE_LEN, "%s.%d", outfile, flowcnt++);
		wv->fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (wv->fd < 0) {
			TRACE_APP("Failed to open file descriptor for %s\n", fname);
			exit(1);
		}
	}

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Account for a finished download: record the end time, check the response
 * size against the first completed response, update the response-time
 * statistics, close the output file and finally close the connection.
 *
 * Always returns 0.
 */
static inline int
DownloadComplete(thread_context_t ctx, int sockid, struct wget_vars *wv)
{
#ifdef APP
	mctx_t mctx = ctx->mctx;
#endif
	uint64_t tdiff;

	TRACE_APP("Socket %d File download complete!\n", sockid);
	gettimeofday(&wv->t_end, NULL);

	ctx->stat.completes++;
	if (response_size == 0) {
		response_size = wv->recv;
		fprintf(stderr, "Response size set to %d\n", response_size);
	} else {
		if (wv->recv != (uint64_t)response_size) {
			fprintf(stderr, "Response size mismatch! mine: %ld, theirs: %d\n",
					(long)wv->recv, response_size);
		}
	}
	tdiff = (wv->t_end.tv_sec - wv->t_start.tv_sec) * 1000000 +
			(wv->t_end.tv_usec - wv->t_start.tv_usec);
	TRACE_APP("Socket %d Total received bytes: %lu (%luMB)\n",
			sockid, wv->recv, wv->recv / 1000000);
	TRACE_APP("Socket %d Total spent time: %lu us\n", sockid, tdiff);
	if (tdiff > 0) {
		TRACE_APP("Socket %d Average bandwidth: %lf[MB/s]\n",
				sockid, (double)wv->recv / tdiff);
	}
	ctx->stat.sum_resp_time += tdiff;
	if (tdiff > ctx->stat.max_resp_time)
		ctx->stat.max_resp_time = tdiff;

	if (fio && wv->fd > 0)
		close(wv->fd);

	/*
	 * Close the socket last: CloseConnection() may start a new connection,
	 * and CreateConnection() zeroes the wvars slot of the socket id it gets.
	 * If that id equals sockid, wv must not be read afterwards.
	 */
	CloseConnection(ctx, sockid);

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Drain a readable socket: parse the response header from the first data,
 * count received bytes, optionally write the body to the output file and
 * finish the download once header_len + file_len bytes have arrived.
 *
 * The header has to be found in the data of a single read; otherwise the
 * connection is counted as an error and closed.
 *
 * Always returns 0.
 */
static inline int
HandleReadEvent(thread_context_t ctx, int sockid, struct wget_vars *wv)
{
	mctx_t mctx = ctx->mctx;
	char buf[BUF_SIZE];
	char *pbuf;
	int rd, copy_len;
	int body_len;		/* bytes of buf that belong to the body */
	int hdr_len;		/* result of find_http_header() */
	int hdr_in_buf;		/* bytes of buf that belong to the header */

	rd = 1;
	while (rd > 0) {
		/* rd keeps the mtcp_read() result; the checks after the loop rely on it */
		rd = mtcp_read(mctx, sockid, buf, BUF_SIZE);
		if (rd <= 0)
			break;
		ctx->stat.reads += rd;

		TRACE_APP("Socket %d: mtcp_read ret: %d, total_recv: %lu, "
				"header_set: %d, header_len: %u, file_len: %lu\n",
				sockid, rd, wv->recv + rd,
				wv->headerset, wv->header_len, wv->file_len);

		pbuf = buf;
		body_len = rd;
		if (!wv->headerset) {
			/* keep one byte free so the header can be NUL-terminated in place */
			copy_len = MIN(rd, (HTTP_HEADER_LEN - 1) - wv->resp_len);
			memcpy(wv->response + wv->resp_len, buf, copy_len);
			wv->resp_len += copy_len;
			hdr_len = find_http_header(wv->response, wv->resp_len);
			if (hdr_len > 0) {
				wv->header_len = hdr_len;
				wv->response[wv->header_len] = '\0';
				wv->file_len = http_header_long_val(wv->response,
						CONTENT_LENGTH_HDR, sizeof(CONTENT_LENGTH_HDR) - 1);
				TRACE_APP("Socket %d Parsed response header. "
						"Header length: %u, File length: %lu (%luMB)\n",
						sockid, wv->header_len,
						wv->file_len, wv->file_len / 1024 / 1024);
				wv->headerset = TRUE;

				/* skip the header bytes of this read; the rest of buf is body */
				hdr_in_buf = hdr_len - (wv->resp_len - copy_len);
				wv->recv += hdr_in_buf;
				pbuf += hdr_in_buf;
				body_len = rd - hdr_in_buf;
			} else {
				/* failed to parse response header */
				wv->header_len = 0;
				wv->recv += rd;
				ctx->stat.errors++;
				ctx->errors++;
				CloseConnection(ctx, sockid);
				return 0;
			}
		}
		wv->recv += body_len;

		if (fio && wv->fd > 0) {
			int wr = 0;
			while (wr < body_len) {
				int n = write(wv->fd, pbuf + wr, body_len - wr);
				if (n < 0) {
					perror("write");
					TRACE_ERROR("Failed to write.\n");
					assert(0);
					break;
				}
				wr += n;
				wv->write += n;
			}
		}

		if (wv->header_len && (wv->recv >= wv->header_len + wv->file_len)) {
			break;
		}
	}

	if (rd > 0) {
		if (wv->header_len && (wv->recv >= wv->header_len + wv->file_len)) {
			TRACE_APP("Socket %d Done Write: "
					"header: %u file: %lu recv: %lu write: %lu\n",
					sockid, wv->header_len, wv->file_len,
					wv->recv - wv->header_len, wv->write);
			DownloadComplete(ctx, sockid, wv);

			return 0;
		}

	} else if (rd == 0) {
		/* connection closed by remote host */
		TRACE_DBG("Socket %d connection closed with server.\n", sockid);

		if (wv->header_len && (wv->recv >= wv->header_len + wv->file_len)) {
			DownloadComplete(ctx, sockid, wv);
		} else {
			ctx->stat.errors++;
			ctx->incompletes++;
			CloseConnection(ctx, sockid);
		}

	} else if (rd < 0) {
		if (errno != EAGAIN) {
			TRACE_DBG("Socket %d: mtcp_read() error %s\n",
					sockid, strerror(errno));
			ctx->stat.errors++;
			ctx->errors++;
			CloseConnection(ctx, sockid);
		}
	}

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Disabled variant of PrintStats(), excluded from the build by #if 0.
 * It prints per-core and total transaction counts from g_trans[], which is
 * not declared in the visible part of this file.
 */
#if 0
void
PrintStats()
{
#define LINE_LEN 2048
char line[LINE_LEN];
int total_trans;
int i;
total_trans = 0;
line[0] = '\0';
//sprintf(line, "Trans/s: ");
for (i = 0; i < core_limit; i++) {
//sprintf(line + strlen(line), "%6d ", g_trans[i]);
sprintf(line + strlen(line), "[CPU%2d] %7d trans/s ", i, g_trans[i]);
total_trans += g_trans[i];
g_trans[i] = 0;
if (i % 4 == 3)
sprintf(line + strlen(line), "\n");
}
fprintf(stderr, "%s", line);
fprintf(stderr, "[ ALL ] %7d trans/s\n", total_trans);
//sprintf(line + strlen(line), "total: %6d", total_trans);
//printf("%s\n", line);
//fprintf(stderr, "Transactions/s: %d\n", total_trans);
fflush(stderr);
}
#endif
/*----------------------------------------------------------------------------*/
/*
 * Sum the counters of all cores, print one summary line to stderr and reset
 * the per-core counters. Cores whose stat block is not published yet
 * (g_stat[i] == NULL) are skipped.
 *
 * The printed average response time is the mean of the per-core averages
 * over core_limit cores.
 */
static void
PrintStats()
{
	struct wget_stat total = {0};
	struct wget_stat *st;
	uint64_t avg_resp_time;
	uint64_t total_resp_time = 0;
	int i;

	for (i = 0; i < core_limit; i++) {
		st = g_stat[i];
		if (!st) {
			/* that thread has not set up its context yet */
			continue;
		}
		avg_resp_time = st->completes? st->sum_resp_time / st->completes : 0;
#if 0
		fprintf(stderr, "[CPU%2d] epoll_wait: %5lu, event: %7lu, "
				"connect: %7lu, read: %4lu MB, write: %4lu MB, "
				"completes: %7lu (resp_time avg: %4lu, max: %6lu us), "
				"errors: %2lu (timedout: %2lu)\n",
				i, st->waits, st->events, st->connects,
				st->reads / 1000 / 1000, st->writes / 1000 / 1000,
				st->completes, avg_resp_time, st->max_resp_time,
				st->errors, st->timedout);
#endif

		total.waits += st->waits;
		total.events += st->events;
		total.connects += st->connects;
		total.reads += st->reads;
		total.writes += st->writes;
		total.completes += st->completes;
		total_resp_time += avg_resp_time;
		if (st->max_resp_time > total.max_resp_time)
			total.max_resp_time = st->max_resp_time;
		total.errors += st->errors;
		total.timedout += st->timedout;

		memset(st, 0, sizeof(struct wget_stat));
	}
	fprintf(stderr, "[ ALL ] connect: %7lu, read: %4lu MB, write: %4lu MB, "
			"completes: %7lu (resp_time avg: %4lu, max: %6lu us)\n",
			total.connects,
			total.reads / 1000 / 1000, total.writes / 1000 / 1000,
			total.completes, total_resp_time / core_limit, total.max_resp_time);
#if 0
	fprintf(stderr, "[ ALL ] epoll_wait: %5lu, event: %7lu, "
			"connect: %7lu, read: %4lu MB, write: %4lu MB, "
			"completes: %7lu (resp_time avg: %4lu, max: %6lu us), "
			"errors: %2lu (timedout: %2lu)\n",
			total.waits, total.events, total.connects,
			total.reads / 1000 / 1000, total.writes / 1000 / 1000,
			total.completes, total_resp_time / core_limit, total.max_resp_time,
			total.errors, total.timedout);
#endif
}
/*----------------------------------------------------------------------------*/
/*
 * Per-core wget worker; used as the pthread start routine.
 *
 * arg points to an int holding the core index served by this thread.
 * The thread calls mtcp_core_affinitize() for that core, creates its
 * context, publishes it in g_ctx[] / g_stat[], and then loops:
 *   - core 0 calls PrintStats() whenever the wall-clock second changes,
 *   - CreateConnection() is called while fewer than `concurrency`
 *     connections are pending and fewer than flows[core] were started,
 *   - events returned by mtcp_epoll_wait() are dispatched to the
 *     error / read / write handlers.
 * The loop ends when done[core] is set, mtcp_epoll_wait() fails, or
 * ctx->done reaches ctx->target.
 *
 * Always returns NULL. Failure to create the epoll instance or to allocate
 * the event / per-socket arrays terminates the process via exit().
 */
void *
RunWgetMain(void *arg)
{
	thread_context_t ctx;
	mctx_t mctx;
	int core = *(int *)arg;
	struct in_addr daddr_in;
	int n, maxevents;
	int ep;
	struct mtcp_epoll_event *events;
	int nevents;
	struct wget_vars *wvars;
	int i;
	struct timeval cur_tv, prev_tv;

	mtcp_core_affinitize(core);

	ctx = CreateContext(core);
	if (!ctx) {
		return NULL;
	}
	mctx = ctx->mctx;
	g_ctx[core] = ctx;
	g_stat[core] = &ctx->stat;
	srand(time(NULL));

	mtcp_init_rss(mctx, saddr, IP_RANGE, daddr, dport);

	n = flows[core];
	if (n == 0) {
		/* NOTE(review): ctx is not destroyed on this path -- confirm
		 * whether DestroyContext() should be called here. */
		TRACE_DBG("Application thread %d finished.\n", core);
		pthread_exit(NULL);
		return NULL;
	}
	ctx->target = n;

	daddr_in.s_addr = daddr;
	fprintf(stderr, "Thread %d handles %d flows. connecting to %s:%u\n",
			core, n, inet_ntoa(daddr_in), ntohs(dport));

	/* Initialization */
	maxevents = max_fds * 3;
	ep = mtcp_epoll_create(mctx, maxevents);
	if (ep < 0) {
		/* the original message ended in a literal 'n' instead of '\n' */
		TRACE_ERROR("Failed to create epoll struct!\n");
		exit(EXIT_FAILURE);
	}
	events = (struct mtcp_epoll_event *)
			calloc(maxevents, sizeof(struct mtcp_epoll_event));
	if (!events) {
		TRACE_ERROR("Failed to allocate events!\n");
		exit(EXIT_FAILURE);
	}
	ctx->ep = ep;

	/* per-socket state, indexed by the socket id found in each event */
	wvars = (struct wget_vars *)calloc(max_fds, sizeof(struct wget_vars));
	if (!wvars) {
		TRACE_ERROR("Failed to create wget variables!\n");
		exit(EXIT_FAILURE);
	}
	ctx->wvars = wvars;

	ctx->started = ctx->done = ctx->pending = 0;
	ctx->errors = ctx->incompletes = 0;

	gettimeofday(&cur_tv, NULL);
	prev_tv = cur_tv;

	while (!done[core]) {
		gettimeofday(&cur_tv, NULL);

		/* print statistics every second */
		if (core == 0 && cur_tv.tv_sec > prev_tv.tv_sec) {
			PrintStats();
			prev_tv = cur_tv;
		}

		/* top up the in-flight connections to the per-core limit */
		while (ctx->pending < concurrency && ctx->started < ctx->target) {
			if (CreateConnection(ctx) < 0) {
				done[core] = TRUE;
				break;
			}
		}

		nevents = mtcp_epoll_wait(mctx, ep, events, maxevents, -1);
		ctx->stat.waits++;

		if (nevents < 0) {
			if (errno != EINTR) {
				TRACE_ERROR("mtcp_epoll_wait failed! ret: %d\n", nevents);
			}
			done[core] = TRUE;
			break;
		} else {
			ctx->stat.events += nevents;
		}

		for (i = 0; i < nevents; i++) {
			if (events[i].events & MTCP_EPOLLERR) {
				int err;
				socklen_t len = sizeof(err);

				TRACE_APP("[CPU %d] Error on socket %d\n",
						core, events[i].data.sockid);
				ctx->stat.errors++;
				ctx->errors++;
				/* count timeouts separately from other socket errors */
				if (mtcp_getsockopt(mctx, events[i].data.sockid,
							SOL_SOCKET, SO_ERROR, (void *)&err, &len) == 0) {
					if (err == ETIMEDOUT)
						ctx->stat.timedout++;
				}
				CloseConnection(ctx, events[i].data.sockid);

			} else if (events[i].events & MTCP_EPOLLIN) {
				HandleReadEvent(ctx,
						events[i].data.sockid, &wvars[events[i].data.sockid]);

			} else if (events[i].events == MTCP_EPOLLOUT) {
				/* NOTE(review): this is an exact match, unlike the bitmask
				 * tests above; any other flag combination reaches the
				 * assert below -- confirm this is intended. */
				struct wget_vars *wv = &wvars[events[i].data.sockid];

				if (!wv->request_sent) {
					SendHTTPRequest(ctx, events[i].data.sockid, wv);
				}

			} else {
				TRACE_ERROR("Socket %d: event: %s\n",
						events[i].data.sockid, EventToString(events[i].events));
				assert(0);
			}
		}

		if (ctx->done >= ctx->target) {
			fprintf(stdout, "[CPU %d] Completed %d connections, "
					"errors: %d incompletes: %d\n",
					ctx->core, ctx->done, ctx->errors, ctx->incompletes);
			break;
		}
	}

	TRACE_INFO("Wget thread %d waiting for mtcp to be destroyed.\n", core);
	DestroyContext(ctx);

	/* The event array is only referenced through this local pointer, so it
	 * is released here.
	 * NOTE(review): wvars is also reachable through ctx->wvars; it is left
	 * alone because DestroyContext() may own it -- confirm and free it in
	 * whichever place owns it. */
	free(events);

	TRACE_DBG("Wget thread %d finished.\n", core);
	pthread_exit(NULL);
	return NULL;
}
/*----------------------------------------------------------------------------*/
/*
 * Signal callback: raises the done[] flag of every core below core_limit.
 * The signal number itself is not inspected.
 */
void
SignalHandler(int signum)
{
	int core = 0;

	while (core < core_limit) {
		done[core] = TRUE;
		core++;
	}
}
/*----------------------------------------------------------------------------*/
/*
 * Program entry point.
 *
 * Command line: <prog> url #flows [-N cores] [-c total_concurrency] [-o file]
 *   url     IPv4 address optionally followed by "/path"; the part before the
 *           first '/' is stored in `host`, the remainder (or "/") in `url`.
 *   #flows  total number of connections to run (must be > 0).
 *   -N      number of cores to use (1 .. number of CPUs).
 *   -c      total concurrency, divided evenly among the cores.
 *   -o      output file name.
 *
 * Initializes mTCP from "epwget.conf", distributes the flows over the cores,
 * starts one RunWgetMain thread per core and waits for all of them.
 *
 * Returns 0 on success and EXIT_FAILURE on a usage / argument error;
 * mTCP initialization or thread creation failures call exit().
 */
int
main(int argc, char **argv)
{
	struct mtcp_conf mcfg;
	int cores[MAX_CPUS];
	int flow_per_thread;
	int flow_remainder_cnt;
	int total_concurrency = 0;
	char *slash_p;
	size_t host_len;
	int ret;
	int i;

	if (argc < 3) {
		TRACE_CONFIG("Too few arguments!\n");
		TRACE_CONFIG("Usage: %s url #flows [output]\n", argv[0]);
		return EXIT_FAILURE;
	}

	if (strlen(argv[1]) > MAX_URL_LEN) {
		TRACE_CONFIG("Length of URL should be smaller than %d!\n", MAX_URL_LEN);
		return EXIT_FAILURE;
	}

	/* Split "host[/path]". The host part is bounded before it is copied:
	 * the original copied (slash_p - argv[1]) bytes without any limit.
	 * NOTE(review): like the other copies in this function, this relies on
	 * `host` / `url` being zero-initialized buffers with room for the
	 * maximum length plus a terminator -- confirm against their
	 * declarations. */
	slash_p = strchr(argv[1], '/');
	host_len = slash_p ? (size_t)(slash_p - argv[1]) : strlen(argv[1]);
	if (host_len > (size_t)MAX_IP_STR_LEN) {
		TRACE_CONFIG("Length of host address should not exceed %d!\n",
				MAX_IP_STR_LEN);
		return EXIT_FAILURE;
	}
	strncpy(host, argv[1], host_len);
	if (slash_p) {
		strncpy(url, slash_p, MAX_URL_LEN);
	} else {
		strncpy(url, "/", MAX_URL_LEN);
	}

	daddr = inet_addr(host);
	if (daddr == INADDR_NONE) {
		/* inet_addr() does no name resolution; reject anything that is
		 * not a numeric IPv4 address instead of using the error value
		 * as a destination. */
		TRACE_CONFIG("Invalid IPv4 address: %s\n", host);
		return EXIT_FAILURE;
	}
	dport = htons(80);
	saddr = INADDR_ANY;

	total_flows = atoi(argv[2]);
	if (total_flows <= 0) {
		TRACE_CONFIG("Number of flows should be large than 0.\n");
		return EXIT_FAILURE;
	}

	num_cores = GetNumCPUs();
	core_limit = num_cores;
	concurrency = 100;

	for (i = 3; i < argc - 1; i++) {
		if (strcmp(argv[i], "-N") == 0) {
			core_limit = atoi(argv[i + 1]);
			if (core_limit > num_cores) {
				TRACE_CONFIG("CPU limit should be smaller than the "
						"number of CPUS: %d\n", num_cores);
				return EXIT_FAILURE;
			}
			if (core_limit <= 0) {
				/* core_limit is used as a divisor below */
				TRACE_CONFIG("CPU limit should be larger than 0.\n");
				return EXIT_FAILURE;
			}
		} else if (strcmp(argv[i], "-c") == 0) {
			total_concurrency = atoi(argv[i + 1]);
		} else if (strcmp(argv[i], "-o") == 0) {
			if (strlen(argv[i + 1]) > MAX_FILE_LEN) {
				TRACE_CONFIG("Output file length should be smaller than %d!\n",
						MAX_FILE_LEN);
				return EXIT_FAILURE;
			}
			fio = TRUE;
			strncpy(outfile, argv[i + 1], MAX_FILE_LEN);
		}
	}

	/* cores[] below holds MAX_CPUS entries and is indexed up to core_limit */
	if (core_limit > MAX_CPUS) {
		TRACE_CONFIG("Limiting the number of cores to %d.\n", MAX_CPUS);
		core_limit = MAX_CPUS;
	}

	/* never use more cores than there are flows */
	if (total_flows < core_limit) {
		core_limit = total_flows;
	}

	/* per-core concurrency = total_concurrency / # cores */
	if (total_concurrency > 0) {
		concurrency = total_concurrency / core_limit;
		/* The integer division yields 0 when fewer connections than cores
		 * were requested; RunWgetMain only opens a connection while
		 * pending < concurrency, so 0 would never start any flow. */
		if (concurrency == 0)
			concurrency = 1;
	}

	/* set the max number of fds 3x larger than concurrency */
	max_fds = concurrency * 3;

	TRACE_CONFIG("Application configuration:\n");
	TRACE_CONFIG("URL: %s\n", url);
	TRACE_CONFIG("# of total_flows: %d\n", total_flows);
	TRACE_CONFIG("# of cores: %d\n", core_limit);
	TRACE_CONFIG("Concurrency: %d\n", total_concurrency);
	if (fio) {
		TRACE_CONFIG("Output file: %s\n", outfile);
	}

	ret = mtcp_init("epwget.conf");
	if (ret) {
		TRACE_ERROR("Failed to initialize mtcp.\n");
		exit(EXIT_FAILURE);
	}

	/* size the mTCP tables to what this run can actually use */
	mtcp_getconf(&mcfg);
	mcfg.max_concurrency = max_fds;
	mcfg.max_num_buffers = max_fds;
	mtcp_setconf(&mcfg);

	mtcp_register_signal(SIGINT, SignalHandler);

	/* spread the flows evenly; the first (total_flows % core_limit)
	 * cores take one extra flow each */
	flow_per_thread = total_flows / core_limit;
	flow_remainder_cnt = total_flows % core_limit;
	for (i = 0; i < core_limit; i++) {
		cores[i] = i;
		done[i] = FALSE;
		flows[i] = flow_per_thread;

		if (flow_remainder_cnt-- > 0)
			flows[i]++;

		if (flows[i] == 0)
			continue;

		if (pthread_create(&app_thread[i],
					NULL, RunWgetMain, (void *)&cores[i])) {
			perror("pthread_create");
			TRACE_ERROR("Failed to create wget thread.\n");
			exit(-1);
		}
	}

	for (i = 0; i < core_limit; i++) {
		pthread_join(app_thread[i], NULL);
		TRACE_INFO("Wget thread %d joined.\n", i);
	}

	mtcp_destroy();
	return 0;
}
/*----------------------------------------------------------------------------*/

18
apps/example/epwget.conf Normal file
View File

@ -0,0 +1,18 @@
############### mtcp configuration file ###############
# receive buffer size of sockets
rcvbuf = 8192
# send buffer size of sockets
sndbuf = 2048
# tcp timeout in seconds
# (set tcp_timeout = -1 to disable the timeout check)
tcp_timeout = 30
# tcp timewait in seconds
tcp_timewait = 0
# interface to print stats
stat_print = xge0
#stat_print = xge1

7
config/sample_arp.conf Normal file
View File

@ -0,0 +1,7 @@
# This file configures the static ARP table
# Rename this file to arp.conf and set the appropriate values
# (Destination IP address) (Destination MAC address)
ARP_ENTRY 2
10.0.0.1 00:00:00:00:00:01
10.0.1.1 00:00:00:00:00:02

25
config/sample_mtcp.conf Normal file
View File

@ -0,0 +1,25 @@
############### mtcp configuration file ###############
# maximum concurrency per core
max_concurrency = 100000
# maximum number of socket buffers per core
max_num_buffers = 100000
# receive buffer size of sockets
rcvbuf = 8192
# send buffer size of sockets
sndbuf = 8192
# tcp timeout in seconds
# (set tcp_timeout = -1 to disable the timeout check)
tcp_timeout = 30
# tcp timewait in seconds
tcp_timewait = 0
# NICs to print network statistics per second
# if enabled, mTCP will print xx Gbps and xx pps for RX and TX
stat_print = xge0
#stat_print = xge1

7
config/sample_route.conf Normal file
View File

@ -0,0 +1,7 @@
# This file is an example routing table (the one used on coffee5)
# Copy this file to route.conf and fill in the appropriate routes
# (Destination address)/(Prefix) (Device name)
ROUTES 2
10.0.0.1/24 xge0
10.0.1.1/24 xge1

339
io_engine/COPYING Normal file
View File

@ -0,0 +1,339 @@
"This software program is licensed subject to the GNU General Public License
(GPL). Version 2, June 1991, available at
<http://www.fsf.org/copyleft/gpl.html>"
GNU General Public License
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
Everyone is permitted to copy and distribute verbatim copies of this license
document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your freedom to
share and change it. By contrast, the GNU General Public License is intended
to guarantee your freedom to share and change free software--to make sure
the software is free for all its users. This General Public License applies
to most of the Free Software Foundation's software and to any other program
whose authors commit to using it. (Some other Free Software Foundation
software is covered by the GNU Library General Public License instead.) You
can apply it to your programs, too.
When we speak of free software, we are referring to freedom, not price. Our
General Public Licenses are designed to make sure that you have the freedom
to distribute copies of free software (and charge for this service if you
wish), that you receive source code or can get it if you want it, that you
can change the software or use pieces of it in new free programs; and that
you know you can do these things.
To protect your rights, we need to make restrictions that forbid anyone to
deny you these rights or to ask you to surrender the rights. These
restrictions translate to certain responsibilities for you if you distribute
copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether gratis or
for a fee, you must give the recipients all the rights that you have. You
must make sure that they, too, receive or can get the source code. And you
must show them these terms so they know their rights.
We protect your rights with two steps: (1) copyright the software, and (2)
offer you this license which gives you legal permission to copy, distribute
and/or modify the software.
Also, for each author's protection and ours, we want to make certain that
everyone understands that there is no warranty for this free software. If
the software is modified by someone else and passed on, we want its
recipients to know that what they have is not the original, so that any
problems introduced by others will not reflect on the original authors'
reputations.
Finally, any free program is threatened constantly by software patents. We
wish to avoid the danger that redistributors of a free program will
individually obtain patent licenses, in effect making the program
proprietary. To prevent this, we have made it clear that any patent must be
licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and modification
follow.
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains a notice
placed by the copyright holder saying it may be distributed under the
terms of this General Public License. The "Program", below, refers to any
such program or work, and a "work based on the Program" means either the
Program or any derivative work under copyright law: that is to say, a
work containing the Program or a portion of it, either verbatim or with
modifications and/or translated into another language. (Hereinafter,
translation is included without limitation in the term "modification".)
Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of running
the Program is not restricted, and the output from the Program is covered
only if its contents constitute a work based on the Program (independent
of having been made by running the Program). Whether that is true depends
on what the Program does.
1. You may copy and distribute verbatim copies of the Program's source code
as you receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice and
disclaimer of warranty; keep intact all the notices that refer to this
License and to the absence of any warranty; and give any other recipients
of the Program a copy of this License along with the Program.
You may charge a fee for the physical act of transferring a copy, and you
may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion of it,
thus forming a work based on the Program, and copy and distribute such
modifications or work under the terms of Section 1 above, provided that
you also meet all of these conditions:
* a) You must cause the modified files to carry prominent notices stating
that you changed the files and the date of any change.
* b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any part
thereof, to be licensed as a whole at no charge to all third parties
under the terms of this License.
* c) If the modified program normally reads commands interactively when
run, you must cause it, when started running for such interactive
use in the most ordinary way, to print or display an announcement
including an appropriate copyright notice and a notice that there is
no warranty (or else, saying that you provide a warranty) and that
users may redistribute the program under these conditions, and
telling the user how to view a copy of this License. (Exception: if
the Program itself is interactive but does not normally print such
an announcement, your work based on the Program is not required to
print an announcement.)
These requirements apply to the modified work as a whole. If identifiable
sections of that work are not derived from the Program, and can be
reasonably considered independent and separate works in themselves, then
this License, and its terms, do not apply to those sections when you
distribute them as separate works. But when you distribute the same
sections as part of a whole which is a work based on the Program, the
distribution of the whole must be on the terms of this License, whose
permissions for other licensees extend to the entire whole, and thus to
each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of a
storage or distribution medium does not bring the other work under the
scope of this License.
3. You may copy and distribute the Program (or a work based on it, under
Section 2) in object code or executable form under the terms of Sections
1 and 2 above provided that you also do one of the following:
* a) Accompany it with the complete corresponding machine-readable source
code, which must be distributed under the terms of Sections 1 and 2
above on a medium customarily used for software interchange; or,
* b) Accompany it with a written offer, valid for at least three years,
to give any third party, for a charge no more than your cost of
physically performing source distribution, a complete machine-
readable copy of the corresponding source code, to be distributed
under the terms of Sections 1 and 2 above on a medium customarily
used for software interchange; or,
* c) Accompany it with the information you received as to the offer to
distribute corresponding source code. (This alternative is allowed
only for noncommercial distribution and only if you received the
program in object code or executable form with such an offer, in
accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source code
means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to control
compilation and installation of the executable. However, as a special
exception, the source code distributed need not include anything that is
normally distributed (in either source or binary form) with the major
components (compiler, kernel, and so on) of the operating system on which
the executable runs, unless that component itself accompanies the
executable.
If distribution of executable or object code is made by offering access
to copy from a designated place, then offering equivalent access to copy
the source code from the same place counts as distribution of the source
code, even though third parties are not compelled to copy the source
along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program except as
expressly provided under this License. Any attempt otherwise to copy,
modify, sublicense or distribute the Program is void, and will
automatically terminate your rights under this License. However, parties
who have received copies, or rights, from you under this License will not
have their licenses terminated so long as such parties remain in full
compliance.
5. You are not required to accept this License, since you have not signed
it. However, nothing else grants you permission to modify or distribute
the Program or its derivative works. These actions are prohibited by law
if you do not accept this License. Therefore, by modifying or
distributing the Program (or any work based on the Program), you
indicate your acceptance of this License to do so, and all its terms and
conditions for copying, distributing or modifying the Program or works
based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further restrictions
on the recipients' exercise of the rights granted herein. You are not
responsible for enforcing compliance by third parties to this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot distribute
so as to satisfy simultaneously your obligations under this License and
any other pertinent obligations, then as a consequence you may not
distribute the Program at all. For example, if a patent license would
not permit royalty-free redistribution of the Program by all those who
receive copies directly or indirectly through you, then the only way you
could satisfy both it and this License would be to refrain entirely from
distribution of the Program.
If any portion of this section is held invalid or unenforceable under any
particular circumstance, the balance of the section is intended to apply
and the section as a whole is intended to apply in other circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is implemented
by public license practices. Many people have made generous contributions
to the wide range of software distributed through that system in
reliance on consistent application of that system; it is up to the
author/donor to decide if he or she is willing to distribute software
through any other system and a licensee cannot impose that choice.
This section is intended to make thoroughly clear what is believed to be
a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in certain
countries either by patents or by copyrighted interfaces, the original
copyright holder who places the Program under this License may add an
explicit geographical distribution limitation excluding those countries,
so that distribution is permitted only in or among countries not thus
excluded. In such case, this License incorporates the limitation as if
written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions of
the General Public License from time to time. Such new versions will be
similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and
conditions either of that version or of any later version published by
the Free Software Foundation. If the Program does not specify a version
number of this License, you may choose any version ever published by the
Free Software Foundation.
10. If you wish to incorporate parts of the Program into other free programs
whose distribution conditions are different, write to the author to ask
for permission. For software which is copyrighted by the Free Software
Foundation, write to the Free Software Foundation; we sometimes make
exceptions for this. Our decision will be guided by the two goals of
preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH
YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
NECESSARY SERVICING, REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR
DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL
DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM
(INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED
INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF
THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR
OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it free
software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest to
attach them to the start of each source file to most effectively convey the
exclusion of warranty; and each file should have at least the "copyright"
line and a pointer to where the full notice is found.
one line to give the program's name and an idea of what it does.
Copyright (C) yyyy name of author
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59
Temple Place - Suite 330, Boston, MA 02111-1307, USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this when
it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author Gnomovision comes
with ABSOLUTELY NO WARRANTY; for details type 'show w'. This is free
software, and you are welcome to redistribute it under certain conditions;
type 'show c' for details.
The hypothetical commands 'show w' and 'show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may be
called something other than 'show w' and 'show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
'Gnomovision' (which makes passes at compilers) written by James Hacker.
signature of Ty Coon, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Library General Public
License instead of this License.

361
io_engine/driver/Makefile Normal file
View File

@ -0,0 +1,361 @@
################################################################################
#
# Intel 10 Gigabit PCI Express Linux driver
# Copyright(c) 1999 - 2009 Intel Corporation.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms and conditions of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
#
# The full GNU General Public License is included in this distribution in
# the file called "COPYING".
#
# Contact Information:
# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
#
################################################################################
###########################################################################
# Driver files
# core driver files
CFILES = ixgbe_main.c ixgbe_common.c ixgbe_api.c ixgbe_param.c \
ixgbe_ethtool.c kcompat.c ixgbe_82598.c \
ixgbe_82599.c \
ixgbe_dcb.c ixgbe_dcb_82598.c \
ixgbe_dcb_82599.c \
ixgbe_phy.c
HFILES = ixgbe.h ixgbe_common.h ixgbe_api.h ixgbe_osdep.h kcompat.h \
ixgbe_dcb.h \
ixgbe_phy.h
ifeq (,$(BUILD_KERNEL))
BUILD_KERNEL=$(shell uname -r)
endif
DRIVER_NAME=ps_ixgbe
###########################################################################
# Environment tests
# Kernel Search Path
# All the places we look for kernel source
KSP := /lib/modules/$(BUILD_KERNEL)/build \
/lib/modules/$(BUILD_KERNEL)/source \
/usr/src/linux-$(BUILD_KERNEL) \
/usr/src/linux-$($(BUILD_KERNEL) | sed 's/-.*//') \
/usr/src/kernel-headers-$(BUILD_KERNEL) \
/usr/src/kernel-source-$(BUILD_KERNEL) \
/usr/src/linux-$($(BUILD_KERNEL) | sed 's/\([0-9]*\.[0-9]*\)\..*/\1/') \
/usr/src/linux
# prune the list down to only values that exist
# and have an include/linux sub-directory
test_dir = $(shell [ -e $(dir)/include/linux ] && echo $(dir))
KSP := $(foreach dir, $(KSP), $(test_dir))
# we will use this first valid entry in the search path
ifeq (,$(KSRC))
KSRC := $(firstword $(KSP))
endif
ifeq (,$(KSRC))
$(warning *** Linux kernel source not found in any of these locations:)
$(warning $(KSP))
$(warning *** Install the appropriate kernel development package, e.g.)
$(error kernel-devel, for building kernel modules and try again)
else
ifeq (/lib/modules/$(BUILD_KERNEL)/source, $(KSRC))
KOBJ := /lib/modules/$(BUILD_KERNEL)/build
else
KOBJ := $(KSRC)
endif
endif
# Version file Search Path
VSP := $(KOBJ)/include/generated/utsrelease.h \
$(KOBJ)/include/linux/utsrelease.h \
$(KOBJ)/include/linux/version.h \
/boot/vmlinuz.version.h
# Config file Search Path
CSP := $(KSRC)/include/generated/autoconf.h \
$(KSRC)/include/linux/autoconf.h \
/boot/vmlinuz.autoconf.h
# prune the lists down to only files that exist
test_file = $(shell [ -f $(file) ] && echo $(file))
VSP := $(foreach file, $(VSP), $(test_file))
CSP := $(foreach file, $(CSP), $(test_file))
# and use the first valid entry in the Search Paths
ifeq (,$(VERSION_FILE))
VERSION_FILE := $(firstword $(VSP))
endif
ifeq (,$(CONFIG_FILE))
CONFIG_FILE := $(firstword $(CSP))
endif
ifeq (,$(wildcard $(VERSION_FILE)))
$(error Linux kernel source not configured - missing version.h)
endif
ifeq (,$(wildcard $(CONFIG_FILE)))
$(error Linux kernel source not configured - missing autoconf.h)
endif
# pick a compiler
ifneq (,$(findstring egcs-2.91.66, $(shell cat /proc/version)))
CC := kgcc gcc cc
else
CC := gcc cc
endif
test_cc = $(shell $(cc) --version > /dev/null 2>&1 && echo $(cc))
CC := $(foreach cc, $(CC), $(test_cc))
CC := $(firstword $(CC))
ifeq (,$(CC))
$(error Compiler not found)
endif
# we need to know what platform the driver is being built on
# some additional features are only built on Intel platforms
ARCH := $(shell uname -m | sed 's/i.86/i386/')
ifeq ($(ARCH),alpha)
EXTRA_CFLAGS += -ffixed-8 -mno-fp-regs
endif
ifeq ($(ARCH),x86_64)
EXTRA_CFLAGS += -mcmodel=kernel -mno-red-zone
endif
ifeq ($(ARCH),ppc)
EXTRA_CFLAGS += -msoft-float
endif
ifeq ($(ARCH),ppc64)
EXTRA_CFLAGS += -m64 -msoft-float
LDFLAGS += -melf64ppc
endif
# extra flags for module builds
EXTRA_CFLAGS += -DDRIVER_$(shell echo $(DRIVER_NAME) | tr '[a-z]' '[A-Z]')
EXTRA_CFLAGS += -DDRIVER_NAME=$(DRIVER_NAME)
EXTRA_CFLAGS += -DDRIVER_NAME_CAPS=$(shell echo $(DRIVER_NAME) | tr '[a-z]' '[A-Z]')
# standard flags for module builds
EXTRA_CFLAGS += -DLINUX -D__KERNEL__ -DMODULE -O3 -pipe -Wall
EXTRA_CFLAGS += -I$(KSRC)/include -I.
EXTRA_CFLAGS += $(shell [ -f $(KSRC)/include/linux/modversions.h ] && \
echo "-DMODVERSIONS -DEXPORT_SYMTAB \
-include $(KSRC)/include/linux/modversions.h")
EXTRA_CFLAGS += $(CFLAGS_EXTRA)
EXTRA_CFLAGS += -DIXGBE_RSS
# do nothing here, so it's stripped properly
RHC := $(KSRC)/include/linux/rhconfig.h
ifneq (,$(wildcard $(RHC)))
# 7.3 typo in rhconfig.h
ifneq (,$(shell $(CC) $(CFLAGS) -E -dM $(RHC) | grep __module__bigmem))
EXTRA_CFLAGS += -D__module_bigmem
endif
endif
# get the kernel version - we use this to find the correct install path
KVER := $(shell $(CC) $(EXTRA_CFLAGS) -E -dM $(VERSION_FILE) | grep UTS_RELEASE | \
awk '{ print $$3 }' | sed 's/\"//g')
# assume source symlink is the same as build, otherwise adjust KOBJ
ifneq (,$(wildcard /lib/modules/$(KVER)/build))
ifneq ($(KSRC),$(shell readlink /lib/modules/$(KVER)/build))
KOBJ=/lib/modules/$(KVER)/build
endif
endif
KKVER := $(shell echo $(KVER) | \
awk '{ if ($$0 ~ /2\.[4-9]\./) print "1"; else print "0"}')
ifeq ($(KKVER), 0)
$(error *** Aborting the build. \
*** This driver is not supported on kernel versions older than 2.4.0)
endif
# Add DCB netlink source if our kernel is 2.6.23 or newer
KKVER := $(shell echo $(KVER) | \
awk '{ if ($$0 ~ /2\.[6-9]\.(2[3-9]|[3-9][0-9])/) print "1"; else print "0"}')
ifeq ($(KKVER), 1)
CFILES += ixgbe_dcb_nl.c
endif
# Add FCoE source if FCoE is supported by the kernel
# FCOE is the third field of the CONFIG_FCOE / CONFIG_FCOE_MODULE line found
# in CONFIG_FILE; the sources are added only when that field is exactly 1.
FCOE := $(shell grep -wE 'CONFIG_FCOE|CONFIG_FCOE_MODULE' $(CONFIG_FILE) | \
awk '{print $$3}')
ifeq ($(FCOE), 1)
CFILES += ixgbe_sysfs.c
CFILES += ixgbe_fcoe.c
HFILES += ixgbe_fcoe.h
endif
# set the install path
INSTDIR := /lib/modules/$(KVER)/kernel/drivers/net/$(DRIVER_NAME)
# look for SMP in config.h
# SMP is normalised to exactly 1 or 0 so it can be compared with the running
# kernel below.
SMP := $(shell $(CC) $(EXTRA_CFLAGS) -E -dM $(CONFIG_FILE) | \
grep -w CONFIG_SMP | awk '{ print $$3 }')
ifneq ($(SMP),1)
SMP := 0
endif
# Warn (but keep building) when the configured SMP setting differs from what
# "uname -a" reports for the running kernel.
ifneq ($(SMP),$(shell uname -a | grep SMP > /dev/null 2>&1 && echo 1 || echo 0))
$(warning ***)
ifeq ($(SMP),1)
$(warning *** Warning: kernel source configuration (SMP))
$(warning *** does not match running kernel (UP))
else
$(warning *** Warning: kernel source configuration (UP))
$(warning *** does not match running kernel (SMP))
endif
$(warning *** Continuing with build,)
$(warning *** resulting driver may not be what you want)
$(warning ***)
endif
ifeq ($(SMP),1)
EXTRA_CFLAGS += -D__SMP__
endif
###########################################################################
# 2.4.x & 2.6.x Specific rules
# K_VERSION is the first three characters of BUILD_KERNEL, with 2.5 folded
# into 2.6. Anything that is not "2.6" takes the 2.4.x branch below.
K_VERSION:=$(shell echo $(BUILD_KERNEL) | cut -c1-3 | sed 's/2\.[56]/2\.6/')
ifeq ($(K_VERSION), 2.6)
# Makefile for 2.6.x kernel
TARGET = $(DRIVER_NAME).ko
# man page
MANSECTION = 7
MANFILE = $(TARGET:.ko=.$(MANSECTION))
# A non-empty PATCHLEVEL selects the kbuild branch (obj-m / <name>-objs);
# otherwise "default" re-invokes make inside the kernel tree with SUBDIRS set
# to this directory.
# NOTE(review): PATCHLEVEL is presumably provided by the kernel's top-level
# Makefile when kbuild includes this file -- confirm.
ifneq ($(PATCHLEVEL),)
EXTRA_CFLAGS += $(CFLAGS_EXTRA)
obj-m += $(DRIVER_NAME).o
$(DRIVER_NAME)-objs := $(CFILES:.c=.o)
else
default:
# Pass O=$(KOBJ) only when the object tree differs from the source tree.
ifeq ($(KOBJ),$(KSRC))
$(MAKE) -C $(KSRC) SUBDIRS=$(shell pwd) modules
else
$(MAKE) -C $(KSRC) O=$(KOBJ) SUBDIRS=$(shell pwd) modules
endif
endif
else # ifeq ($(K_VERSION),2.6)
# Makefile for 2.4.x kernel
TARGET = $(DRIVER_NAME).o
# man page
MANSECTION = 7
MANFILE = $(TARGET:.o=.$(MANSECTION))
# Get rid of compile warnings in kernel header files from SuSE
ifneq (,$(wildcard /etc/SuSE-release))
EXTRA_CFLAGS += -Wno-sign-compare -fno-strict-aliasing
endif
# Get rid of compile warnings in kernel header files from fedora
ifneq (,$(wildcard /etc/fedora-release))
EXTRA_CFLAGS += -fno-strict-aliasing
endif
CFLAGS += $(EXTRA_CFLAGS)
# The link recipe's commands are not echoed (.SILENT); it prints its own
# summary banner instead.
.SILENT: $(TARGET)
# Partial-link (-r) every object built from CFILES into the module object.
$(TARGET): $(filter-out $(TARGET), $(CFILES:.c=.o))
$(LD) $(LDFLAGS) -r $^ -o $@
echo; echo
echo "**************************************************"
echo "** $(TARGET) built for $(KVER)"
echo -n "** SMP "
if [ "$(SMP)" = "1" ]; \
then echo "Enabled"; else echo "Disabled"; fi
echo "**************************************************"
echo
# Every object is rebuilt when any header or this Makefile changes.
$(CFILES:.c=.o): $(HFILES) Makefile
default:
$(MAKE)
endif # ifeq ($(K_VERSION),2.6)
# Choose MANDIR (unless the caller already set it) and detect the depmod
# version used by the install rule.
ifeq (,$(MANDIR))
# find the best place to install the man page
# "$$MANPATH" is escaped so that the shell, not make, expands the environment
# variable when the manpath utility is unavailable. Unescaped, make would
# expand "$M" followed by the literal text "ANPATH".
MANPATH := $(shell (manpath 2>/dev/null || echo $$MANPATH) | sed 's/:/ /g')
ifneq (,$(MANPATH))
# test based on inclusion in MANPATH
test_dir = $(findstring $(dir), $(MANPATH))
else
# no MANPATH, test based on directory existence
test_dir = $(shell [ -e $(dir) ] && echo $(dir))
endif
# our preferred install path
# should /usr/local/man be in here ?
MANDIR := /usr/share/man /usr/man
# Keep only the candidates accepted by test_dir, then take the first one.
MANDIR := $(foreach dir, $(MANDIR), $(test_dir))
MANDIR := $(firstword $(MANDIR))
endif
ifeq (,$(MANDIR))
# fallback to /usr/man
MANDIR := /usr/man
endif
# depmod version for rpm builds
# (second dot-separated field of the first line printed by "depmod -V")
DEPVER := $(shell /sbin/depmod -V 2>/dev/null | \
awk 'BEGIN {FS="."} NR==1 {print $$2}')
###########################################################################
# Build rules
# Compress the man page that lives one directory above this Makefile.
$(MANFILE).gz: ../$(MANFILE)
	gzip -c $< > $@
# install: build the module, replace any previously installed copies under
# $(INSTALL_MOD_PATH)/lib/modules/$(KVER), refresh module dependencies and
# install the compressed man page.
install: default $(MANFILE).gz
	# remove all old versions of the driver
	find $(INSTALL_MOD_PATH)/lib/modules/$(KVER) -name $(TARGET) -exec rm -f {} \; || true
	find $(INSTALL_MOD_PATH)/lib/modules/$(KVER) -name $(TARGET).gz -exec rm -f {} \; || true
	install -D -m 644 $(TARGET) $(INSTALL_MOD_PATH)$(INSTDIR)/$(TARGET)
ifeq (,$(INSTALL_MOD_PATH))
	/sbin/depmod -a || true
else
# NOTE(review): the second argument below is "1 " with a trailing space, which
# may never compare equal to DEPVER -- confirm against GNU make's ifeq rules.
ifeq ($(DEPVER),1 )
	/sbin/depmod -r $(INSTALL_MOD_PATH) -a || true
else
# NOTE(review): KVERSION is not set in the visible part of this Makefile (the
# rest of the file uses KVER) -- confirm it is defined elsewhere.
	/sbin/depmod -b $(INSTALL_MOD_PATH) -a -n $(KVERSION) > /dev/null || true
endif
endif
	install -D -m 644 $(MANFILE).gz $(INSTALL_MOD_PATH)$(MANDIR)/man$(MANSECTION)/$(MANFILE).gz
	man -c -P'cat > /dev/null' $(MANFILE:.$(MANSECTION)=) || true
# uninstall: remove the installed module and man page (INSTALL_MOD_PATH is not
# consulted here) and refresh module dependencies.
uninstall:
	if [ -e $(INSTDIR)/$(TARGET) ] ; then \
	    rm -f $(INSTDIR)/$(TARGET) ; \
	fi
	/sbin/depmod -a
	if [ -e $(MANDIR)/man$(MANSECTION)/$(MANFILE).gz ] ; then \
	    rm -f $(MANDIR)/man$(MANSECTION)/$(MANFILE).gz ; \
	fi
# None of these targets produce a file of the same name; declare them all
# phony so a stray file called "default" or "uninstall" cannot mask them.
.PHONY: default install uninstall clean
clean:
ifeq ($(KOBJ),$(KSRC))
	$(MAKE) -C $(KSRC) SUBDIRS=$(shell pwd) clean
else
	$(MAKE) -C $(KSRC) O=$(KOBJ) SUBDIRS=$(shell pwd) clean
endif
	rm -rf $(TARGET) $(TARGET:.ko=.o) $(TARGET:.ko=.mod.c) $(TARGET:.ko=.mod.o) $(CFILES:.c=.o) $(MANFILE).gz .*cmd .tmp_versions

37
io_engine/driver/affinity.py Executable file
View File

@ -0,0 +1,37 @@
#!/usr/bin/env python
import os
import sys
import subprocess
def execute(cmd):
    """Run ``cmd`` through the shell and return its standard output as text.

    Standard error is not captured. Returns None when the process could not
    be started.
    """
    try:
        # universal_newlines=True yields text (not bytes) on Python 3 as well,
        # so callers can keep using str methods such as split('\n').
        proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                universal_newlines=True)
        return proc.communicate()[0]
    except (OSError, ValueError):
        # Best effort: report failure as None instead of raising. Unlike a
        # bare "except:", this lets KeyboardInterrupt/SystemExit propagate.
        return None
# Script body: pin each "<ifname>-rx-<N>" interrupt to CPU N by writing a
# one-bit mask to /proc/irq/<irq>/smp_affinity. Must run as root.
if os.getuid() != 0:
    print('You must be root!')
    sys.exit(1)

if len(sys.argv) < 2:
    print('usage: %s <interface name>' % sys.argv[0])
    sys.exit(1)

ifname = sys.argv[1]

# execute() returns None on failure; treat that the same as "no matches".
intrmap = (execute('cat /proc/interrupts | grep %s-rx-' % ifname) or '').strip().split('\n')

for intr in intrmap:
    fields = intr.split()
    if not fields:
        # grep matched nothing, so the only "line" is an empty string.
        continue
    # First column is "<irq>:"; the last column is the "<ifname>-rx-<queue>" name.
    irq = int(fields[0][:-1])
    name = fields[-1]
    queue = int(name[name.rfind('-') + 1:])
    cpu = queue

    cmd = 'echo %x > /proc/irq/%d/smp_affinity' % (1 << cpu, irq)
    print(cmd)
    execute(cmd)

73
io_engine/driver/install.py Executable file
View File

@ -0,0 +1,73 @@
#!/usr/bin/env python
import sys
import os
import subprocess
import time
# Passed once per detected interface as the InterruptThrottleRate module
# parameter when ps_ixgbe.ko is loaded further down in this script.
ITR = 956 # interrupt throttling rate
def execute(cmd):
    """Run ``cmd`` through the shell and return its standard output as text.

    Standard error is not captured. Returns None when the process could not
    be started.
    """
    try:
        # universal_newlines=True yields text (not bytes) on Python 3 as well,
        # so callers can keep using str methods such as split('\n').
        proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                universal_newlines=True)
        return proc.communicate()[0]
    except (OSError, ValueError):
        # Best effort: report failure as None instead of raising. Unlike a
        # bare "except:", this lets KeyboardInterrupt/SystemExit propagate.
        return None
def get_num_interfaces():
    """Return how many ``lspci`` output lines mention 82598 or 82599.

    Each chip name is grepped separately (82598 first, then 82599) and the
    two line counts are summed; an empty grep result counts as zero.
    """
    total = 0
    for chip in ('82598', '82599'):
        listing = execute('lspci | grep %s' % chip).strip()
        if listing != '':
            total += len(listing.split('\n'))
    return total
def get_num_cpus():
    """Return the number of lines in /proc/cpuinfo that contain 'processor'."""
    matches = execute('cat /proc/cpuinfo | grep processor').strip().split('\n')
    return len(matches)
# Script body: reload ps_ixgbe.ko with the requested queue counts, configure
# every detected interface and recreate the /dev/packet_shader node.
# Must run as root.
if os.getuid() != 0:
    print('You must be root!')
    sys.exit(1)

if len(sys.argv) < 3:
    print('usage: %s <# of RX queues> <# of TX queues>' % sys.argv[0])
    print(' You can specify 0 instead of the number of queues (one queue for each core)')
    sys.exit(1)

num_rx_queues = int(sys.argv[1])
num_tx_queues = int(sys.argv[2])

# Last octet of the address assigned to every interface (10.0.<i>.<postfix>).
postfix = '43'

assert 0 <= num_rx_queues <= 16

num_ifs = get_num_interfaces()
num_cpus = get_num_cpus()

# Unload a previously loaded driver, then load it with one comma-separated
# value per interface for each module parameter.
execute('lsmod | grep ps_ixgbe > /dev/null && sudo rmmod ps_ixgbe')
execute('insmod ./ps_ixgbe.ko RXQ=%s TXQ=%s InterruptThrottleRate=%s' %
        (','.join([str(num_rx_queues)] * num_ifs),
         ','.join([str(num_tx_queues)] * num_ifs),
         ','.join([str(ITR)] * num_ifs))
        )

# Fixed delay after loading the module and before configuring interfaces.
time.sleep(3)

for i in range(num_ifs):
    ifname = 'xge%d' % i
    print('setting %s...' % ifname)

    execute('ethtool -A %s autoneg off rx off tx off' % ifname)
    execute('ifconfig %s 10.0.%d.%s mtu 1500 netmask 255.255.255.0' % (ifname, i, postfix))

    print('OK')
    # execute() returns None on failure; print an empty line in that case
    # instead of raising AttributeError.
    print((execute('./affinity.py %s' % ifname) or '').strip())

# Recreate the character device node (major 1010, minor 0), world read/write.
execute('rm -f /dev/packet_shader')
execute('mknod /dev/packet_shader c 1010 0')
execute('chmod 666 /dev/packet_shader')

552
io_engine/driver/ixgbe.h Normal file
View File

@ -0,0 +1,552 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#ifndef _IXGBE_H_
#define _IXGBE_H_
#ifndef IXGBE_NO_LRO
#include <net/tcp.h>
#endif
#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#ifdef SIOCETHTOOL
#include <linux/ethtool.h>
#endif
#ifdef NETIF_F_HW_VLAN_TX
#include <linux/if_vlan.h>
#endif
#if 0
#if defined(CONFIG_DCA) || defined(CONFIG_DCA_MODULE)
#define IXGBE_DCA
#include <linux/dca.h>
#endif
#endif
#include "ixgbe_dcb.h"
#include "kcompat.h"
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
#define IXGBE_FCOE
#include "ixgbe_fcoe.h"
#endif /* CONFIG_FCOE or CONFIG_FCOE_MODULE */
#include "ixgbe_api.h"
/* Prefix prepended to every message emitted through DPRINTK. */
#define PFX "ixgbe: "
/*
 * Conditional printk: the message is emitted at KERN_<klevel> only when the
 * NETIF_MSG_<nlevel> bit is set in adapter->msg_enable. Output is prefixed
 * with PFX, the netdev name and the name of the calling function.
 * A variable named "adapter" must be in scope at the call site.
 */
#define DPRINTK(nlevel, klevel, fmt, args...) \
((void)((NETIF_MSG_##nlevel & adapter->msg_enable) && \
printk(KERN_##klevel PFX "%s: %s: " fmt, adapter->netdev->name, \
__FUNCTION__ , ## args)))
/* TX/RX descriptor defines */
/* Note: the default ring sizes equal the maximum (4096) in this build. */
#define IXGBE_DEFAULT_TXD 4096
#define IXGBE_MAX_TXD 4096
#define IXGBE_MIN_TXD 64
#define IXGBE_DEFAULT_RXD 4096
#define IXGBE_MAX_RXD 4096
#define IXGBE_MIN_RXD 64
/*
 * Sub-window geometry: 1 << 10 = 1024 per sub-window, so with
 * IXGBE_MAX_TXD = 4096 IXGBE_MAX_SUBWINDOWS evaluates to 4.
 * IXGBE_SUBWINDOW_MASK (1023) selects the offset within a sub-window.
 */
#define IXGBE_SUBWINDOW_BITS 10
#define IXGBE_SUBWINDOW_SIZE (1 << IXGBE_SUBWINDOW_BITS)
#define IXGBE_SUBWINDOW_MASK (IXGBE_SUBWINDOW_SIZE - 1)
#define IXGBE_MAX_SUBWINDOWS (IXGBE_MAX_TXD / IXGBE_SUBWINDOW_SIZE)
/* flow control */
/* Default / minimum / maximum for each of FCRTL, FCRTH and FCPAUSE. */
#define IXGBE_DEFAULT_FCRTL 0x10000
#define IXGBE_MIN_FCRTL 0x40
#define IXGBE_MAX_FCRTL 0x7FF80
#define IXGBE_DEFAULT_FCRTH 0x20000
#define IXGBE_MIN_FCRTH 0x600
#define IXGBE_MAX_FCRTH 0x7FFF0
#define IXGBE_DEFAULT_FCPAUSE 0xFFFF
#define IXGBE_MIN_FCPAUSE 0
#define IXGBE_MAX_FCPAUSE 0xFFFF
/* Supported Rx Buffer Sizes */
#define IXGBE_RXBUFFER_64 64 /* Used for packet split */
#define IXGBE_RXBUFFER_128 128 /* Used for packet split */
#define IXGBE_RXBUFFER_256 256 /* Used for packet split */
#define IXGBE_RXBUFFER_2048 2048
#define IXGBE_RXBUFFER_4096 4096
#define IXGBE_RXBUFFER_8192 8192
#define IXGBE_MAX_RXBUFFER 16384 /* largest size for single descriptor */
/* Header buffer size; aliases the 256-byte Rx buffer size. */
#define IXGBE_RX_HDR_SIZE IXGBE_RXBUFFER_256
#define MAXIMUM_ETHERNET_VLAN_SIZE (VLAN_ETH_FRAME_LEN + ETH_FCS_LEN)
/* IXGBE_MQ is defined whenever any of DCB, RSS or VMDQ support is compiled in. */
#if defined(IXGBE_DCB) || defined(IXGBE_RSS) || \
defined(IXGBE_VMDQ)
#define IXGBE_MQ
#endif
/* How many Rx Buffers do we bundle into one write to the hardware ? */
#define IXGBE_RX_BUFFER_WRITE 16 /* Must be power of 2 */
/* Tx flag bits (low bits of a u32 flags word). */
#define IXGBE_TX_FLAGS_CSUM (u32)(1)
#define IXGBE_TX_FLAGS_VLAN (u32)(1 << 1)
#define IXGBE_TX_FLAGS_TSO (u32)(1 << 2)
#define IXGBE_TX_FLAGS_IPV4 (u32)(1 << 3)
#define IXGBE_TX_FLAGS_FCOE (u32)(1 << 4)
#define IXGBE_TX_FLAGS_FSO (u32)(1 << 5)
/* The upper 16 bits of the same word are masked off for the VLAN value
 * (mask 0xffff0000, shift 16). */
#define IXGBE_TX_FLAGS_VLAN_MASK 0xffff0000
#define IXGBE_TX_FLAGS_VLAN_PRIO_MASK 0x0000e000
#define IXGBE_TX_FLAGS_VLAN_SHIFT 16
#define IXGBE_MAX_RSC_INT_RATE 162760
/* Software LRO bookkeeping; compiled out when IXGBE_NO_LRO is defined. */
#ifndef IXGBE_NO_LRO
#define IXGBE_LRO_MAX 32 /*Maximum number of LRO descriptors*/
#define IXGBE_LRO_GLOBAL 10
/* LRO event counters (field names suggest flush / coalesce / recycle
 * events -- confirm against the code that updates them). */
struct ixgbe_lro_stats {
u32 flushed;
u32 coal;
u32 recycled;
};
/* One LRO descriptor: an skb plus the per-flow values kept alongside it.
 * Linked into a hash list through lro_node. */
struct ixgbe_lro_desc {
struct hlist_node lro_node;
struct sk_buff *skb;
u32 source_ip;
u32 dest_ip;
u16 source_port;
u16 dest_port;
u16 vlan_tag;
u16 len;
u32 next_seq;
u32 ack_seq;
u16 window;
u16 mss;
u16 opt_bytes;
u16 psh:1; /* single-bit flag */
u32 tsval;
u32 tsecr;
u32 append_cnt;
};
/* Active and free descriptor lists plus statistics; struct ixgbe_q_vector
 * holds a pointer to one of these. */
struct ixgbe_lro_list {
struct hlist_head active;
struct hlist_head free;
int active_cnt;
struct ixgbe_lro_stats stats;
};
#endif /* IXGBE_NO_LRO */
/* Per-descriptor Tx bookkeeping. In this version of the driver the structure
 * carries only a time stamp, a length and a next_to_watch descriptor index;
 * it holds no socket-buffer pointer or DMA handle. */
struct ixgbe_tx_buffer {
unsigned long time_stamp;
u16 length;
u16 next_to_watch;
};
/* Per-descriptor Rx bookkeeping: only a 16-bit length is kept. */
struct ixgbe_rx_buffer {
u16 length;
};
/* Per-ring 64-bit packet and byte counters (embedded in struct ixgbe_ring). */
struct ixgbe_queue_stats {
u64 packets;
u64 bytes;
};
/*
 * State for one descriptor ring. The same structure serves both Tx and Rx
 * rings (struct ixgbe_adapter has tx_ring and rx_ring arrays of this type);
 * the anonymous union holds whichever buffer-info array applies.
 */
struct ____cacheline_aligned ixgbe_ring {
void *desc; /* descriptor ring memory */
union {
struct ixgbe_tx_buffer *tx_buffer_info;
struct ixgbe_rx_buffer *rx_buffer_info;
};
struct ixgbe_adapter *adapter; /* owning adapter */
u8 atr_sample_rate;
u8 atr_count;
u16 count; /* amount of descriptors */
u16 rx_buf_len;
u16 next_to_use;
u16 next_to_clean;
u8 queue_index; /* needed for multiqueue queue management */
u16 head;
u16 tail;
unsigned int total_bytes;
unsigned int total_packets;
#if defined(CONFIG_DCA) || defined(CONFIG_DCA_MODULE)
/* cpu for tx queue */
int cpu;
#endif
u16 reg_idx; /* holds the special value that gets the
* hardware register offset associated
* with this ring, which is different
* for DCB and RSS modes */
struct ixgbe_queue_stats stats;
unsigned long reinit_state;
u64 rsc_count; /* stat for coalesced packets */
unsigned int size; /* length in bytes */
dma_addr_t dma; /* phys. address of descriptor ring */
/* [queued, next_to_clean): packets waiting to be pulled */
u16 queued; /* only used for RX */
/* One CPU pointer and one DMA address per sub-window
 * (IXGBE_MAX_SUBWINDOWS entries each). */
u8 *window[IXGBE_MAX_SUBWINDOWS];
dma_addr_t dma_window[IXGBE_MAX_SUBWINDOWS];
unsigned int window_size;
spinlock_t lock;
wait_queue_head_t *wq;
};
/* Ring feature identifiers. RING_F_ARRAY_SIZE doubles as the length of
 * ixgbe_adapter.ring_feature[], so new entries must be added before it. */
enum ixgbe_ring_f_enum {
RING_F_NONE = 0,
RING_F_DCB,
RING_F_VMDQ,
RING_F_RXQ,
RING_F_TXQ,
RING_F_FDIR,
RING_F_ARRAY_SIZE /* must be last in enum set */
};
/* Upper bounds on the number of indices for each ring feature. */
#define IXGBE_MAX_DCB_INDICES 8
#define IXGBE_MAX_RSS_INDICES 16
#define IXGBE_MAX_VMDQ_INDICES 64
#define IXGBE_MAX_FDIR_INDICES 64
/* Per-feature settings; ixgbe_adapter keeps one entry for each
 * ixgbe_ring_f_enum value. */
struct ixgbe_ring_feature {
int indices;
int mask;
};
/* Sizes of the per-vector Rx/Tx ring-index bitmaps in struct ixgbe_q_vector. */
#define MAX_RX_QUEUES 128
#define MAX_TX_QUEUES 128
/* Evaluates to 8 when IXGBE_FLAG_DCB_ENABLED is set in adapter->flags, else 1.
 * A variable named "adapter" must be in scope where this macro is used. */
#define MAX_RX_PACKET_BUFFERS ((adapter->flags & IXGBE_FLAG_DCB_ENABLED) \
? 8 : 1)
#define MAX_TX_PACKET_BUFFERS MAX_RX_PACKET_BUFFERS
/* MAX_MSIX_Q_VECTORS of these are allocated,
* but we only use one per queue-specific vector.
*/
/* Each q_vector records which Rx/Tx rings it serves (as bitmaps plus counts)
 * together with its interrupt-throttle values. */
struct ixgbe_q_vector {
struct ixgbe_adapter *adapter;
unsigned int v_idx; /* index of q_vector within array, also used for
* finding the bit in EICR and friends that
* represents the vector for this ring */
#ifdef CONFIG_IXGBE_NAPI
struct napi_struct napi;
#endif
DECLARE_BITMAP(rxr_idx, MAX_RX_QUEUES); /* Rx ring indices */
DECLARE_BITMAP(txr_idx, MAX_TX_QUEUES); /* Tx ring indices */
u8 rxr_count; /* Rx ring count assigned to this vector */
u8 txr_count; /* Tx ring count assigned to this vector */
u8 tx_itr;
u8 rx_itr;
u32 eitr;
#ifndef IXGBE_NO_LRO
struct ixgbe_lro_list *lrolist; /* LRO list for queue vector*/
#endif
/* Sized like lsc_int_name in struct ixgbe_adapter: IFNAMSIZ + 9 bytes. */
char name[IFNAMSIZ + 9];
#ifndef HAVE_NETDEV_NAPI_LIST
struct net_device poll_dev;
#endif
};
/* Helper macros to switch between ints/sec and what the register uses.
* And yes, it's the same math going both ways. The lowest value
* supported by all of the ixgbe hardware is 8.
*/
/* A zero argument yields 8; otherwise the result is 1000000000 / (arg * 256)
 * using integer division. */
#define EITR_INTS_PER_SEC_TO_REG(_eitr) \
((_eitr) ? (1000000000 / ((_eitr) * 256)) : 8)
#define EITR_REG_TO_INTS_PER_SEC EITR_INTS_PER_SEC_TO_REG
/* Free descriptors in ring R (a pointer): the distance from next_to_use
 * forward to next_to_clean, wrapping at R->count, minus one. */
#define IXGBE_DESC_UNUSED(R) \
((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
(R)->next_to_clean - (R)->next_to_use - 1)
/* Address of descriptor i in ring R, viewed as the given descriptor type.
 * Note that R is a ring object here (accessed with '.'), not a pointer. */
#define IXGBE_RX_DESC_ADV(R, i) \
(&(((union ixgbe_adv_rx_desc *)((R).desc))[i]))
#define IXGBE_TX_DESC_ADV(R, i) \
(&(((union ixgbe_adv_tx_desc *)((R).desc))[i]))
#define IXGBE_TX_CTXTDESC_ADV(R, i) \
(&(((struct ixgbe_adv_tx_context_desc *)((R).desc))[i]))
#define IXGBE_MAX_JUMBO_FRAME_SIZE 16128
/* Non-queue vectors: always one "other" vector, plus one more for the TCP
 * timer when IXGBE_TCP_TIMER is defined. */
#ifdef IXGBE_TCP_TIMER
#define TCP_TIMER_VECTOR 1
#else
#define TCP_TIMER_VECTOR 0
#endif
#define OTHER_VECTOR 1
#define NON_Q_VECTORS (OTHER_VECTOR + TCP_TIMER_VECTOR)
#define IXGBE_MAX_MSIX_VECTORS_82599 64
#define IXGBE_MAX_MSIX_Q_VECTORS_82599 64
#define IXGBE_MAX_MSIX_Q_VECTORS_82598 16
#define IXGBE_MAX_MSIX_VECTORS_82598 18
/*
* Only for array allocations in our adapter struct. On 82598, there will be
* unused entries in the array, but that's not a big deal. Also, in 82599,
* we can actually assign 64 queue vectors based on our extended-extended
* interrupt registers. This is different than 82598, which is limited to 16.
*/
#define MAX_MSIX_Q_VECTORS IXGBE_MAX_MSIX_Q_VECTORS_82599
#define MAX_MSIX_COUNT IXGBE_MAX_MSIX_VECTORS_82599
/* The disabled branch kept the stock minimum of 2 queue vectors; this build
 * uses 1. */
#if 0
#define MIN_MSIX_Q_VECTORS 2
#else
/* no TX interrupt - Sangjin */
#define MIN_MSIX_Q_VECTORS 1
#endif
#define MIN_MSIX_COUNT (MIN_MSIX_Q_VECTORS + NON_Q_VECTORS)
/* board specific private data structure */
/* One instance per adapter. The IXGBE_FLAG_* and IXGBE_FLAG2_* macros defined
 * inside the struct body are the bit values for the "flags" and "flags2"
 * members they follow. */
struct ixgbe_adapter {
struct timer_list watchdog_timer;
#ifdef NETIF_F_HW_VLAN_TX
struct vlan_group *vlgrp;
#endif
int bd_number;
struct work_struct reset_task;
/* Array sized for the 82599 maximum; see MAX_MSIX_Q_VECTORS. */
struct ixgbe_q_vector *q_vector[MAX_MSIX_Q_VECTORS];
struct ixgbe_dcb_config dcb_cfg;
struct ixgbe_dcb_config temp_dcb_cfg;
u8 dcb_set_bitmap;
enum ixgbe_fc_mode last_lfc_mode;
int numa_node;
/* Interrupt Throttle Rate */
u32 itr_setting;
u16 eitr_low;
u16 eitr_high;
/* TX */
struct ixgbe_ring *tx_ring; /* One per active queue */
int num_tx_queues;
u64 restart_queue;
u64 hw_csum_tx_good;
u64 lsc_int;
u64 hw_tso_ctxt;
u64 hw_tso6_ctxt;
u32 tx_timeout_count;
bool detect_tx_hung;
/* RX */
struct ixgbe_ring *rx_ring; /* One per active queue */
int num_rx_queues;
int num_rx_pools; /* == num_rx_queues in 82598 */
int num_rx_queues_per_pool; /* 1 if 82598, can be many if 82599 */
u64 hw_csum_rx_error;
u64 hw_rx_no_dma_resources;
u64 hw_csum_rx_good;
u64 non_eop_descs;
#ifndef CONFIG_IXGBE_NAPI
u64 rx_dropped_backlog; /* count drops from rx intr handler */
#endif
int num_msix_vectors;
int max_msix_q_vectors; /* true count of q_vectors for device */
/* Indexed by enum ixgbe_ring_f_enum. */
struct ixgbe_ring_feature ring_feature[RING_F_ARRAY_SIZE];
struct msix_entry *msix_entries;
#ifdef IXGBE_TCP_TIMER
irqreturn_t (*msix_handlers[MAX_MSIX_COUNT])(int irq, void *data,
struct pt_regs *regs);
#endif
u32 alloc_rx_page_failed;
u32 alloc_rx_buff_failed;
/* Some features need tri-state capability,
* thus the additional *_CAPABLE flags.
*/
u32 flags;
/* Bit values for "flags". Bit 21 is not assigned in this list. */
#define IXGBE_FLAG_RX_CSUM_ENABLED (u32)(1)
#define IXGBE_FLAG_MSI_CAPABLE (u32)(1 << 1)
#define IXGBE_FLAG_MSI_ENABLED (u32)(1 << 2)
#define IXGBE_FLAG_MSIX_CAPABLE (u32)(1 << 3)
#define IXGBE_FLAG_MSIX_ENABLED (u32)(1 << 4)
#ifndef IXGBE_NO_LLI
#define IXGBE_FLAG_LLI_PUSH (u32)(1 << 5)
#endif
#define IXGBE_FLAG_RX_1BUF_CAPABLE (u32)(1 << 6)
#define IXGBE_FLAG_RX_PS_CAPABLE (u32)(1 << 7)
#define IXGBE_FLAG_RX_PS_ENABLED (u32)(1 << 8)
#define IXGBE_FLAG_IN_NETPOLL (u32)(1 << 9)
#define IXGBE_FLAG_DCA_ENABLED (u32)(1 << 10)
#define IXGBE_FLAG_DCA_CAPABLE (u32)(1 << 11)
#define IXGBE_FLAG_DCA_ENABLED_DATA (u32)(1 << 12)
#define IXGBE_FLAG_MQ_CAPABLE (u32)(1 << 13)
#define IXGBE_FLAG_DCB_ENABLED (u32)(1 << 14)
#define IXGBE_FLAG_DCB_CAPABLE (u32)(1 << 15)
#define IXGBE_FLAG_RSS_ENABLED (u32)(1 << 16)
#define IXGBE_FLAG_RSS_CAPABLE (u32)(1 << 17)
#define IXGBE_FLAG_VMDQ_CAPABLE (u32)(1 << 18)
#define IXGBE_FLAG_VMDQ_ENABLED (u32)(1 << 19)
#define IXGBE_FLAG_FAN_FAIL_CAPABLE (u32)(1 << 20)
#define IXGBE_FLAG_NEED_LINK_UPDATE (u32)(1 << 22)
#define IXGBE_FLAG_IN_WATCHDOG_TASK (u32)(1 << 23)
#define IXGBE_FLAG_IN_SFP_LINK_TASK (u32)(1 << 24)
#define IXGBE_FLAG_IN_SFP_MOD_TASK (u32)(1 << 25)
#define IXGBE_FLAG_FDIR_HASH_CAPABLE (u32)(1 << 26)
#define IXGBE_FLAG_FDIR_PERFECT_CAPABLE (u32)(1 << 27)
/* added - Sangjin */
#define IXGBE_FLAG_RX_KERNEL_ENABLE (u32)(1 << 28)
u32 flags2;
/* Bit values for "flags2". */
#ifndef IXGBE_NO_HW_RSC
#define IXGBE_FLAG2_RSC_CAPABLE (u32)(1)
#define IXGBE_FLAG2_RSC_ENABLED (u32)(1 << 1)
#endif /* IXGBE_NO_HW_RSC */
#ifndef IXGBE_NO_LRO
#define IXGBE_FLAG2_SWLRO_ENABLED (u32)(1 << 2)
#endif /* IXGBE_NO_LRO */
#define IXGBE_FLAG2_VMDQ_DEFAULT_OVERRIDE (u32)(1 << 3)
/* default to trying for four seconds */
#define IXGBE_TRY_LINK_TIMEOUT (4 * HZ)
/* OS defined structs */
struct net_device *netdev;
struct pci_dev *pdev;
struct net_device_stats net_stats;
#ifndef IXGBE_NO_LRO
struct ixgbe_lro_stats lro_stats;
#endif
#ifdef ETHTOOL_TEST
u32 test_icr;
struct ixgbe_ring test_tx_ring;
struct ixgbe_ring test_rx_ring;
#endif
/* structs defined in ixgbe_hw.h */
struct ixgbe_hw hw;
u16 msg_enable; /* NETIF_MSG_* bits tested by DPRINTK */
struct ixgbe_hw_stats stats;
#ifndef IXGBE_NO_LLI
u32 lli_port;
u32 lli_size;
u64 lli_int;
u32 lli_etype;
u32 lli_vlan_pri;
#endif /* IXGBE_NO_LLI */
/* Interrupt Throttle Rate */
u32 eitr_param;
/* NOTE(review): presumably a bit field indexed by enum ixbge_state_t --
 * confirm against the users of this member. */
unsigned long state;
u32 *config_space;
u64 tx_busy;
unsigned int tx_ring_count;
unsigned int rx_ring_count;
u32 link_speed;
bool link_up;
unsigned long link_check_timeout;
struct work_struct watchdog_task;
struct work_struct sfp_task;
struct timer_list sfp_timer;
struct work_struct multispeed_fiber_task;
struct work_struct sfp_config_module_task;
u64 flm;
u32 fdir_pballoc;
u32 atr_sample_rate;
spinlock_t fdir_perfect_lock;
struct work_struct fdir_reinit_task;
u64 rsc_count;
u32 wol;
u16 eeprom_version;
bool netdev_registered;
char lsc_int_name[IFNAMSIZ + 9];
#ifdef IXGBE_TCP_TIMER
char tcp_timer_name[IFNAMSIZ + 9];
#endif
};
/* Adapter state identifiers, numbered from 0 in declaration order.
 * The enum tag is spelled "ixbge" (sic); it is kept as is because renaming
 * it would change the declared type name.
 * NOTE(review): presumably used as bit numbers in ixgbe_adapter.state --
 * confirm against the callers. */
enum ixbge_state_t {
__IXGBE_TESTING,
__IXGBE_RESETTING,
__IXGBE_DOWN,
__IXGBE_FDIR_INIT_DONE,
__IXGBE_SFP_MODULE_NOT_FOUND
};
/* DCB netlink ops and config copy helper; declared only when CONFIG_DCB is
 * set. */
#ifdef CONFIG_DCB
extern struct dcbnl_rtnl_ops dcbnl_ops;
extern int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
struct ixgbe_dcb_config *dst_dcb_cfg, int tc_max);
#endif
/* needed by ixgbe_main.c */
extern int ixgbe_validate_mac_addr(u8 *mc_addr);
extern void ixgbe_check_options(struct ixgbe_adapter *adapter);
extern void ixgbe_assign_netdev_ops(struct net_device *netdev);
/* needed by ixgbe_ethtool.c */
extern char ixgbe_driver_name[];
extern const char ixgbe_driver_version[];
extern int ixgbe_up(struct ixgbe_adapter *adapter);
extern void ixgbe_down(struct ixgbe_adapter *adapter);
extern void ixgbe_reinit_locked(struct ixgbe_adapter *adapter);
extern void ixgbe_reset(struct ixgbe_adapter *adapter);
extern void ixgbe_set_ethtool_ops(struct net_device *netdev);
/* Ring resource setup/teardown; each takes the adapter and the ring. */
extern int ixgbe_setup_rx_resources(struct ixgbe_adapter *,struct ixgbe_ring *);
extern int ixgbe_setup_tx_resources(struct ixgbe_adapter *,struct ixgbe_ring *);
extern void ixgbe_free_rx_resources(struct ixgbe_adapter *,struct ixgbe_ring *);
extern void ixgbe_free_tx_resources(struct ixgbe_adapter *,struct ixgbe_ring *);
extern void ixgbe_update_stats(struct ixgbe_adapter *adapter);
extern int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter);
extern void ixgbe_clear_interrupt_scheme(struct ixgbe_adapter *adapter);
extern bool ixgbe_is_ixgbe(struct pci_dev *pcidev);
/* Declared without "extern", unlike its neighbours; for a function prototype
 * the linkage is the same either way. */
void ixgbe_set_rx_mode(struct net_device *netdev);
#ifdef ETHTOOL_OPS_COMPAT
extern int ethtool_ioctl(struct ifreq *ifr);
#endif
/* Declared unconditionally, unlike the CONFIG_DCB-guarded declarations above. */
extern int ixgbe_dcb_netlink_register(void);
extern int ixgbe_dcb_netlink_unregister(void);
#endif /* _IXGBE_H_ */
#endif /* _IXGBE_H_ */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,959 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include "ixgbe_api.h"
#include "ixgbe_common.h"
extern s32 ixgbe_init_ops_82598(struct ixgbe_hw *hw);
extern s32 ixgbe_init_ops_82599(struct ixgbe_hw *hw);
/**
 * ixgbe_init_shared_code - Initialize the shared code
 * @hw: pointer to hardware structure
 *
 * Derives the MAC type from the IDs already stored in @hw, then runs the
 * matching per-MAC ops initializer (ixgbe_init_ops_82598() or
 * ixgbe_init_ops_82599()), which assigns the function pointers.
 *
 * Does not touch the hardware. This function must be called prior to any
 * other function in the shared code. The ixgbe_hw structure should be
 * memset to 0 beforehand, with these fields filled in by the caller:
 * hw_addr, back, device_id, vendor_id, subsystem_device_id,
 * subsystem_vendor_id, and revision_id.
 *
 * Returns the status of the ops initializer, or
 * IXGBE_ERR_DEVICE_NOT_SUPPORTED when the MAC type is neither 82598EB nor
 * 82599EB.
 **/
s32 ixgbe_init_shared_code(struct ixgbe_hw *hw)
{
	/*
	 * The return value of ixgbe_set_mac_type() is not examined here;
	 * the dispatch below keys on hw->mac.type instead.
	 */
	ixgbe_set_mac_type(hw);

	if (hw->mac.type == ixgbe_mac_82598EB)
		return ixgbe_init_ops_82598(hw);

	if (hw->mac.type == ixgbe_mac_82599EB)
		return ixgbe_init_ops_82599(hw);

	return IXGBE_ERR_DEVICE_NOT_SUPPORTED;
}
/**
 * ixgbe_set_mac_type - Sets MAC type
 * @hw: pointer to the HW structure
 *
 * Sets hw->mac.type from the vendor ID and device ID stored in @hw.
 * hw->mac.type is left untouched when the vendor is not Intel or the
 * device ID is not one of those listed below.
 *
 * Returns 0 when the device was recognised, otherwise
 * IXGBE_ERR_DEVICE_NOT_SUPPORTED.
 **/
s32 ixgbe_set_mac_type(struct ixgbe_hw *hw)
{
	/* Assume "unsupported" until a known device ID is matched. */
	s32 ret_val = IXGBE_ERR_DEVICE_NOT_SUPPORTED;

	if (hw->vendor_id == IXGBE_INTEL_VENDOR_ID) {
		switch (hw->device_id) {
		case IXGBE_DEV_ID_82598:
		case IXGBE_DEV_ID_82598_BX:
		case IXGBE_DEV_ID_82598AF_SINGLE_PORT:
		case IXGBE_DEV_ID_82598AF_DUAL_PORT:
		case IXGBE_DEV_ID_82598AT:
		case IXGBE_DEV_ID_82598AT2:
		case IXGBE_DEV_ID_82598EB_CX4:
		case IXGBE_DEV_ID_82598_CX4_DUAL_PORT:
		case IXGBE_DEV_ID_82598_DA_DUAL_PORT:
		case IXGBE_DEV_ID_82598_SR_DUAL_PORT_EM:
		case IXGBE_DEV_ID_82598EB_XF_LR:
		case IXGBE_DEV_ID_82598EB_SFP_LOM:
			hw->mac.type = ixgbe_mac_82598EB;
			ret_val = 0;
			break;
		case IXGBE_DEV_ID_82599_KX4:
		case IXGBE_DEV_ID_82599_XAUI_LOM:
		case IXGBE_DEV_ID_82599_SFP:
		case IXGBE_DEV_ID_82599_T3_LOM:
			hw->mac.type = ixgbe_mac_82599EB;
			ret_val = 0;
			break;
		default:
			break;
		}
	}

	hw_dbg(hw, "ixgbe_set_mac_type found mac: %d, returns: %d\n",
	       hw->mac.type, ret_val);

	return ret_val;
}
/**
 * ixgbe_init_hw - Initialize the hardware
 * @hw: pointer to hardware structure
 *
 * Dispatches to the MAC-specific init_hw op, which is expected to
 * initialize the hardware by resetting and then starting it.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_init_hw(struct ixgbe_hw *hw)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.init_hw, (hw),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_reset_hw - Performs a hardware reset
 * @hw: pointer to hardware structure
 *
 * Dispatches to the MAC-specific reset_hw op, which is expected to reset
 * the transmit and receive units, mask and clear all interrupts, perform
 * a PHY reset, and perform a MAC reset.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_reset_hw(struct ixgbe_hw *hw)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.reset_hw, (hw),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_start_hw - Prepares hardware for Rx/Tx
 * @hw: pointer to hardware structure
 *
 * Dispatches to the MAC-specific start_hw op. That op is expected to fill
 * the bus info structure and media type, clear all on-chip counters,
 * initialize the receive address registers, multicast table and VLAN
 * filter table, set up link and flow control, and leave the transmit and
 * receive units disabled and uninitialized.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_start_hw(struct ixgbe_hw *hw)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.start_hw, (hw),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_clear_hw_cntrs - Clear hardware counters
 * @hw: pointer to hardware structure
 *
 * Dispatches to the MAC-specific clear_hw_cntrs op, which is expected to
 * clear all hardware statistics counters by reading them (the counters
 * are clear-on-read).
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_clear_hw_cntrs(struct ixgbe_hw *hw)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.clear_hw_cntrs, (hw),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
* ixgbe_get_media_type - Get media type
* @hw: pointer to hardware structure
*
* Returns the media type (fiber, copper, backplane)
**/
enum ixgbe_media_type ixgbe_get_media_type(struct ixgbe_hw *hw)
{
return ixgbe_call_func(hw, hw->mac.ops.get_media_type, (hw),
ixgbe_media_type_unknown);
}
/**
 * ixgbe_get_mac_addr - Get MAC address
 * @hw: pointer to hardware structure
 * @mac_addr: Adapter MAC address
 *
 * Dispatches to the MAC-specific get_mac_addr op, which is expected to
 * read the adapter's MAC address from the first Receive Address Register
 * (RAR0). A reset of the adapter must have been performed beforehand so
 * that the MAC address has been loaded from the EEPROM into RAR0.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_get_mac_addr(struct ixgbe_hw *hw, u8 *mac_addr)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.get_mac_addr,
				 (hw, mac_addr), IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_get_san_mac_addr - Get SAN MAC address
 * @hw: pointer to hardware structure
 * @san_mac_addr: SAN MAC address
 *
 * Dispatches to the MAC-specific get_san_mac_addr op, which is expected
 * to read the SAN MAC address from the EEPROM if it is available. The
 * address is per-port, so set_lan_id() must be called before reading it.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_get_san_mac_addr(struct ixgbe_hw *hw, u8 *san_mac_addr)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.get_san_mac_addr,
				 (hw, san_mac_addr), IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_set_san_mac_addr - Write a SAN MAC address
 * @hw: pointer to hardware structure
 * @san_mac_addr: SAN MAC address
 *
 * Dispatches to the MAC-specific set_san_mac_addr op, which is expected
 * to write a SAN MAC address to the EEPROM.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_set_san_mac_addr(struct ixgbe_hw *hw, u8 *san_mac_addr)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.set_san_mac_addr,
				 (hw, san_mac_addr), IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_get_device_caps - Get additional device capabilities
 * @hw: pointer to hardware structure
 * @device_caps: the EEPROM word for device capabilities
 *
 * Dispatches to the MAC-specific get_device_caps op, which is expected to
 * read the extra device capabilities from the EEPROM.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_get_device_caps(struct ixgbe_hw *hw, u16 *device_caps)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.get_device_caps,
				 (hw, device_caps), IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_get_bus_info - Set PCI bus info
 * @hw: pointer to hardware structure
 *
 * Dispatches to the MAC-specific get_bus_info op, which is expected to
 * set the PCI bus info (speed, width, type) within the ixgbe_hw
 * structure.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_get_bus_info(struct ixgbe_hw *hw)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.get_bus_info, (hw),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_get_num_of_tx_queues - Get Tx queues
 * @hw: pointer to hardware structure
 *
 * Returns the number of transmit queues for the given adapter, as stored
 * in hw->mac.max_tx_queues.
 **/
u32 ixgbe_get_num_of_tx_queues(struct ixgbe_hw *hw)
{
	u32 num_queues = hw->mac.max_tx_queues;

	return num_queues;
}
/**
 * ixgbe_get_num_of_rx_queues - Get Rx queues
 * @hw: pointer to hardware structure
 *
 * Returns the number of receive queues for the given adapter, as stored
 * in hw->mac.max_rx_queues.
 **/
u32 ixgbe_get_num_of_rx_queues(struct ixgbe_hw *hw)
{
	u32 num_queues = hw->mac.max_rx_queues;

	return num_queues;
}
/**
 * ixgbe_stop_adapter - Disable Rx/Tx units
 * @hw: pointer to hardware structure
 *
 * Dispatches to the MAC-specific stop_adapter op. That op is expected to
 * set the adapter_stopped flag within the ixgbe_hw struct, clear
 * interrupts, and disable the transmit and receive units. The
 * adapter_stopped flag is used by the shared code and drivers to
 * determine if the adapter is in a stopped state and should not touch
 * the hardware.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_stop_adapter(struct ixgbe_hw *hw)
{
	s32 rc = ixgbe_call_func(hw, hw->mac.ops.stop_adapter, (hw),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_read_pba_num - Reads part number from EEPROM
 * @hw: pointer to hardware structure
 * @pba_num: stores the part number from the EEPROM
 *
 * Reads the part number from the EEPROM by delegating directly to
 * ixgbe_read_pba_num_generic(); no per-MAC op is consulted.
 **/
s32 ixgbe_read_pba_num(struct ixgbe_hw *hw, u32 *pba_num)
{
	s32 rc = ixgbe_read_pba_num_generic(hw, pba_num);

	return rc;
}
/**
 * ixgbe_identify_phy - Get PHY type
 * @hw: pointer to hardware structure
 *
 * Determines the physical layer module found on the current adapter by
 * dispatching to the PHY identify op, but only while hw->phy.type is
 * still ixgbe_phy_unknown.
 *
 * Returns 0 without calling the op when the PHY type is already known;
 * otherwise returns the result of ixgbe_call_func(), which is given
 * IXGBE_NOT_IMPLEMENTED as the fallback status.
 **/
s32 ixgbe_identify_phy(struct ixgbe_hw *hw)
{
	/* A PHY type that is already known needs no further probing. */
	if (hw->phy.type != ixgbe_phy_unknown)
		return 0;

	return ixgbe_call_func(hw, hw->phy.ops.identify, (hw),
			       IXGBE_NOT_IMPLEMENTED);
}
/**
 * ixgbe_reset_phy - Perform a PHY reset
 * @hw: pointer to hardware structure
 *
 * If the PHY type is still unknown, it is identified first; a failed
 * identification yields IXGBE_ERR_PHY and the reset op is not called.
 * Otherwise the PHY reset op is dispatched through ixgbe_call_func(),
 * with IXGBE_NOT_IMPLEMENTED as the fallback status.
 **/
s32 ixgbe_reset_phy(struct ixgbe_hw *hw)
{
	if (hw->phy.type == ixgbe_phy_unknown &&
	    ixgbe_identify_phy(hw) != 0)
		return IXGBE_ERR_PHY;

	return ixgbe_call_func(hw, hw->phy.ops.reset, (hw),
			       IXGBE_NOT_IMPLEMENTED);
}
/**
 * ixgbe_get_phy_firmware_version - Get the PHY firmware version
 * @hw: pointer to hardware structure
 * @firmware_version: pointer to firmware version
 *
 * Dispatches to the PHY get_firmware_version op, passing
 * @firmware_version through unchanged.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_get_phy_firmware_version(struct ixgbe_hw *hw, u16 *firmware_version)
{
	return ixgbe_call_func(hw, hw->phy.ops.get_firmware_version,
			       (hw, firmware_version),
			       IXGBE_NOT_IMPLEMENTED);
}
/**
 * ixgbe_read_phy_reg - Read PHY register
 * @hw: pointer to hardware structure
 * @reg_addr: 32 bit address of PHY register to read
 * @device_type: passed through unchanged to the PHY read_reg op
 * @phy_data: Pointer to read data from PHY register
 *
 * Reads a value from a specified PHY register via the PHY read_reg op.
 * When hw->phy.id is still 0, ixgbe_identify_phy() is called first; its
 * return value is not checked.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_read_phy_reg(struct ixgbe_hw *hw, u32 reg_addr, u32 device_type,
		       u16 *phy_data)
{
	s32 rc;

	if (!hw->phy.id)
		ixgbe_identify_phy(hw);

	rc = ixgbe_call_func(hw, hw->phy.ops.read_reg,
			     (hw, reg_addr, device_type, phy_data),
			     IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_write_phy_reg - Write PHY register
 * @hw: pointer to hardware structure
 * @reg_addr: 32 bit PHY register to write
 * @device_type: passed through unchanged to the PHY write_reg op
 * @phy_data: Data to write to the PHY register
 *
 * Writes a value to the specified PHY register via the PHY write_reg op.
 * When hw->phy.id is still 0, ixgbe_identify_phy() is called first; its
 * return value is not checked.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_write_phy_reg(struct ixgbe_hw *hw, u32 reg_addr, u32 device_type,
			u16 phy_data)
{
	s32 rc;

	if (!hw->phy.id)
		ixgbe_identify_phy(hw);

	rc = ixgbe_call_func(hw, hw->phy.ops.write_reg,
			     (hw, reg_addr, device_type, phy_data),
			     IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_setup_phy_link - Restart PHY autoneg
 * @hw: pointer to hardware structure
 *
 * Dispatches to the PHY setup_link op, which is expected to restart
 * autonegotiation on the PHY and wait for completion.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_setup_phy_link(struct ixgbe_hw *hw)
{
	s32 rc = ixgbe_call_func(hw, hw->phy.ops.setup_link, (hw),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_check_phy_link - Determine link and speed status
 * @hw: pointer to hardware structure
 * @speed: passed through to the PHY check_link op (output for link speed)
 * @link_up: passed through to the PHY check_link op (output for link state)
 *
 * Dispatches to the PHY check_link op, which is expected to read a PHY
 * register to determine if link is up and the current speed for the PHY.
 * IXGBE_NOT_IMPLEMENTED is handed to ixgbe_call_func() as the fallback
 * status.
 **/
s32 ixgbe_check_phy_link(struct ixgbe_hw *hw, ixgbe_link_speed *speed,
			 bool *link_up)
{
	s32 rc = ixgbe_call_func(hw, hw->phy.ops.check_link,
				 (hw, speed, link_up),
				 IXGBE_NOT_IMPLEMENTED);

	return rc;
}
/**
 * ixgbe_setup_phy_link_speed - Set auto advertise
 * @hw: pointer to hardware structure
 * @speed: new link speed
 * @autoneg: true if autonegotiation enabled
 * @autoneg_wait_to_complete: forwarded unchanged to the PHY op
 *
 * Sets the auto advertised capabilities by forwarding to
 * hw->phy.ops.setup_link_speed through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_setup_phy_link_speed(struct ixgbe_hw *hw, ixgbe_link_speed speed,
			       bool autoneg,
			       bool autoneg_wait_to_complete)
{
	s32 status = ixgbe_call_func(hw, hw->phy.ops.setup_link_speed,
				     (hw, speed, autoneg,
				      autoneg_wait_to_complete),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_setup_link - Configure link settings
 * @hw: pointer to hardware structure
 *
 * Configures link settings based on values in the ixgbe_hw struct.
 * Restarts the link. Performs autonegotiation if needed.
 * Forwards to hw->mac.ops.setup_link through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_setup_link(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.setup_link, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_check_link - Get link and speed status
 * @hw: pointer to hardware structure
 * @speed: location handed to the MAC op for the current link speed
 * @link_up: location handed to the MAC op for the link state
 * @link_up_wait_to_complete: forwarded unchanged to the MAC op
 *
 * Reads the links register to determine if link is up and the current speed.
 * Forwards to hw->mac.ops.check_link through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_check_link(struct ixgbe_hw *hw, ixgbe_link_speed *speed,
		     bool *link_up, bool link_up_wait_to_complete)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.check_link,
				     (hw, speed, link_up,
				      link_up_wait_to_complete),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_setup_link_speed - Set link speed
 * @hw: pointer to hardware structure
 * @speed: new link speed
 * @autoneg: true if autonegotiation enabled
 * @autoneg_wait_to_complete: forwarded unchanged to the MAC op
 *
 * Sets the link speed and restarts the link by forwarding to
 * hw->mac.ops.setup_link_speed through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_setup_link_speed(struct ixgbe_hw *hw, ixgbe_link_speed speed,
			   bool autoneg,
			   bool autoneg_wait_to_complete)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.setup_link_speed,
				     (hw, speed, autoneg,
				      autoneg_wait_to_complete),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_get_link_capabilities - Returns link capabilities
 * @hw: pointer to hardware structure
 * @speed: location handed to the MAC op for the link speed capabilities
 * @autoneg: location handed to the MAC op for the autonegotiation setting
 *
 * Determines the link capabilities of the current configuration by
 * forwarding to hw->mac.ops.get_link_capabilities through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_get_link_capabilities(struct ixgbe_hw *hw, ixgbe_link_speed *speed,
				bool *autoneg)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.get_link_capabilities,
				     (hw, speed, autoneg),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_led_on - Turn on LEDs
 * @hw: pointer to hardware structure
 * @index: led number to turn on
 *
 * Turns on the software controllable LEDs by forwarding to
 * hw->mac.ops.led_on through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_led_on(struct ixgbe_hw *hw, u32 index)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.led_on, (hw, index),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_led_off - Turn off LEDs
 * @hw: pointer to hardware structure
 * @index: led number to turn off
 *
 * Turns off the software controllable LEDs by forwarding to
 * hw->mac.ops.led_off through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_led_off(struct ixgbe_hw *hw, u32 index)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.led_off, (hw, index),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_blink_led_start - Blink LEDs
 * @hw: pointer to hardware structure
 * @index: led number to blink
 *
 * Starts blinking the LED selected by @index by forwarding to
 * hw->mac.ops.blink_led_start through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_blink_led_start(struct ixgbe_hw *hw, u32 index)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.blink_led_start,
				     (hw, index), IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_blink_led_stop - Stop blinking LEDs
 * @hw: pointer to hardware structure
 * @index: led number to stop blinking
 *
 * Stops blinking the LED selected by @index by forwarding to
 * hw->mac.ops.blink_led_stop through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_blink_led_stop(struct ixgbe_hw *hw, u32 index)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.blink_led_stop,
				     (hw, index), IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_init_eeprom_params - Initialize EEPROM parameters
 * @hw: pointer to hardware structure
 *
 * Initializes the EEPROM parameters ixgbe_eeprom_info within the
 * ixgbe_hw struct in order to set up EEPROM access. Forwards to
 * hw->eeprom.ops.init_params through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_init_eeprom_params(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->eeprom.ops.init_params, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_write_eeprom - Write word to EEPROM
 * @hw: pointer to hardware structure
 * @offset: offset within the EEPROM to be written to
 * @data: 16 bit word to be written to the EEPROM
 *
 * Writes 16 bit value to EEPROM. If ixgbe_update_eeprom_checksum is not
 * called after this function, the EEPROM will most likely contain an
 * invalid checksum. Forwards to hw->eeprom.ops.write through
 * ixgbe_call_func() (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_write_eeprom(struct ixgbe_hw *hw, u16 offset, u16 data)
{
	s32 status = ixgbe_call_func(hw, hw->eeprom.ops.write,
				     (hw, offset, data),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_read_eeprom - Read word from EEPROM
 * @hw: pointer to hardware structure
 * @offset: offset within the EEPROM to be read
 * @data: read 16 bit value from EEPROM
 *
 * Reads 16 bit value from EEPROM by forwarding to hw->eeprom.ops.read
 * through ixgbe_call_func() (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_read_eeprom(struct ixgbe_hw *hw, u16 offset, u16 *data)
{
	s32 status = ixgbe_call_func(hw, hw->eeprom.ops.read,
				     (hw, offset, data),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_validate_eeprom_checksum - Validate EEPROM checksum
 * @hw: pointer to hardware structure
 * @checksum_val: calculated checksum
 *
 * Performs checksum calculation and validates the EEPROM checksum by
 * forwarding to hw->eeprom.ops.validate_checksum through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_validate_eeprom_checksum(struct ixgbe_hw *hw, u16 *checksum_val)
{
	s32 status = ixgbe_call_func(hw, hw->eeprom.ops.validate_checksum,
				     (hw, checksum_val),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_update_eeprom_checksum - Updates the EEPROM checksum
 * @hw: pointer to hardware structure
 *
 * Forwards to hw->eeprom.ops.update_checksum through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_update_eeprom_checksum(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->eeprom.ops.update_checksum, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_insert_mac_addr - Find a RAR for this mac address
 * @hw: pointer to hardware structure
 * @addr: Address to put into receive address register
 * @vmdq: VMDq pool to assign
 *
 * Puts an ethernet address into a receive address register, or
 * finds the rar that it is already in; adds to the pool list.
 * Forwards to hw->mac.ops.insert_mac_addr through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_insert_mac_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.insert_mac_addr,
				     (hw, addr, vmdq),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_set_rar - Set Rx address register
 * @hw: pointer to hardware structure
 * @index: Receive address register to write
 * @addr: Address to put into receive address register
 * @vmdq: VMDq "set"
 * @enable_addr: set flag that address is active
 *
 * Puts an ethernet address into a receive address register by forwarding
 * to hw->mac.ops.set_rar through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_set_rar(struct ixgbe_hw *hw, u32 index, u8 *addr, u32 vmdq,
		  u32 enable_addr)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.set_rar,
				     (hw, index, addr, vmdq, enable_addr),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_clear_rar - Clear Rx address register
 * @hw: pointer to hardware structure
 * @index: Receive address register to clear
 *
 * Clears the receive address register selected by @index by forwarding to
 * hw->mac.ops.clear_rar through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_clear_rar(struct ixgbe_hw *hw, u32 index)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.clear_rar, (hw, index),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_set_vmdq - Associate a VMDq index with a receive address
 * @hw: pointer to hardware structure
 * @rar: receive address register index to associate with VMDq index
 * @vmdq: VMDq set or pool index
 *
 * Forwards to hw->mac.ops.set_vmdq through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_set_vmdq(struct ixgbe_hw *hw, u32 rar, u32 vmdq)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.set_vmdq,
				     (hw, rar, vmdq),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_clear_vmdq - Disassociate a VMDq index from a receive address
 * @hw: pointer to hardware structure
 * @rar: receive address register index to disassociate with VMDq index
 * @vmdq: VMDq set or pool index
 *
 * Forwards to hw->mac.ops.clear_vmdq through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_clear_vmdq(struct ixgbe_hw *hw, u32 rar, u32 vmdq)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.clear_vmdq,
				     (hw, rar, vmdq),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_init_rx_addrs - Initializes receive address filters.
 * @hw: pointer to hardware structure
 *
 * Places the MAC address in receive address register 0 and clears the rest
 * of the receive address registers. Clears the multicast table. Assumes
 * the receiver is in reset when the routine is called.
 * Forwards to hw->mac.ops.init_rx_addrs through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_init_rx_addrs(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.init_rx_addrs, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_get_num_rx_addrs - Returns the number of RAR entries.
 * @hw: pointer to hardware structure
 *
 * Reads hw->mac.num_rar_entries directly; no ops-table dispatch is involved.
 **/
u32 ixgbe_get_num_rx_addrs(struct ixgbe_hw *hw)
{
	u32 rar_entries = hw->mac.num_rar_entries;

	return rar_entries;
}
/**
 * ixgbe_update_uc_addr_list - Updates the MAC's list of secondary addresses
 * @hw: pointer to hardware structure
 * @addr_list: the list of new secondary (unicast) addresses
 * @addr_count: number of addresses
 * @func: iterator function to walk the address list
 *
 * The given list replaces any existing list. Clears the secondary addrs from
 * receive address registers. Uses unused receive address registers for the
 * first secondary addresses, and falls back to promiscuous mode as needed.
 * Forwards to hw->mac.ops.update_uc_addr_list through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_update_uc_addr_list(struct ixgbe_hw *hw, u8 *addr_list,
			      u32 addr_count, ixgbe_mc_addr_itr func)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.update_uc_addr_list,
				     (hw, addr_list, addr_count, func),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_update_mc_addr_list - Updates the MAC's list of multicast addresses
 * @hw: pointer to hardware structure
 * @mc_addr_list: the list of new multicast addresses
 * @mc_addr_count: number of addresses
 * @func: iterator function to walk the multicast address list
 *
 * The given list replaces any existing list. Clears the MC addrs from receive
 * address registers and the multicast table. Uses unused receive address
 * registers for the first multicast addresses, and hashes the rest into the
 * multicast table. Forwards to hw->mac.ops.update_mc_addr_list through
 * ixgbe_call_func() (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_update_mc_addr_list(struct ixgbe_hw *hw, u8 *mc_addr_list,
			      u32 mc_addr_count, ixgbe_mc_addr_itr func)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.update_mc_addr_list,
				     (hw, mc_addr_list, mc_addr_count, func),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_enable_mc - Enable multicast address in RAR
 * @hw: pointer to hardware structure
 *
 * Enables multicast address in RAR and the use of the multicast hash table
 * by forwarding to hw->mac.ops.enable_mc through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_enable_mc(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.enable_mc, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_disable_mc - Disable multicast address in RAR
 * @hw: pointer to hardware structure
 *
 * Disables multicast address in RAR and the use of the multicast hash table
 * by forwarding to hw->mac.ops.disable_mc through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_disable_mc(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.disable_mc, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_clear_vfta - Clear VLAN filter table
 * @hw: pointer to hardware structure
 *
 * Clears the VLAN filter table, and the VMDq index associated with the
 * filter, by forwarding to hw->mac.ops.clear_vfta through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_clear_vfta(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.clear_vfta, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_set_vfta - Set VLAN filter table
 * @hw: pointer to hardware structure
 * @vlan: VLAN id to write to VLAN filter
 * @vind: VMDq output index that maps queue to VLAN id in VFTA
 * @vlan_on: boolean flag to turn on/off VLAN in VFTA
 *
 * Turn on/off specified VLAN in the VLAN filter table by forwarding to
 * hw->mac.ops.set_vfta through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.set_vfta,
				     (hw, vlan, vind, vlan_on),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_fc_enable - Enable flow control
 * @hw: pointer to hardware structure
 * @packetbuf_num: packet buffer number (0-7)
 *
 * Configures the flow control settings based on SW configuration by
 * forwarding to hw->mac.ops.fc_enable through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_fc_enable(struct ixgbe_hw *hw, s32 packetbuf_num)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.fc_enable,
				     (hw, packetbuf_num),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_read_analog_reg8 - Reads 8 bit analog register
 * @hw: pointer to hardware structure
 * @reg: analog register to read
 * @val: read value
 *
 * Performs read operation on the analog register specified by forwarding
 * to hw->mac.ops.read_analog_reg8 through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_read_analog_reg8(struct ixgbe_hw *hw, u32 reg, u8 *val)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.read_analog_reg8,
				     (hw, reg, val),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_write_analog_reg8 - Writes 8 bit analog register
 * @hw: pointer to hardware structure
 * @reg: analog register to write
 * @val: value to write
 *
 * Performs write operation to Atlas analog register specified by forwarding
 * to hw->mac.ops.write_analog_reg8 through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_write_analog_reg8(struct ixgbe_hw *hw, u32 reg, u8 val)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.write_analog_reg8,
				     (hw, reg, val),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_init_uta_tables - Initializes Unicast Table Arrays.
 * @hw: pointer to hardware structure
 *
 * Initializes the Unicast Table Arrays to zero on device load. This
 * is part of the Rx init addr execution path. Forwards to
 * hw->mac.ops.init_uta_tables through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_init_uta_tables(struct ixgbe_hw *hw)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.init_uta_tables, (hw),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_read_i2c_byte - Reads 8 bit word over I2C at specified device address
 * @hw: pointer to hardware structure
 * @byte_offset: byte offset to read
 * @dev_addr: I2C device address, forwarded unchanged to the PHY op
 * @data: value read
 *
 * Performs byte read operation to SFP module's EEPROM over I2C interface by
 * forwarding to hw->phy.ops.read_i2c_byte through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_read_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr,
			u8 *data)
{
	s32 status = ixgbe_call_func(hw, hw->phy.ops.read_i2c_byte,
				     (hw, byte_offset, dev_addr, data),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_write_i2c_byte - Writes 8 bit word over I2C
 * @hw: pointer to hardware structure
 * @byte_offset: byte offset to write
 * @dev_addr: I2C device address, forwarded unchanged to the PHY op
 * @data: value to write
 *
 * Performs byte write operation to SFP module's EEPROM over I2C interface
 * at a specified device address, by forwarding to
 * hw->phy.ops.write_i2c_byte through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_write_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr,
			 u8 data)
{
	s32 status = ixgbe_call_func(hw, hw->phy.ops.write_i2c_byte,
				     (hw, byte_offset, dev_addr, data),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_write_i2c_eeprom - Writes 8 bit EEPROM word over I2C interface
 * @hw: pointer to hardware structure
 * @byte_offset: EEPROM byte offset to write
 * @eeprom_data: value to write
 *
 * Performs byte write operation to SFP module's EEPROM over I2C interface
 * by forwarding to hw->phy.ops.write_i2c_eeprom through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_write_i2c_eeprom(struct ixgbe_hw *hw,
			   u8 byte_offset, u8 eeprom_data)
{
	s32 status = ixgbe_call_func(hw, hw->phy.ops.write_i2c_eeprom,
				     (hw, byte_offset, eeprom_data),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_read_i2c_eeprom - Reads 8 bit EEPROM word over I2C interface
 * @hw: pointer to hardware structure
 * @byte_offset: EEPROM byte offset to read
 * @eeprom_data: value read
 *
 * Performs byte read operation to SFP module's EEPROM over I2C interface
 * by forwarding to hw->phy.ops.read_i2c_eeprom through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_read_i2c_eeprom(struct ixgbe_hw *hw, u8 byte_offset, u8 *eeprom_data)
{
	s32 status = ixgbe_call_func(hw, hw->phy.ops.read_i2c_eeprom,
				     (hw, byte_offset, eeprom_data),
				     IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_get_supported_physical_layer - Returns physical layer type
 * @hw: pointer to hardware structure
 *
 * Determines physical layer capabilities of the current configuration by
 * forwarding to hw->mac.ops.get_supported_physical_layer through
 * ixgbe_call_func(). Unlike the s32 wrappers in this file, the fallback
 * value supplied here is IXGBE_PHYSICAL_LAYER_UNKNOWN.
 **/
u32 ixgbe_get_supported_physical_layer(struct ixgbe_hw *hw)
{
	u32 layer = ixgbe_call_func(hw,
				    hw->mac.ops.get_supported_physical_layer,
				    (hw), IXGBE_PHYSICAL_LAYER_UNKNOWN);

	return layer;
}
/**
 * ixgbe_enable_rx_dma - Enables Rx DMA unit, dependent on device specifics
 * @hw: pointer to hardware structure
 * @regval: bitfield to write to the Rx DMA register
 *
 * Enables the Rx DMA unit of the device by forwarding to
 * hw->mac.ops.enable_rx_dma through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_enable_rx_dma(struct ixgbe_hw *hw, u32 regval)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.enable_rx_dma,
				     (hw, regval), IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_acquire_swfw_semaphore - Acquire SWFW semaphore
 * @hw: pointer to hardware structure
 * @mask: Mask to specify which semaphore to acquire
 *
 * Acquires the SWFW semaphore through SW_FW_SYNC register for the specified
 * function (CSR, PHY0, PHY1, EEPROM, Flash). Forwards to
 * hw->mac.ops.acquire_swfw_sync through ixgbe_call_func()
 * (fallback code: IXGBE_NOT_IMPLEMENTED).
 **/
s32 ixgbe_acquire_swfw_semaphore(struct ixgbe_hw *hw, u16 mask)
{
	s32 status = ixgbe_call_func(hw, hw->mac.ops.acquire_swfw_sync,
				     (hw, mask), IXGBE_NOT_IMPLEMENTED);

	return status;
}
/**
 * ixgbe_release_swfw_semaphore - Release SWFW semaphore
 * @hw: pointer to hardware structure
 * @mask: Mask to specify which semaphore to release
 *
 * Releases the SWFW semaphore through SW_FW_SYNC register for the specified
 * function (CSR, PHY0, PHY1, EEPROM, Flash). Does nothing when
 * hw->mac.ops.release_swfw_sync is not set.
 **/
void ixgbe_release_swfw_semaphore(struct ixgbe_hw *hw, u16 mask)
{
	/* No release op installed: nothing to do. */
	if (!hw->mac.ops.release_swfw_sync)
		return;

	hw->mac.ops.release_swfw_sync(hw, mask);
}

View File

@ -0,0 +1,163 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#ifndef _IXGBE_API_H_
#define _IXGBE_API_H_
/*
 * Prototypes for the ixgbe_* API entry points. Many of these are thin
 * wrappers that dispatch through the ops tables held in struct ixgbe_hw
 * (hw->mac.ops, hw->phy.ops, hw->eeprom.ops).
 */
#include "ixgbe_type.h"
s32 ixgbe_init_shared_code(struct ixgbe_hw *hw);
s32 ixgbe_set_mac_type(struct ixgbe_hw *hw);
s32 ixgbe_init_hw(struct ixgbe_hw *hw);
s32 ixgbe_reset_hw(struct ixgbe_hw *hw);
s32 ixgbe_start_hw(struct ixgbe_hw *hw);
s32 ixgbe_clear_hw_cntrs(struct ixgbe_hw *hw);
enum ixgbe_media_type ixgbe_get_media_type(struct ixgbe_hw *hw);
s32 ixgbe_get_mac_addr(struct ixgbe_hw *hw, u8 *mac_addr);
s32 ixgbe_get_bus_info(struct ixgbe_hw *hw);
u32 ixgbe_get_num_of_tx_queues(struct ixgbe_hw *hw);
u32 ixgbe_get_num_of_rx_queues(struct ixgbe_hw *hw);
s32 ixgbe_stop_adapter(struct ixgbe_hw *hw);
s32 ixgbe_read_pba_num(struct ixgbe_hw *hw, u32 *pba_num);
/* PHY identification, register access and PHY-level link control */
s32 ixgbe_identify_phy(struct ixgbe_hw *hw);
s32 ixgbe_reset_phy(struct ixgbe_hw *hw);
s32 ixgbe_read_phy_reg(struct ixgbe_hw *hw, u32 reg_addr, u32 device_type,
u16 *phy_data);
s32 ixgbe_write_phy_reg(struct ixgbe_hw *hw, u32 reg_addr, u32 device_type,
u16 phy_data);
s32 ixgbe_setup_phy_link(struct ixgbe_hw *hw);
s32 ixgbe_check_phy_link(struct ixgbe_hw *hw,
ixgbe_link_speed *speed,
bool *link_up);
s32 ixgbe_setup_phy_link_speed(struct ixgbe_hw *hw,
ixgbe_link_speed speed,
bool autoneg,
bool autoneg_wait_to_complete);
/* MAC-level link control */
s32 ixgbe_setup_link(struct ixgbe_hw *hw);
s32 ixgbe_setup_link_speed(struct ixgbe_hw *hw, ixgbe_link_speed speed,
bool autoneg, bool autoneg_wait_to_complete);
s32 ixgbe_check_link(struct ixgbe_hw *hw, ixgbe_link_speed *speed,
bool *link_up, bool link_up_wait_to_complete);
s32 ixgbe_get_link_capabilities(struct ixgbe_hw *hw, ixgbe_link_speed *speed,
bool *autoneg);
/* LED control */
s32 ixgbe_led_on(struct ixgbe_hw *hw, u32 index);
s32 ixgbe_led_off(struct ixgbe_hw *hw, u32 index);
s32 ixgbe_blink_led_start(struct ixgbe_hw *hw, u32 index);
s32 ixgbe_blink_led_stop(struct ixgbe_hw *hw, u32 index);
/* EEPROM access */
s32 ixgbe_init_eeprom_params(struct ixgbe_hw *hw);
s32 ixgbe_write_eeprom(struct ixgbe_hw *hw, u16 offset, u16 data);
s32 ixgbe_read_eeprom(struct ixgbe_hw *hw, u16 offset, u16 *data);
s32 ixgbe_validate_eeprom_checksum(struct ixgbe_hw *hw, u16 *checksum_val);
s32 ixgbe_update_eeprom_checksum(struct ixgbe_hw *hw);
/* Receive address registers (RAR), VMDq association and address lists */
s32 ixgbe_insert_mac_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq);
s32 ixgbe_set_rar(struct ixgbe_hw *hw, u32 index, u8 *addr, u32 vmdq,
u32 enable_addr);
s32 ixgbe_clear_rar(struct ixgbe_hw *hw, u32 index);
s32 ixgbe_set_vmdq(struct ixgbe_hw *hw, u32 rar, u32 vmdq);
s32 ixgbe_clear_vmdq(struct ixgbe_hw *hw, u32 rar, u32 vmdq);
s32 ixgbe_init_rx_addrs(struct ixgbe_hw *hw);
u32 ixgbe_get_num_rx_addrs(struct ixgbe_hw *hw);
s32 ixgbe_update_uc_addr_list(struct ixgbe_hw *hw, u8 *addr_list,
u32 addr_count, ixgbe_mc_addr_itr func);
s32 ixgbe_update_mc_addr_list(struct ixgbe_hw *hw, u8 *mc_addr_list,
u32 mc_addr_count, ixgbe_mc_addr_itr func);
/* NOTE(review): ixgbe_add_uc_addr is also declared in ixgbe_common.h */
void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr_list, u32 vmdq);
s32 ixgbe_enable_mc(struct ixgbe_hw *hw);
s32 ixgbe_disable_mc(struct ixgbe_hw *hw);
/* VLAN filter table and flow control */
s32 ixgbe_clear_vfta(struct ixgbe_hw *hw);
s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan,
u32 vind, bool vlan_on);
s32 ixgbe_fc_enable(struct ixgbe_hw *hw, s32 packetbuf_num);
void ixgbe_set_mta(struct ixgbe_hw *hw, u8 *mc_addr);
s32 ixgbe_get_phy_firmware_version(struct ixgbe_hw *hw,
u16 *firmware_version);
s32 ixgbe_read_analog_reg8(struct ixgbe_hw *hw, u32 reg, u8 *val);
s32 ixgbe_write_analog_reg8(struct ixgbe_hw *hw, u32 reg, u8 val);
s32 ixgbe_init_uta_tables(struct ixgbe_hw *hw);
s32 ixgbe_read_i2c_eeprom(struct ixgbe_hw *hw, u8 byte_offset, u8 *eeprom_data);
u32 ixgbe_get_supported_physical_layer(struct ixgbe_hw *hw);
s32 ixgbe_enable_rx_dma(struct ixgbe_hw *hw, u32 regval);
/*
 * 82599-specific flow director (fdir) and ATR input helpers.
 * NOTE(review): presumably implemented in the 82599 source file rather
 * than alongside the generic API wrappers; confirm.
 */
s32 ixgbe_reinit_fdir_tables_82599(struct ixgbe_hw *hw);
s32 ixgbe_init_fdir_signature_82599(struct ixgbe_hw *hw, u32 pballoc);
s32 ixgbe_init_fdir_perfect_82599(struct ixgbe_hw *hw, u32 pballoc);
s32 ixgbe_fdir_add_signature_filter_82599(struct ixgbe_hw *hw,
struct ixgbe_atr_input *input,
u8 queue);
s32 ixgbe_fdir_add_perfect_filter_82599(struct ixgbe_hw *hw,
struct ixgbe_atr_input *input,
u16 soft_id,
u8 queue);
u16 ixgbe_atr_compute_hash_82599(struct ixgbe_atr_input *input, u32 key);
/* Setters for the fields of struct ixgbe_atr_input */
s32 ixgbe_atr_set_vlan_id_82599(struct ixgbe_atr_input *input, u16 vlan_id);
s32 ixgbe_atr_set_src_ipv4_82599(struct ixgbe_atr_input *input, u32 src_addr);
s32 ixgbe_atr_set_dst_ipv4_82599(struct ixgbe_atr_input *input, u32 dst_addr);
s32 ixgbe_atr_set_src_ipv6_82599(struct ixgbe_atr_input *input, u32 src_addr_1,
u32 src_addr_2, u32 src_addr_3,
u32 src_addr_4);
s32 ixgbe_atr_set_dst_ipv6_82599(struct ixgbe_atr_input *input, u32 dst_addr_1,
u32 dst_addr_2, u32 dst_addr_3,
u32 dst_addr_4);
s32 ixgbe_atr_set_src_port_82599(struct ixgbe_atr_input *input, u16 src_port);
s32 ixgbe_atr_set_dst_port_82599(struct ixgbe_atr_input *input, u16 dst_port);
s32 ixgbe_atr_set_flex_byte_82599(struct ixgbe_atr_input *input, u16 flex_byte);
s32 ixgbe_atr_set_vm_pool_82599(struct ixgbe_atr_input *input, u8 vm_pool);
s32 ixgbe_atr_set_l4type_82599(struct ixgbe_atr_input *input, u8 l4type);
/* Getters for the fields of struct ixgbe_atr_input */
s32 ixgbe_atr_get_vlan_id_82599(struct ixgbe_atr_input *input, u16 *vlan_id);
s32 ixgbe_atr_get_src_ipv4_82599(struct ixgbe_atr_input *input, u32 *src_addr);
s32 ixgbe_atr_get_dst_ipv4_82599(struct ixgbe_atr_input *input, u32 *dst_addr);
s32 ixgbe_atr_get_src_ipv6_82599(struct ixgbe_atr_input *input, u32 *src_addr_1,
u32 *src_addr_2, u32 *src_addr_3,
u32 *src_addr_4);
s32 ixgbe_atr_get_dst_ipv6_82599(struct ixgbe_atr_input *input, u32 *dst_addr_1,
u32 *dst_addr_2, u32 *dst_addr_3,
u32 *dst_addr_4);
s32 ixgbe_atr_get_src_port_82599(struct ixgbe_atr_input *input, u16 *src_port);
s32 ixgbe_atr_get_dst_port_82599(struct ixgbe_atr_input *input, u16 *dst_port);
s32 ixgbe_atr_get_flex_byte_82599(struct ixgbe_atr_input *input,
u16 *flex_byte);
s32 ixgbe_atr_get_vm_pool_82599(struct ixgbe_atr_input *input, u8 *vm_pool);
s32 ixgbe_atr_get_l4type_82599(struct ixgbe_atr_input *input, u8 *l4type);
/* I2C byte access, SAN MAC address, device capabilities, SW/FW semaphore */
s32 ixgbe_read_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr,
u8 *data);
s32 ixgbe_write_i2c_byte(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr,
u8 data);
s32 ixgbe_write_i2c_eeprom(struct ixgbe_hw *hw, u8 byte_offset, u8 eeprom_data);
s32 ixgbe_get_san_mac_addr(struct ixgbe_hw *hw, u8 *san_mac_addr);
s32 ixgbe_set_san_mac_addr(struct ixgbe_hw *hw, u8 *san_mac_addr);
s32 ixgbe_get_device_caps(struct ixgbe_hw *hw, u16 *device_caps);
s32 ixgbe_acquire_swfw_semaphore(struct ixgbe_hw *hw, u16 mask);
void ixgbe_release_swfw_semaphore(struct ixgbe_hw *hw, u16 mask);
#endif /* _IXGBE_API_H_ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,83 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#ifndef _IXGBE_COMMON_H_
#define _IXGBE_COMMON_H_
/*
 * Prototypes for the *_generic routines and shared helpers.
 * NOTE(review): the *_generic functions are presumably the default
 * implementations installed into the ops tables of struct ixgbe_hw;
 * confirm against ixgbe_init_ops_generic().
 */
#include "ixgbe_type.h"
s32 ixgbe_init_ops_generic(struct ixgbe_hw *hw);
s32 ixgbe_init_hw_generic(struct ixgbe_hw *hw);
s32 ixgbe_start_hw_generic(struct ixgbe_hw *hw);
s32 ixgbe_clear_hw_cntrs_generic(struct ixgbe_hw *hw);
s32 ixgbe_read_pba_num_generic(struct ixgbe_hw *hw, u32 *pba_num);
s32 ixgbe_get_mac_addr_generic(struct ixgbe_hw *hw, u8 *mac_addr);
s32 ixgbe_get_bus_info_generic(struct ixgbe_hw *hw);
void ixgbe_set_lan_id_multi_port_pcie(struct ixgbe_hw *hw);
s32 ixgbe_stop_adapter_generic(struct ixgbe_hw *hw);
s32 ixgbe_led_on_generic(struct ixgbe_hw *hw, u32 index);
s32 ixgbe_led_off_generic(struct ixgbe_hw *hw, u32 index);
/* EEPROM access */
s32 ixgbe_init_eeprom_params_generic(struct ixgbe_hw *hw);
s32 ixgbe_write_eeprom_generic(struct ixgbe_hw *hw, u16 offset, u16 data);
s32 ixgbe_read_eeprom_generic(struct ixgbe_hw *hw, u16 offset, u16 *data);
s32 ixgbe_read_eeprom_bit_bang_generic(struct ixgbe_hw *hw, u16 offset,
u16 *data);
s32 ixgbe_validate_eeprom_checksum_generic(struct ixgbe_hw *hw,
u16 *checksum_val);
s32 ixgbe_update_eeprom_checksum_generic(struct ixgbe_hw *hw);
/* Receive address registers and unicast/multicast address lists */
s32 ixgbe_set_rar_generic(struct ixgbe_hw *hw, u32 index, u8 *addr, u32 vmdq,
u32 enable_addr);
s32 ixgbe_clear_rar_generic(struct ixgbe_hw *hw, u32 index);
s32 ixgbe_init_rx_addrs_generic(struct ixgbe_hw *hw);
s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list,
u32 mc_addr_count,
ixgbe_mc_addr_itr func);
s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list,
u32 addr_count, ixgbe_mc_addr_itr func);
/* NOTE(review): ixgbe_add_uc_addr is also declared in ixgbe_api.h */
void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq);
s32 ixgbe_enable_mc_generic(struct ixgbe_hw *hw);
s32 ixgbe_disable_mc_generic(struct ixgbe_hw *hw);
s32 ixgbe_enable_rx_dma_generic(struct ixgbe_hw *hw, u32 regval);
/* Flow control */
s32 ixgbe_setup_fc(struct ixgbe_hw *hw, s32 packetbuf_num);
/* NOTE(review): "packtetbuf_num" below looks like a typo for "packetbuf_num" */
s32 ixgbe_fc_enable_generic(struct ixgbe_hw *hw, s32 packtetbuf_num);
s32 ixgbe_fc_autoneg(struct ixgbe_hw *hw);
s32 ixgbe_validate_mac_addr(u8 *mac_addr);
/* SW/FW semaphore, PCIe master disable, analog registers, LED blinking */
s32 ixgbe_acquire_swfw_sync(struct ixgbe_hw *hw, u16 mask);
void ixgbe_release_swfw_sync(struct ixgbe_hw *hw, u16 mask);
s32 ixgbe_disable_pcie_master(struct ixgbe_hw *hw);
s32 ixgbe_read_analog_reg8_generic(struct ixgbe_hw *hw, u32 reg, u8 *val);
s32 ixgbe_write_analog_reg8_generic(struct ixgbe_hw *hw, u32 reg, u8 val);
s32 ixgbe_blink_led_start_generic(struct ixgbe_hw *hw, u32 index);
s32 ixgbe_blink_led_stop_generic(struct ixgbe_hw *hw, u32 index);
#endif /* _IXGBE_COMMON_H_ */

View File

@ -0,0 +1,350 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include "ixgbe_type.h"
#include "ixgbe_dcb.h"
#include "ixgbe_dcb_82598.h"
#include "ixgbe_dcb_82599.h"
/**
* ixgbe_dcb_config - Struct containing DCB settings.
* @dcb_config: Pointer to DCB config structure
*
* This function checks DCB rules for DCB settings.
* The following rules are checked:
* 1. The sum of bandwidth percentages of all Bandwidth Groups must total 100%.
* 2. The sum of bandwidth percentages of all Traffic Classes within a Bandwidth
* Group must total 100.
* 3. A Traffic Class should not be set to both Link Strict Priority
* and Group Strict Priority.
* 4. Link strict Bandwidth Groups can only have link strict traffic classes
* with zero bandwidth.
*/
s32 ixgbe_dcb_check_config(struct ixgbe_dcb_config *dcb_config)
{
struct tc_bw_alloc *p;
s32 ret_val = 0;
u8 i, j, bw = 0, bw_id;
u8 bw_sum[2][MAX_BW_GROUP];
bool link_strict[2][MAX_BW_GROUP];
memset(bw_sum, 0, sizeof(bw_sum));
memset(link_strict, 0, sizeof(link_strict));
/* First Tx, then Rx */
for (i = 0; i < 2; i++) {
/* Check each traffic class for rule violation */
for (j = 0; j < MAX_TRAFFIC_CLASS; j++) {
p = &dcb_config->tc_config[j].path[i];
bw = p->bwg_percent;
bw_id = p->bwg_id;
if (bw_id >= MAX_BW_GROUP) {
ret_val = DCB_ERR_CONFIG;
goto err_config;
}
if (p->prio_type == prio_link) {
link_strict[i][bw_id] = true;
/* Link strict should have zero bandwidth */
if (bw) {
ret_val = DCB_ERR_LS_BW_NONZERO;
goto err_config;
}
} else if (!bw) {
/*
* Traffic classes without link strict
* should have non-zero bandwidth.
*/
ret_val = DCB_ERR_TC_BW_ZERO;
goto err_config;
}
bw_sum[i][bw_id] += bw;
}
bw = 0;
/* Check each bandwidth group for rule violation */
for (j = 0; j < MAX_BW_GROUP; j++) {
bw += dcb_config->bw_percentage[i][j];
/*
* Sum of bandwidth percentages of all traffic classes
* within a Bandwidth Group must total 100 except for
* link strict group (zero bandwidth).
*/
if (link_strict[i][j]) {
if (bw_sum[i][j]) {
/*
* Link strict group should have zero
* bandwidth.
*/
ret_val = DCB_ERR_LS_BWG_NONZERO;
goto err_config;
}
} else if (bw_sum[i][j] != BW_PERCENT &&
bw_sum[i][j] != 0) {
ret_val = DCB_ERR_TC_BW;
goto err_config;
}
}
if (bw != BW_PERCENT) {
ret_val = DCB_ERR_BW_GROUP;
goto err_config;
}
}
return DCB_SUCCESS;
err_config:
hw_dbg(hw, "DCB error code %d while checking %s settings.\n",
ret_val, (j == DCB_TX_CONFIG) ? "Tx" : "Rx");
return ret_val;
}
/**
 * ixgbe_dcb_calculate_tc_credits - Calculates traffic class credits
 * @dcb_config: Struct containing DCB settings.
 * @direction: Configuring either Tx or Rx.
 *
 * This function calculates the credits allocated to each traffic class.
 * It should be called only after the rules are checked by
 * ixgbe_dcb_check_config().
 *
 * For every traffic class it stores link_percent, data_credits_refill and
 * data_credits_max in the path selected by @direction, and for
 * DCB_TX_CONFIG also desc_credits_max in the traffic class itself.
 *
 * Returns DCB_ERR_CONFIG when @dcb_config is NULL, otherwise 0.
 */
s32 ixgbe_dcb_calculate_tc_credits(struct ixgbe_dcb_config *dcb_config,
				   u8 direction)
{
	u8 tc;

	if (dcb_config == NULL)
		return DCB_ERR_CONFIG;

	/* Find out the link percentage for each TC first */
	for (tc = 0; tc < MAX_TRAFFIC_CLASS; tc++) {
		struct tc_bw_alloc *path =
			&dcb_config->tc_config[tc].path[direction];
		u8 group_percent =
			dcb_config->bw_percentage[direction][path->bwg_id];
		u16 link_share = path->bwg_percent;
		u32 refill;
		u32 max_credit;

		/* Must be careful of integer division for very small nums */
		link_share = (link_share * group_percent) / 100;
		if (path->bwg_percent > 0 && link_share == 0)
			link_share = 1;

		/* Save the link percentage for reference */
		path->link_percent = (u8)link_share;

		/* Calculate credit refill and save it */
		refill = link_share * MINIMUM_CREDIT_REFILL;
		path->data_credits_refill = (u16)refill;

		/* Calculate maximum credit for the TC */
		max_credit = (link_share * MAX_CREDIT) / 100;

		/*
		 * If the percentage of a TC is too small, the maximum credit
		 * may not be enough to send out a jumbo frame in data plane
		 * arbitration, so raise it to the jumbo minimum.
		 */
		if (max_credit && (max_credit < MINIMUM_CREDIT_FOR_JUMBO))
			max_credit = MINIMUM_CREDIT_FOR_JUMBO;

		if (direction == DCB_TX_CONFIG) {
			/*
			 * Likewise, a too-small maximum credit may not be
			 * enough to send out a TSO packet in descriptor
			 * plane arbitration, so raise it to the TSO minimum.
			 */
			if (max_credit &&
			    (max_credit < MINIMUM_CREDIT_FOR_TSO))
				max_credit = MINIMUM_CREDIT_FOR_TSO;

			dcb_config->tc_config[tc].desc_credits_max =
				(u16)max_credit;
		}

		path->data_credits_max = (u16)max_credit;
	}

	return 0;
}
/**
 * ixgbe_dcb_get_tc_stats - Collect per-traffic-class statistics
 * @hw: pointer to hardware structure
 * @stats: pointer to statistics structure
 * @tc_count: traffic class count, forwarded to the MAC-specific handler
 *
 * Dispatches on hw->mac.type to the 82598 or 82599 implementation.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_get_tc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats,
			   u8 tc_count)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_get_tc_stats_82598(hw, stats, tc_count);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_get_tc_stats_82599(hw, stats, tc_count);
	default:
		return 0;
	}
}
/**
 * ixgbe_dcb_get_pfc_stats - Collect CBFC status of each traffic class
 * @hw: pointer to hardware structure
 * @stats: pointer to statistics structure
 * @tc_count: traffic class count, forwarded to the MAC-specific handler
 *
 * Dispatches on hw->mac.type to the 82598 or 82599 implementation.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_get_pfc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats,
			    u8 tc_count)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_get_pfc_stats_82598(hw, stats, tc_count);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_get_pfc_stats_82599(hw, stats, tc_count);
	default:
		return 0;
	}
}
/**
 * ixgbe_dcb_config_rx_arbiter - Program the Rx arbiter
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Hands Rx data arbiter and per-traffic-class credit setup to the 82598 or
 * 82599 implementation selected by hw->mac.type.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_config_rx_arbiter(struct ixgbe_hw *hw,
				struct ixgbe_dcb_config *dcb_config)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_config_rx_arbiter_82598(hw, dcb_config);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_config_rx_arbiter_82599(hw, dcb_config);
	default:
		return 0;
	}
}
/**
 * ixgbe_dcb_config_tx_desc_arbiter - Program the Tx descriptor arbiter
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Hands Tx descriptor arbiter and per-traffic-class credit setup to the
 * 82598 or 82599 implementation selected by hw->mac.type.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_config_tx_desc_arbiter(struct ixgbe_hw *hw,
				     struct ixgbe_dcb_config *dcb_config)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_config_tx_desc_arbiter_82598(hw, dcb_config);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_config_tx_desc_arbiter_82599(hw, dcb_config);
	default:
		return 0;
	}
}
/**
 * ixgbe_dcb_config_tx_data_arbiter - Program the Tx data arbiter
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Hands Tx data arbiter and per-traffic-class credit setup to the 82598 or
 * 82599 implementation selected by hw->mac.type.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_config_tx_data_arbiter(struct ixgbe_hw *hw,
				     struct ixgbe_dcb_config *dcb_config)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_config_tx_data_arbiter_82598(hw, dcb_config);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_config_tx_data_arbiter_82599(hw, dcb_config);
	default:
		return 0;
	}
}
/**
 * ixgbe_dcb_config_pfc - Program priority flow control
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Hands per-traffic-class Priority Flow Control setup to the 82598 or 82599
 * implementation selected by hw->mac.type.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_config_pfc(struct ixgbe_hw *hw,
			 struct ixgbe_dcb_config *dcb_config)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_config_pfc_82598(hw, dcb_config);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_config_pfc_82599(hw, dcb_config);
	default:
		return 0;
	}
}
/**
 * ixgbe_dcb_config_tc_stats - Program traffic class statistics mapping
 * @hw: pointer to hardware structure
 *
 * Configure queue statistics registers so that all queues belonging to the
 * same traffic class share a single set of queue statistics counters.  The
 * work is done by the 82598 or 82599 implementation chosen by hw->mac.type.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_config_tc_stats(struct ixgbe_hw *hw)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_config_tc_stats_82598(hw);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_config_tc_stats_82599(hw);
	default:
		return 0;
	}
}
/**
 * ixgbe_dcb_hw_config - Apply DCB settings and enable DCB mode
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Hands the full DCB hardware configuration to the 82598 or 82599
 * implementation selected by hw->mac.type.
 * Returns that implementation's result, or 0 for any other MAC type.
 */
s32 ixgbe_dcb_hw_config(struct ixgbe_hw *hw,
			struct ixgbe_dcb_config *dcb_config)
{
	switch (hw->mac.type) {
	case ixgbe_mac_82598EB:
		return ixgbe_dcb_hw_config_82598(hw, dcb_config);
	case ixgbe_mac_82599EB:
		return ixgbe_dcb_hw_config_82599(hw, dcb_config);
	default:
		return 0;
	}
}

View File

@ -0,0 +1,193 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#ifndef _DCB_CONFIG_H_
#define _DCB_CONFIG_H_
#include "ixgbe_type.h"
/* DCB data structures */
/* Number of Rx/Tx packet buffers programmed by the packet-buffer setup code */
#define IXGBE_MAX_PACKET_BUFFERS 8
#define MAX_USER_PRIORITY 8
/* Number of entries in ixgbe_dcb_config.tc_config[] */
#define MAX_TRAFFIC_CLASS 8
/* Number of bandwidth groups per direction in bw_percentage[][] */
#define MAX_BW_GROUP 8
/* Total that bandwidth percentages are checked against */
#define BW_PERCENT 100
/* Direction indices into tc_configuration.path[] and bw_percentage[] */
#define DCB_TX_CONFIG 0
#define DCB_RX_CONFIG 1
/* DCB error Codes */
#define DCB_SUCCESS 0
#define DCB_ERR_CONFIG -1
#define DCB_ERR_PARAM -2
/* Transmit and receive Errors */
/* Error in bandwidth group allocation */
#define DCB_ERR_BW_GROUP -3
/* Error in traffic class bandwidth allocation */
#define DCB_ERR_TC_BW -4
/* Traffic class has both link strict and group strict enabled */
#define DCB_ERR_LS_GS -5
/* Link strict traffic class has non zero bandwidth */
#define DCB_ERR_LS_BW_NONZERO -6
/* Link strict bandwidth group has non zero bandwidth */
#define DCB_ERR_LS_BWG_NONZERO -7
/* Traffic class has zero bandwidth */
#define DCB_ERR_TC_BW_ZERO -8
/* NOTE(review): not referenced by the DCB code visible here - confirm users */
#define DCB_NOT_IMPLEMENTED 0x7FFFFFFF
/*
 * Per-traffic-class pause information.
 * NOTE(review): not referenced by the DCB code visible here; field meanings
 * below are inferred from the names only - confirm against the users.
 */
struct dcb_pfc_tc_debug {
u8 tc; /* presumably the traffic class index */
u8 pause_status;
u64 pause_quanta;
};
/*
 * Strict priority mode of a traffic class.
 * The arbiter setup code sets the GSP register bit for prio_group and the
 * LSP bit for prio_link; ixgbe_dcb_check_config() requires a prio_link
 * class to have zero bandwidth.
 */
enum strict_prio_type {
prio_none = 0,
prio_group,
prio_link
};
/* DCB capability definitions (bit flags for dcb_support.capabilities) */
#define IXGBE_DCB_PG_SUPPORT 0x00000001
#define IXGBE_DCB_PFC_SUPPORT 0x00000002
#define IXGBE_DCB_BCN_SUPPORT 0x00000004
#define IXGBE_DCB_UP2TC_SUPPORT 0x00000008
#define IXGBE_DCB_GSP_SUPPORT 0x00000010
/* Value for dcb_support.traffic_classes when 8 TCs can be configured */
#define IXGBE_DCB_8_TC_SUPPORT 0x80
/* What the hardware supports in terms of DCB features and TC counts */
struct dcb_support {
/* DCB capabilities */
u32 capabilities;
/* Each bit represents a number of TCs configurable in the hw.
* If 8 traffic classes can be configured, the value is 0x80.
*/
u8 traffic_classes;
/* NOTE(review): presumably the same encoding, for PFC-capable TCs - confirm */
u8 pfc_traffic_classes;
};
/*
 * Traffic class bandwidth allocation per direction.
 * link_percent, data_credits_refill and data_credits_max are filled in by
 * ixgbe_dcb_calculate_tc_credits() from bwg_id and bwg_percent.
 */
struct tc_bw_alloc {
u8 bwg_id; /* Bandwidth Group (BWG) ID */
u8 bwg_percent; /* % of BWG's bandwidth */
u8 link_percent; /* % of link bandwidth */
u8 up_to_tc_bitmap; /* User Priority to Traffic Class mapping */
u16 data_credits_refill; /* Credit refill amount in 64B granularity */
u16 data_credits_max; /* Max credits for a configured packet buffer
* in 64B granularity.*/
enum strict_prio_type prio_type; /* Link or Group Strict Priority */
};
/*
 * Per-traffic-class priority flow control setting.
 * In the 82598 PFC setup only pfc_enabled_tx and pfc_enabled_full turn on
 * the XON-enable and flow-control-enable bits of the threshold registers.
 */
enum dcb_pfc_type {
pfc_disabled = 0,
pfc_enabled_full,
pfc_enabled_tx,
pfc_enabled_rx
};
/*
 * Traffic class configuration.
 * path[] is indexed by DCB_TX_CONFIG / DCB_RX_CONFIG.
 */
struct tc_configuration {
struct tc_bw_alloc path[2]; /* One each for Tx/Rx */
enum dcb_pfc_type dcb_pfc; /* Class based flow control setting */
u16 desc_credits_max; /* For Tx Descriptor arbitration */
u8 tc; /* Traffic class (TC) */
};
/* Rx packet buffer layout selected when the packet buffers are configured */
enum dcb_rx_pba_cfg {
pba_equal, /* PBA[0-7] each use 64KB FIFO */
pba_80_48 /* PBA[0-3] each use 80KB, PBA[4-7] each use 48KB */
};
/*
 * Traffic class counts.
 * NOTE(review): not referenced by the DCB code visible here; presumably the
 * number of TCs for priority groups and for PFC - confirm against the users.
 */
struct dcb_num_tcs {
u8 pg_tcs;
u8 pfc_tcs;
};
/*
 * Complete DCB configuration handed to the rule checker, the credit
 * calculation and the hardware setup functions.
 */
struct ixgbe_dcb_config {
struct tc_configuration tc_config[MAX_TRAFFIC_CLASS];
struct dcb_support support;
struct dcb_num_tcs num_tcs;
u8 bw_percentage[2][MAX_BW_GROUP]; /* One each for Tx/Rx */
bool pfc_mode_enable; /* PFC setup is skipped on 82598 when false */
bool round_robin_enable; /* when false, 82598 Tx desc arbiter uses DFP + recycle mode */
enum dcb_rx_pba_cfg rx_pba_cfg;
u32 dcb_cfg_version; /* Not used...OS-specific? */
u32 link_speed; /* For bandwidth allocation validation purpose */
};
/*
 * DCB driver APIs
 *
 * The functions taking a struct ixgbe_hw dispatch on hw->mac.type to the
 * 82598 or 82599 implementation and return 0 for any other MAC type.
 */
/* DCB rule checking function.*/
s32 ixgbe_dcb_check_config(struct ixgbe_dcb_config *config);
/* DCB credits calculation; direction is DCB_TX_CONFIG or DCB_RX_CONFIG */
s32 ixgbe_dcb_calculate_tc_credits(struct ixgbe_dcb_config *config,
u8 direction);
/* DCB PFC functions */
s32 ixgbe_dcb_config_pfc(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_get_pfc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats,
u8 tc_count);
/* DCB traffic class stats */
s32 ixgbe_dcb_config_tc_stats(struct ixgbe_hw *);
s32 ixgbe_dcb_get_tc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats,
u8 tc_count);
/* DCB config arbiters */
s32 ixgbe_dcb_config_tx_desc_arbiter(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_config_tx_data_arbiter(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_config_rx_arbiter(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
/* DCB hw initialization */
s32 ixgbe_dcb_hw_config(struct ixgbe_hw *hw, struct ixgbe_dcb_config *config);
/*
 * DCB definitions for credit calculation.
 * Credits are counted in units of 64 bytes.
 */
#define MAX_CREDIT_REFILL 511 /* 0x1FF * 64B = 32704B */
#define MINIMUM_CREDIT_REFILL 5 /* 5*64B = 320B */
#define MINIMUM_CREDIT_FOR_JUMBO 145 /* 145 = UpperBound((9*1024+54)/64B)
				      * for 9KB jumbo frame */
/*
 * Parenthesized so the macro expands as a single value in any expression
 * (e.g. x / DCB_MAX_TSO_SIZE).
 */
#define DCB_MAX_TSO_SIZE (32 * 1024) /* MAX TSO packet size supported
				      * in DCB mode */
#define MINIMUM_CREDIT_FOR_TSO ((DCB_MAX_TSO_SIZE / 64) + 1) /* 513 for 32KB TSO
							      * packet */
#define MAX_CREDIT 4095 /* Maximum credit supported:
			 * 0xFFF credits of 64B, just under 256KB */
#endif /* _DCB_CONFIG_H */

View File

@ -0,0 +1,408 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include "ixgbe_type.h"
#include "ixgbe_dcb.h"
#include "ixgbe_dcb_82598.h"
/**
 * ixgbe_dcb_get_tc_stats_82598 - Accumulate per-traffic-class queue counters
 * @hw: pointer to hardware structure
 * @stats: pointer to statistics structure
 * @tc_count: number of traffic classes to read (at most MAX_TRAFFIC_CLASS)
 *
 * For every traffic class below @tc_count the QPTC, QBTC, QPRC and QBRC
 * registers are read, in that order, and added onto the matching @stats
 * arrays.
 *
 * Returns 0 on success, DCB_ERR_PARAM when @tc_count exceeds
 * MAX_TRAFFIC_CLASS.
 */
s32 ixgbe_dcb_get_tc_stats_82598(struct ixgbe_hw *hw,
				 struct ixgbe_hw_stats *stats,
				 u8 tc_count)
{
	int idx = 0;

	if (tc_count > MAX_TRAFFIC_CLASS)
		return DCB_ERR_PARAM;

	while (idx < tc_count) {
		/* Tx packets, then Tx bytes */
		stats->qptc[idx] += IXGBE_READ_REG(hw, IXGBE_QPTC(idx));
		stats->qbtc[idx] += IXGBE_READ_REG(hw, IXGBE_QBTC(idx));
		/* Rx packets, then Rx bytes */
		stats->qprc[idx] += IXGBE_READ_REG(hw, IXGBE_QPRC(idx));
		stats->qbrc[idx] += IXGBE_READ_REG(hw, IXGBE_QBRC(idx));
		idx++;
	}

	return 0;
}
/**
 * ixgbe_dcb_get_pfc_stats_82598 - Accumulate per-traffic-class XOFF counters
 * @hw: pointer to hardware structure
 * @stats: pointer to statistics structure
 * @tc_count: number of traffic classes to read (at most MAX_TRAFFIC_CLASS)
 *
 * For every traffic class below @tc_count the PXOFFTXC and PXOFFRXC
 * registers are read and added onto stats->pxofftxc[] and stats->pxoffrxc[].
 *
 * Returns 0 on success, DCB_ERR_PARAM when @tc_count exceeds
 * MAX_TRAFFIC_CLASS.
 */
s32 ixgbe_dcb_get_pfc_stats_82598(struct ixgbe_hw *hw,
				  struct ixgbe_hw_stats *stats,
				  u8 tc_count)
{
	int idx = 0;

	if (tc_count > MAX_TRAFFIC_CLASS)
		return DCB_ERR_PARAM;

	while (idx < tc_count) {
		/* XOFF frames sent, then XOFF frames received */
		stats->pxofftxc[idx] += IXGBE_READ_REG(hw, IXGBE_PXOFFTXC(idx));
		stats->pxoffrxc[idx] += IXGBE_READ_REG(hw, IXGBE_PXOFFRXC(idx));
		idx++;
	}

	return 0;
}
/**
 * ixgbe_dcb_config_packet_buffers_82598 - Size the Rx/Tx packet buffers
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Rx buffers are either all 64KB, or (for pba_80_48) 80KB for the first
 * four and 48KB for the rest.  Every Tx buffer is set to 40KB.
 * Always returns 0.
 */
s32 ixgbe_dcb_config_packet_buffers_82598(struct ixgbe_hw *hw,
					  struct ixgbe_dcb_config *dcb_config)
{
	u32 rx_size = IXGBE_RXPBSIZE_64KB;
	u8 pb = 0;

	if (dcb_config->rx_pba_cfg == pba_80_48) {
		/* First four buffers get the large size */
		rx_size = IXGBE_RXPBSIZE_80KB;
		while (pb < 4) {
			IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(pb), rx_size);
			pb++;
		}
		/* pb is deliberately left at 4 so the loop below finishes */
		rx_size = IXGBE_RXPBSIZE_48KB;
	}

	/* Remaining (or all) Rx buffers */
	while (pb < IXGBE_MAX_PACKET_BUFFERS) {
		IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(pb), rx_size);
		pb++;
	}

	/* Tx buffers all share one size */
	for (pb = 0; pb < IXGBE_MAX_PACKET_BUFFERS; pb++) {
		IXGBE_WRITE_REG(hw, IXGBE_TXPBSIZE(pb),
				IXGBE_TXPBSIZE_40KB);
	}

	return 0;
}
/**
* ixgbe_dcb_config_rx_arbiter_82598 - Config Rx data arbiter
* @hw: pointer to hardware structure
* @dcb_config: pointer to ixgbe_dcb_config structure
*
* Configure Rx Data Arbiter and credits for each traffic class.
*/
s32 ixgbe_dcb_config_rx_arbiter_82598(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config)
{
struct tc_bw_alloc *p;
u32 reg = 0;
u32 credit_refill = 0;
u32 credit_max = 0;
u8 i = 0;
reg = IXGBE_READ_REG(hw, IXGBE_RUPPBMR) | IXGBE_RUPPBMR_MQA;
IXGBE_WRITE_REG(hw, IXGBE_RUPPBMR, reg);
reg = IXGBE_READ_REG(hw, IXGBE_RMCS);
/* Enable Arbiter */
reg &= ~IXGBE_RMCS_ARBDIS;
/* Enable Receive Recycle within the BWG */
reg |= IXGBE_RMCS_RRM;
/* Enable Deficit Fixed Priority arbitration*/
reg |= IXGBE_RMCS_DFP;
IXGBE_WRITE_REG(hw, IXGBE_RMCS, reg);
/* Configure traffic class credits and priority */
for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
p = &dcb_config->tc_config[i].path[DCB_RX_CONFIG];
credit_refill = p->data_credits_refill;
credit_max = p->data_credits_max;
reg = credit_refill | (credit_max << IXGBE_RT2CR_MCL_SHIFT);
if (p->prio_type == prio_link)
reg |= IXGBE_RT2CR_LSP;
IXGBE_WRITE_REG(hw, IXGBE_RT2CR(i), reg);
}
reg = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
reg |= IXGBE_RDRXCTL_RDMTS_1_2;
reg |= IXGBE_RDRXCTL_MPBEN;
reg |= IXGBE_RDRXCTL_MCEN;
IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg);
reg = IXGBE_READ_REG(hw, IXGBE_RXCTRL);
/* Make sure there is enough descriptors before arbitration */
reg &= ~IXGBE_RXCTRL_DMBYPS;
IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, reg);
return 0;
}
/**
 * ixgbe_dcb_config_tx_desc_arbiter_82598 - Program the Tx descriptor arbiter
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Enables the arbiter in DPMCS (with DFP and recycle mode unless round
 * robin was requested), then writes max credits, refill credits, bandwidth
 * group and strict-priority flags for every traffic class.
 * Always returns 0.
 */
s32 ixgbe_dcb_config_tx_desc_arbiter_82598(struct ixgbe_hw *hw,
					   struct ixgbe_dcb_config *dcb_config)
{
	struct tc_bw_alloc *tx;
	u32 val, desc_max;
	u8 tc;

	val = IXGBE_READ_REG(hw, IXGBE_DPMCS);

	/* Clearing the disable bit turns the arbiter on */
	val &= ~IXGBE_DPMCS_ARBDIS;
	/* DFP and Recycle mode, unless round robin was requested */
	if (!(dcb_config->round_robin_enable))
		val |= (IXGBE_DPMCS_TDPAC | IXGBE_DPMCS_TRM);
	val |= IXGBE_DPMCS_TSOEF;
	/* Configure Max TSO packet size 34KB including payload and headers */
	val |= (0x4 << IXGBE_DPMCS_MTSOS_SHIFT);

	IXGBE_WRITE_REG(hw, IXGBE_DPMCS, val);

	/* Per traffic class credits and priority */
	for (tc = 0; tc < MAX_TRAFFIC_CLASS; tc++) {
		tx = &dcb_config->tc_config[tc].path[DCB_TX_CONFIG];
		desc_max = dcb_config->tc_config[tc].desc_credits_max;

		val = desc_max << IXGBE_TDTQ2TCCR_MCL_SHIFT;
		val |= tx->data_credits_refill;
		val |= (u32)(tx->bwg_id) << IXGBE_TDTQ2TCCR_BWG_SHIFT;

		switch (tx->prio_type) {
		case prio_group:
			val |= IXGBE_TDTQ2TCCR_GSP;
			break;
		case prio_link:
			val |= IXGBE_TDTQ2TCCR_LSP;
			break;
		default:
			break;
		}

		IXGBE_WRITE_REG(hw, IXGBE_TDTQ2TCCR(tc), val);
	}

	return 0;
}
/**
 * ixgbe_dcb_config_tx_data_arbiter_82598 - Program the Tx data arbiter
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Enables the data plane arbiter with DFP and transmit recycle mode, writes
 * credits, bandwidth group and strict-priority flags for every traffic
 * class, then enables Tx packet buffer division.  Always returns 0.
 */
s32 ixgbe_dcb_config_tx_data_arbiter_82598(struct ixgbe_hw *hw,
					   struct ixgbe_dcb_config *dcb_config)
{
	struct tc_bw_alloc *tx;
	u32 val;
	u8 tc;

	/* Arbiter on (disable bit cleared), DFP + Transmit Recycle Mode */
	val = IXGBE_READ_REG(hw, IXGBE_PDPMCS);
	val = (val & ~IXGBE_PDPMCS_ARBDIS) |
	      (IXGBE_PDPMCS_TPPAC | IXGBE_PDPMCS_TRM);
	IXGBE_WRITE_REG(hw, IXGBE_PDPMCS, val);

	/* Per traffic class credits and priority */
	for (tc = 0; tc < MAX_TRAFFIC_CLASS; tc++) {
		tx = &dcb_config->tc_config[tc].path[DCB_TX_CONFIG];

		val = tx->data_credits_refill;
		val |= (u32)(tx->data_credits_max) << IXGBE_TDPT2TCCR_MCL_SHIFT;
		val |= (u32)(tx->bwg_id) << IXGBE_TDPT2TCCR_BWG_SHIFT;

		switch (tx->prio_type) {
		case prio_group:
			val |= IXGBE_TDPT2TCCR_GSP;
			break;
		case prio_link:
			val |= IXGBE_TDPT2TCCR_LSP;
			break;
		default:
			break;
		}

		IXGBE_WRITE_REG(hw, IXGBE_TDPT2TCCR(tc), val);
	}

	/* Enable Tx packet buffer division */
	val = IXGBE_READ_REG(hw, IXGBE_DTXCTL);
	val |= IXGBE_DTXCTL_ENDBUBD;
	IXGBE_WRITE_REG(hw, IXGBE_DTXCTL, val);

	return 0;
}
/**
 * ixgbe_dcb_config_pfc_82598 - Program priority flow control
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Does nothing when dcb_config->pfc_mode_enable is false.  Otherwise
 * switches Tx and Rx flow control to priority mode, writes the low/high
 * thresholds of every traffic class (derived from its Rx packet buffer
 * size), and sets the pause time and refresh threshold.
 * Always returns 0.
 */
s32 ixgbe_dcb_config_pfc_82598(struct ixgbe_hw *hw,
			       struct ixgbe_dcb_config *dcb_config)
{
	u32 val, pba_bytes;
	u8 tc;

	if (!dcb_config->pfc_mode_enable)
		return 0;

	/*
	 * Transmit side: drop 802.3x mode and select priority mode, which
	 * also corrects the reporting of our flow control status.
	 */
	val = IXGBE_READ_REG(hw, IXGBE_RMCS);
	val = (val & ~IXGBE_RMCS_TFCE_802_3X) | IXGBE_RMCS_TFCE_PRIORITY;
	IXGBE_WRITE_REG(hw, IXGBE_RMCS, val);

	/* Receive side: same switch from link to priority flow control */
	val = IXGBE_READ_REG(hw, IXGBE_FCTRL);
	val = (val & ~IXGBE_FCTRL_RFCE) | IXGBE_FCTRL_RPFCE;
	IXGBE_WRITE_REG(hw, IXGBE_FCTRL, val);

	/* Thresholds and enable bits for each traffic class */
	for (tc = 0; tc < MAX_TRAFFIC_CLASS; tc++) {
		int tx_pfc =
			dcb_config->tc_config[tc].dcb_pfc == pfc_enabled_tx ||
			dcb_config->tc_config[tc].dcb_pfc == pfc_enabled_full;

		/* Rx buffer size of this TC under the selected layout */
		if (dcb_config->rx_pba_cfg == pba_equal)
			pba_bytes = IXGBE_RXPBSIZE_64KB;
		else if (tc < 4)
			pba_bytes = IXGBE_RXPBSIZE_80KB;
		else
			pba_bytes = IXGBE_RXPBSIZE_48KB;

		/* Low threshold */
		val = ((pba_bytes >> 5) & 0xFFF0);
		if (tx_pfc)
			val |= IXGBE_FCRTL_XONE;
		IXGBE_WRITE_REG(hw, IXGBE_FCRTL(tc), val);

		/* High threshold */
		val = ((pba_bytes >> 2) & 0xFFF0);
		if (tx_pfc)
			val |= IXGBE_FCRTH_FCEN;
		IXGBE_WRITE_REG(hw, IXGBE_FCRTH(tc), val);
	}

	/* Pause time: one register per pair of traffic classes */
	for (tc = 0; tc < (MAX_TRAFFIC_CLASS >> 1); tc++)
		IXGBE_WRITE_REG(hw, IXGBE_FCTTV(tc), 0x68006800);

	/* Flow control refresh threshold value */
	IXGBE_WRITE_REG(hw, IXGBE_FCRTV, 0x3400);

	return 0;
}
/**
 * ixgbe_dcb_config_tc_stats_82598 - Map queues onto per-TC statistics
 * @hw: pointer to hardware structure
 *
 * Configure queue statistics registers so that all queues belonging to the
 * same traffic class use a single set of queue statistics counters.
 * Traffic class N is OR-ed into RQSMR registers 2N and 2N+1 and into TQSMR
 * register N, replicated into each byte lane.  Always returns 0.
 */
s32 ixgbe_dcb_config_tc_stats_82598(struct ixgbe_hw *hw)
{
	u32 val;
	u8 tc;
	u8 pair;

	/* Receive queues: two RQSMR registers per traffic class */
	for (tc = 0; tc < 8; tc++) {
		pair = tc * 2;

		val = IXGBE_READ_REG(hw, IXGBE_RQSMR(pair));
		val |= ((0x1010101) * tc);
		IXGBE_WRITE_REG(hw, IXGBE_RQSMR(pair), val);

		val = IXGBE_READ_REG(hw, IXGBE_RQSMR(pair + 1));
		val |= ((0x1010101) * tc);
		IXGBE_WRITE_REG(hw, IXGBE_RQSMR(pair + 1), val);
	}

	/* Transmit queues: one TQSMR register per traffic class */
	for (tc = 0; tc < 8; tc++) {
		val = IXGBE_READ_REG(hw, IXGBE_TQSMR(tc));
		val |= ((0x1010101) * tc);
		IXGBE_WRITE_REG(hw, IXGBE_TQSMR(tc), val);
	}

	return 0;
}
/**
 * ixgbe_dcb_hw_config_82598 - Apply the full DCB configuration
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Runs the 82598 setup steps in order: packet buffers, Rx arbiter, Tx
 * descriptor arbiter, Tx data arbiter, priority flow control and traffic
 * class statistics.  Always returns 0.
 */
s32 ixgbe_dcb_hw_config_82598(struct ixgbe_hw *hw,
			      struct ixgbe_dcb_config *dcb_config)
{
	/* Each step returns 0 unconditionally, so results are not checked */
	(void)ixgbe_dcb_config_packet_buffers_82598(hw, dcb_config);
	(void)ixgbe_dcb_config_rx_arbiter_82598(hw, dcb_config);
	(void)ixgbe_dcb_config_tx_desc_arbiter_82598(hw, dcb_config);
	(void)ixgbe_dcb_config_tx_data_arbiter_82598(hw, dcb_config);
	(void)ixgbe_dcb_config_pfc_82598(hw, dcb_config);
	(void)ixgbe_dcb_config_tc_stats_82598(hw);

	return 0;
}

View File

@ -0,0 +1,99 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#ifndef _DCB_82598_CONFIG_H_
#define _DCB_82598_CONFIG_H_
/*
 * DCB register definitions
 *
 * Bit masks and field shifts used by the 82598 DCB setup code.
 */
/* DPMCS: field written with the max TSO size when the Tx desc arbiter is set up */
#define IXGBE_DPMCS_MTSOS_SHIFT 16
#define IXGBE_DPMCS_TDPAC 0x00000001 /* 0 Round Robin,
* 1 DFP - Deficit Fixed Priority */
#define IXGBE_DPMCS_TRM 0x00000010 /* Transmit Recycle Mode */
#define IXGBE_DPMCS_ARBDIS 0x00000040 /* DCB arbiter disable */
#define IXGBE_DPMCS_TSOEF 0x00080000 /* TSO Expand Factor: 0=x4, 1=x2 */
#define IXGBE_RUPPBMR_MQA 0x80000000 /* Enable UP to queue mapping */
#define IXGBE_RT2CR_MCL_SHIFT 12 /* Offset to Max Credit Limit setting */
#define IXGBE_RT2CR_LSP 0x80000000 /* LSP enable bit */
#define IXGBE_RDRXCTL_MPBEN 0x00000010 /* DMA config for multiple packet
* buffers enable */
#define IXGBE_RDRXCTL_MCEN 0x00000040 /* DMA config for multiple cores
* (RSS) enable */
/* TDTQ2TCCR (Tx descriptor arbiter, per TC): max credits, BWG id, GSP/LSP */
#define IXGBE_TDTQ2TCCR_MCL_SHIFT 12
#define IXGBE_TDTQ2TCCR_BWG_SHIFT 9
#define IXGBE_TDTQ2TCCR_GSP 0x40000000
#define IXGBE_TDTQ2TCCR_LSP 0x80000000
/* TDPT2TCCR (Tx data arbiter, per TC): max credits, BWG id, GSP/LSP */
#define IXGBE_TDPT2TCCR_MCL_SHIFT 12
#define IXGBE_TDPT2TCCR_BWG_SHIFT 9
#define IXGBE_TDPT2TCCR_GSP 0x40000000
#define IXGBE_TDPT2TCCR_LSP 0x80000000
#define IXGBE_PDPMCS_TPPAC 0x00000020 /* 0 Round Robin,
* 1 DFP - Deficit Fixed Priority */
#define IXGBE_PDPMCS_ARBDIS 0x00000040 /* Arbiter disable */
#define IXGBE_PDPMCS_TRM 0x00000100 /* Transmit Recycle Mode enable */
#define IXGBE_DTXCTL_ENDBUBD 0x00000004 /* Enable DBU buffer division */
/* Packet buffer sizes in bytes, as written to the TXPBSIZE/RXPBSIZE registers */
#define IXGBE_TXPBSIZE_40KB 0x0000A000 /* 40KB Packet Buffer */
#define IXGBE_RXPBSIZE_48KB 0x0000C000 /* 48KB Packet Buffer */
#define IXGBE_RXPBSIZE_64KB 0x00010000 /* 64KB Packet Buffer */
#define IXGBE_RXPBSIZE_80KB 0x00014000 /* 80KB Packet Buffer */
/*
 * DCB hardware-specific driver APIs
 *
 * 82598 implementations behind the MAC-independent ixgbe_dcb_* entry points.
 */
/* DCB PFC functions */
s32 ixgbe_dcb_config_pfc_82598(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
/* Returns DCB_ERR_PARAM when tc_count exceeds MAX_TRAFFIC_CLASS */
s32 ixgbe_dcb_get_pfc_stats_82598(struct ixgbe_hw *hw,
struct ixgbe_hw_stats *stats,
u8 tc_count);
/* DCB traffic class stats */
s32 ixgbe_dcb_config_tc_stats_82598(struct ixgbe_hw *hw);
/* Returns DCB_ERR_PARAM when tc_count exceeds MAX_TRAFFIC_CLASS */
s32 ixgbe_dcb_get_tc_stats_82598(struct ixgbe_hw *hw,
struct ixgbe_hw_stats *stats,
u8 tc_count);
/* DCB config arbiters */
s32 ixgbe_dcb_config_tx_desc_arbiter_82598(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_config_tx_data_arbiter_82598(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_config_rx_arbiter_82598(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
/* DCB hw initialization */
s32 ixgbe_dcb_hw_config_82598(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *config);
#endif /* _DCB_82598_CONFIG_H */

View File

@ -0,0 +1,501 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include "ixgbe_type.h"
#include "ixgbe_dcb.h"
#include "ixgbe_dcb_82599.h"
/**
 * ixgbe_dcb_get_tc_stats_82599 - Accumulate per-traffic-class queue counters
 * @hw: pointer to hardware structure
 * @stats: pointer to statistics structure
 * @tc_count: number of traffic classes to read (at most MAX_TRAFFIC_CLASS)
 *
 * For every traffic class below @tc_count the QPTC, QBTC, QPRC and QBRC
 * registers are read, in that order, and added onto the matching @stats
 * arrays.
 *
 * Returns 0 on success, DCB_ERR_PARAM when @tc_count exceeds
 * MAX_TRAFFIC_CLASS.
 */
s32 ixgbe_dcb_get_tc_stats_82599(struct ixgbe_hw *hw,
				 struct ixgbe_hw_stats *stats,
				 u8 tc_count)
{
	int idx = 0;

	if (tc_count > MAX_TRAFFIC_CLASS)
		return DCB_ERR_PARAM;

	while (idx < tc_count) {
		/* Tx packets, then Tx bytes */
		stats->qptc[idx] += IXGBE_READ_REG(hw, IXGBE_QPTC(idx));
		stats->qbtc[idx] += IXGBE_READ_REG(hw, IXGBE_QBTC(idx));
		/* Rx packets, then Rx bytes */
		stats->qprc[idx] += IXGBE_READ_REG(hw, IXGBE_QPRC(idx));
		stats->qbrc[idx] += IXGBE_READ_REG(hw, IXGBE_QBRC(idx));
		idx++;
	}

	return 0;
}
/**
 * ixgbe_dcb_get_pfc_stats_82599 - Accumulate per-traffic-class XOFF counters
 * @hw: pointer to hardware structure
 * @stats: pointer to statistics structure
 * @tc_count: number of traffic classes to read (at most MAX_TRAFFIC_CLASS)
 *
 * For every traffic class below @tc_count the PXOFFTXC and PXOFFRXCNT
 * registers are read and added onto stats->pxofftxc[] and stats->pxoffrxc[].
 *
 * Returns 0 on success, DCB_ERR_PARAM when @tc_count exceeds
 * MAX_TRAFFIC_CLASS.
 */
s32 ixgbe_dcb_get_pfc_stats_82599(struct ixgbe_hw *hw,
				  struct ixgbe_hw_stats *stats,
				  u8 tc_count)
{
	int idx = 0;

	if (tc_count > MAX_TRAFFIC_CLASS)
		return DCB_ERR_PARAM;

	while (idx < tc_count) {
		/* XOFF frames sent, then XOFF frames received */
		stats->pxofftxc[idx] += IXGBE_READ_REG(hw, IXGBE_PXOFFTXC(idx));
		stats->pxoffrxc[idx] += IXGBE_READ_REG(hw, IXGBE_PXOFFRXCNT(idx));
		idx++;
	}

	return 0;
}
/**
 * ixgbe_dcb_config_packet_buffers_82599 - Size the Rx/Tx packet buffers
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Rx buffers are either all 64KB, or (for pba_80_48) 80KB for the first
 * four and 48KB for the rest.  Every Tx buffer is set to 20KB together with
 * its DCB threshold.  Always returns 0.
 */
s32 ixgbe_dcb_config_packet_buffers_82599(struct ixgbe_hw *hw,
					  struct ixgbe_dcb_config *dcb_config)
{
	u32 rx_size = IXGBE_RXPBSIZE_64KB;
	u8 pb = 0;

	if (dcb_config->rx_pba_cfg == pba_80_48) {
		/* First four buffers get the large size */
		rx_size = IXGBE_RXPBSIZE_80KB;
		while (pb < 4) {
			IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(pb), rx_size);
			pb++;
		}
		/* pb is deliberately left at 4 so the loop below finishes */
		rx_size = IXGBE_RXPBSIZE_48KB;
	}

	/* Remaining (or all) Rx buffers */
	while (pb < IXGBE_MAX_PACKET_BUFFERS) {
		IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(pb), rx_size);
		pb++;
	}

	/* Tx buffers: size first, then threshold, for each buffer in turn */
	for (pb = 0; pb < IXGBE_MAX_PACKET_BUFFERS; pb++) {
		IXGBE_WRITE_REG(hw, IXGBE_TXPBSIZE(pb),
				IXGBE_TXPBSIZE_20KB);
		IXGBE_WRITE_REG(hw, IXGBE_TXPBTHRESH(pb),
				IXGBE_TXPBTHRESH_DCB);
	}

	return 0;
}
/**
* ixgbe_dcb_config_rx_arbiter_82599 - Config Rx Data arbiter
* @hw: pointer to hardware structure
* @dcb_config: pointer to ixgbe_dcb_config structure
*
* Configure Rx Packet Arbiter and credits for each traffic class.
*/
s32 ixgbe_dcb_config_rx_arbiter_82599(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config)
{
struct tc_bw_alloc *p;
u32 reg = 0;
u32 credit_refill = 0;
u32 credit_max = 0;
u8 i = 0;
/*
* Disable the arbiter before changing parameters
* (always enable recycle mode; WSP)
*/
reg = IXGBE_RTRPCS_RRM | IXGBE_RTRPCS_RAC | IXGBE_RTRPCS_ARBDIS;
IXGBE_WRITE_REG(hw, IXGBE_RTRPCS, reg);
/* Map all traffic classes to their UP, 1 to 1 */
reg = 0;
for (i = 0; i < MAX_TRAFFIC_CLASS; i++)
reg |= (i << (i * IXGBE_RTRUP2TC_UP_SHIFT));
IXGBE_WRITE_REG(hw, IXGBE_RTRUP2TC, reg);
/* Configure traffic class credits and priority */
for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
p = &dcb_config->tc_config[i].path[DCB_RX_CONFIG];
credit_refill = p->data_credits_refill;
credit_max = p->data_credits_max;
reg = credit_refill | (credit_max << IXGBE_RTRPT4C_MCL_SHIFT);
reg |= (u32)(p->bwg_id) << IXGBE_RTRPT4C_BWG_SHIFT;
if (p->prio_type == prio_link)
reg |= IXGBE_RTRPT4C_LSP;
IXGBE_WRITE_REG(hw, IXGBE_RTRPT4C(i), reg);
}
/*
* Configure Rx packet plane (recycle mode; WSP) and
* enable arbiter
*/
reg = IXGBE_RTRPCS_RRM | IXGBE_RTRPCS_RAC;
IXGBE_WRITE_REG(hw, IXGBE_RTRPCS, reg);
return 0;
}
/**
 * ixgbe_dcb_config_tx_desc_arbiter_82599 - Program the Tx descriptor arbiter
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Clears the per-queue credits of all 128 Tx queues, writes max credits,
 * refill credits, bandwidth group and strict-priority flags for every
 * traffic class, then enables the descriptor plane arbiter.
 * Always returns 0.
 */
s32 ixgbe_dcb_config_tx_desc_arbiter_82599(struct ixgbe_hw *hw,
					   struct ixgbe_dcb_config *dcb_config)
{
	struct tc_bw_alloc *tx;
	u32 val, desc_max;
	u8 idx;

	/* Per-queue credits are unused (per-TC credits apply); zero them */
	for (idx = 0; idx < 128; idx++) {
		IXGBE_WRITE_REG(hw, IXGBE_RTTDQSEL, idx);
		IXGBE_WRITE_REG(hw, IXGBE_RTTDT1C, 0);
	}

	/* Per traffic class credits and priority */
	for (idx = 0; idx < MAX_TRAFFIC_CLASS; idx++) {
		tx = &dcb_config->tc_config[idx].path[DCB_TX_CONFIG];
		desc_max = dcb_config->tc_config[idx].desc_credits_max;

		val = desc_max << IXGBE_RTTDT2C_MCL_SHIFT;
		val |= tx->data_credits_refill;
		val |= (u32)(tx->bwg_id) << IXGBE_RTTDT2C_BWG_SHIFT;

		switch (tx->prio_type) {
		case prio_group:
			val |= IXGBE_RTTDT2C_GSP;
			break;
		case prio_link:
			val |= IXGBE_RTTDT2C_LSP;
			break;
		default:
			break;
		}

		IXGBE_WRITE_REG(hw, IXGBE_RTTDT2C(idx), val);
	}

	/* Descriptor plane in recycle mode with WSP; this enables the arbiter */
	val = IXGBE_RTTDCS_TDPAC | IXGBE_RTTDCS_TDRM;
	IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, val);

	return 0;
}
/**
 * ixgbe_dcb_config_tx_data_arbiter_82599 - Program the Tx packet arbiter
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Writes the Tx UP-to-TC map and one credit register per traffic class
 * while the arbiter is disabled, then writes the control register again
 * without the disable bit.  Always returns 0.
 */
s32 ixgbe_dcb_config_tx_data_arbiter_82599(struct ixgbe_hw *hw,
					   struct ixgbe_dcb_config *dcb_config)
{
	/* Control bits common to the "disabled" and "enabled" writes */
	const u32 plane_cfg = IXGBE_RTTPCS_TPPAC | IXGBE_RTTPCS_TPRM |
			      (IXGBE_RTTPCS_ARBD_DCB << IXGBE_RTTPCS_ARBD_SHIFT);
	u32 up2tc = 0;
	u8 tc;

	/* Keep the arbiter disabled while its parameters are rewritten */
	IXGBE_WRITE_REG(hw, IXGBE_RTTPCS, plane_cfg | IXGBE_RTTPCS_ARBDIS);

	/* Identity map: user priority n is assigned to traffic class n */
	for (tc = 0; tc < MAX_TRAFFIC_CLASS; tc++)
		up2tc |= (tc << (tc * IXGBE_RTTUP2TC_UP_SHIFT));
	IXGBE_WRITE_REG(hw, IXGBE_RTTUP2TC, up2tc);

	/* Refill credits, max credits, bandwidth group and priority flags */
	for (tc = 0; tc < MAX_TRAFFIC_CLASS; tc++) {
		struct tc_bw_alloc *tx =
			&dcb_config->tc_config[tc].path[DCB_TX_CONFIG];
		u32 val = tx->data_credits_refill;

		val |= (u32)(tx->data_credits_max) << IXGBE_RTTPT2C_MCL_SHIFT;
		val |= (u32)(tx->bwg_id) << IXGBE_RTTPT2C_BWG_SHIFT;

		if (tx->prio_type == prio_group)
			val |= IXGBE_RTTPT2C_GSP;

		if (tx->prio_type == prio_link)
			val |= IXGBE_RTTPT2C_LSP;

		IXGBE_WRITE_REG(hw, IXGBE_RTTPT2C(tc), val);
	}

	/* Same control bits without ARBDIS: arbiter runs again */
	IXGBE_WRITE_REG(hw, IXGBE_RTTPCS, plane_cfg);

	return 0;
}
/**
* ixgbe_dcb_config_pfc_82599 - Configure priority flow control
* @hw: pointer to hardware structure
* @dcb_config: pointer to ixgbe_dcb_config structure
*
* Configure Priority Flow Control (PFC) for each traffic class.
*/
s32 ixgbe_dcb_config_pfc_82599(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config)
{
u32 i, reg, rx_pba_size;
/* If PFC is disabled globally then fall back to LFC. */
if (!dcb_config->pfc_mode_enable) {
for (i = 0; i < MAX_TRAFFIC_CLASS; i++)
hw->mac.ops.fc_enable(hw, i);
goto out;
}
/* Configure PFC Tx thresholds per TC */
for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
if (dcb_config->rx_pba_cfg == pba_equal)
rx_pba_size = IXGBE_RXPBSIZE_64KB;
else
rx_pba_size = (i < 4) ? IXGBE_RXPBSIZE_80KB
: IXGBE_RXPBSIZE_48KB;
reg = ((rx_pba_size >> 5) & 0xFFE0);
if (dcb_config->tc_config[i].dcb_pfc == pfc_enabled_full ||
dcb_config->tc_config[i].dcb_pfc == pfc_enabled_tx)
reg |= IXGBE_FCRTL_XONE;
IXGBE_WRITE_REG(hw, IXGBE_FCRTL_82599(i), reg);
reg = ((rx_pba_size >> 2) & 0xFFE0);
if (dcb_config->tc_config[i].dcb_pfc == pfc_enabled_full ||
dcb_config->tc_config[i].dcb_pfc == pfc_enabled_tx)
reg |= IXGBE_FCRTH_FCEN;
IXGBE_WRITE_REG(hw, IXGBE_FCRTH_82599(i), reg);
}
/* Configure pause time (2 TCs per register) */
reg = hw->fc.pause_time | (hw->fc.pause_time << 16);
for (i = 0; i < (MAX_TRAFFIC_CLASS / 2); i++)
IXGBE_WRITE_REG(hw, IXGBE_FCTTV(i), reg);
/* Configure flow control refresh threshold value */
IXGBE_WRITE_REG(hw, IXGBE_FCRTV, hw->fc.pause_time / 2);
/* Enable Transmit PFC */
reg = IXGBE_FCCFG_TFCE_PRIORITY;
IXGBE_WRITE_REG(hw, IXGBE_FCCFG, reg);
/*
* Enable Receive PFC
* We will always honor XOFF frames we receive when
* we are in PFC mode.
*/
reg = IXGBE_READ_REG(hw, IXGBE_MFLCN);
reg &= ~IXGBE_MFLCN_RFCE;
reg |= IXGBE_MFLCN_RPFCE;
IXGBE_WRITE_REG(hw, IXGBE_MFLCN, reg);
out:
return 0;
}
/**
 * ixgbe_dcb_config_tc_stats_82599 - Map queues to per-TC statistics counters
 * @hw: pointer to hardware structure
 *
 * Programs the 32 RQSMR and 32 TQSM registers so that every queue of
 * traffic class 'n' is counted in statistics set 'n'.  Always returns 0.
 */
s32 ixgbe_dcb_config_tc_stats_82599(struct ixgbe_hw *hw)
{
	/*
	 * tqsm_tc_end[n] is the first TQSM index that no longer belongs to
	 * TC n.  Each register covers 4 queues, so TCs own 8, 8, 4, 4, 2, 2,
	 * 2, 2 registers (32, 32, 16, 16, 8, 8, 8, 8 Tx queues).
	 */
	static const u8 tqsm_tc_end[] = { 8, 16, 20, 24, 26, 28, 30, 32 };
	u32 val = 0;
	u8 idx = 0;
	u8 tc = 0;

	/*
	 * Rx: 4 consecutive registers (16 queues) per TC; the TC number is
	 * replicated into all four byte lanes of the register.
	 */
	for (idx = 0; idx < 32; idx++) {
		val = 0x01010101 * (idx / 4);
		IXGBE_WRITE_REG(hw, IXGBE_RQSMR(idx), val);
	}

	/* Tx: non-uniform split, walk the boundary table as idx advances */
	for (idx = 0; idx < 32; idx++) {
		while (idx >= tqsm_tc_end[tc])
			tc++;
		val = 0x01010101 * tc;
		IXGBE_WRITE_REG(hw, IXGBE_TQSM(idx), val);
	}

	return 0;
}
/**
 * ixgbe_dcb_config_82599 - Configure general DCB parameters
 * @hw: pointer to hardware structure
 *
 * Switches MRQC and MTQC to their 8-traffic-class settings and clears the
 * drop-enable setting for queue indices 0..127.  The Tx descriptor arbiter
 * is disabled for the duration of the update.  Always returns 0.
 */
s32 ixgbe_dcb_config_82599(struct ixgbe_hw *hw)
{
	u32 val;
	u32 mrqe;
	u32 queue;

	/* Disable the Tx desc arbiter so that MTQC can be changed */
	val = IXGBE_READ_REG(hw, IXGBE_RTTDCS);
	val |= IXGBE_RTTDCS_ARBDIS;
	IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, val);

	/*
	 * Enable DCB for Rx with 8 TCs.  The two RSS-enabled modes move to
	 * the RSS + 8 TC mode; everything else (no RSS, 4 TC without RSS,
	 * or an unrecognised/stale value) moves to plain 8 TC.
	 */
	val = IXGBE_READ_REG(hw, IXGBE_MRQC);
	mrqe = val & IXGBE_MRQC_MRQE_MASK;
	val &= ~IXGBE_MRQC_MRQE_MASK;
	if (mrqe == IXGBE_MRQC_RSSEN || mrqe == IXGBE_MRQC_RTRSS4TCEN)
		val |= IXGBE_MRQC_RTRSS8TCEN;
	else
		val |= IXGBE_MRQC_RT8TCEN;
	IXGBE_WRITE_REG(hw, IXGBE_MRQC, val);

	/* Enable DCB for Tx with 8 TCs */
	IXGBE_WRITE_REG(hw, IXGBE_MTQC,
			IXGBE_MTQC_RT_ENA | IXGBE_MTQC_8TC_8TQ);

	/* Disable drop for all queues */
	for (queue = 0; queue < 128; queue++)
		IXGBE_WRITE_REG(hw, IXGBE_QDE, queue << IXGBE_QDE_IDX_SHIFT);

	/* Enable the Tx desc arbiter */
	val = IXGBE_READ_REG(hw, IXGBE_RTTDCS);
	val &= ~IXGBE_RTTDCS_ARBDIS;
	IXGBE_WRITE_REG(hw, IXGBE_RTTDCS, val);

	return 0;
}
/**
 * ixgbe_dcb_hw_config_82599 - Configure and enable DCB
 * @hw: pointer to hardware structure
 * @dcb_config: pointer to ixgbe_dcb_config structure
 *
 * Runs every 82599 DCB configuration step in a fixed order.  The return
 * values of the individual steps are not checked; this function always
 * returns 0.
 */
s32 ixgbe_dcb_hw_config_82599(struct ixgbe_hw *hw,
			      struct ixgbe_dcb_config *dcb_config)
{
	u32 pap_val = 0;

	ixgbe_dcb_config_packet_buffers_82599(hw, dcb_config);
	ixgbe_dcb_config_82599(hw);
	ixgbe_dcb_config_rx_arbiter_82599(hw, dcb_config);
	ixgbe_dcb_config_tx_desc_arbiter_82599(hw, dcb_config);
	ixgbe_dcb_config_tx_data_arbiter_82599(hw, dcb_config);
	ixgbe_dcb_config_pfc_82599(hw, dcb_config);
	ixgbe_dcb_config_tc_stats_82599(hw);

	/* Only link_speed values 1..9 are written to the PAP register */
	if (dcb_config->link_speed <= 0 || dcb_config->link_speed > 9)
		return 0;

	/*
	 * TODO: For DCB SV purpose only,
	 * remove it before product release
	 */
	pap_val = IXGBE_READ_REG(hw, IXGBE_PAP);
	pap_val |= (dcb_config->link_speed << 16);
	IXGBE_WRITE_REG(hw, IXGBE_PAP, pap_val);

	return 0;
}

View File

@ -0,0 +1,125 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
/*
 * Register bit definitions and function prototypes used by the 82599
 * DCB (traffic class / arbiter / PFC) configuration code.
 */
#ifndef _DCB_82599_CONFIG_H_
#define _DCB_82599_CONFIG_H_
/* DCB register definitions */
/* RTTDCS Bit Masks */
#define IXGBE_RTTDCS_TDPAC 0x00000001 /* 0 Round Robin,
* 1 WSP - Weighted Strict Priority
*/
#define IXGBE_RTTDCS_VMPAC 0x00000002 /* 0 Round Robin,
* 1 WRR - Weighted Round Robin
*/
#define IXGBE_RTTDCS_TDRM 0x00000010 /* Transmit Recycle Mode */
#define IXGBE_RTTDCS_BDPM 0x00400000 /* Bypass Data Pipe - must clear! */
#define IXGBE_RTTDCS_BPBFSM 0x00800000 /* Bypass PB Free Space - must
* clear!
*/
#define IXGBE_RTTDCS_SPEED_CHG 0x80000000 /* Link speed change */
/* Receive UP2TC mapping */
#define IXGBE_RTRUP2TC_UP_SHIFT 3
/* Transmit UP2TC mapping */
#define IXGBE_RTTUP2TC_UP_SHIFT 3
/* RTRPT4C Bit Masks */
#define IXGBE_RTRPT4C_MCL_SHIFT 12 /* Offset to Max Credit Limit setting */
#define IXGBE_RTRPT4C_BWG_SHIFT 9 /* Offset to BWG index */
#define IXGBE_RTRPT4C_GSP 0x40000000 /* GSP enable bit */
#define IXGBE_RTRPT4C_LSP 0x80000000 /* LSP enable bit */
/* RDRXCTL Bit Masks */
#define IXGBE_RDRXCTL_MPBEN 0x00000010 /* DMA config for multiple packet
* buffers enable
*/
#define IXGBE_RDRXCTL_MCEN 0x00000040 /* DMA config for multiple cores
* (RSS) enable
*/
/* RTRPCS Bit Masks */
#define IXGBE_RTRPCS_RRM 0x00000002 /* Receive Recycle Mode enable */
/* Receive Arbitration Control: 0 Round Robin, 1 DFP */
#define IXGBE_RTRPCS_RAC 0x00000004
#define IXGBE_RTRPCS_ARBDIS 0x00000040 /* Arbitration disable bit */
/* RTTDT2C Bit Masks (same field layout as RTRPT4C above) */
#define IXGBE_RTTDT2C_MCL_SHIFT 12
#define IXGBE_RTTDT2C_BWG_SHIFT 9
#define IXGBE_RTTDT2C_GSP 0x40000000
#define IXGBE_RTTDT2C_LSP 0x80000000
/* RTTPT2C Bit Masks (same field layout as RTRPT4C above) */
#define IXGBE_RTTPT2C_MCL_SHIFT 12
#define IXGBE_RTTPT2C_BWG_SHIFT 9
#define IXGBE_RTTPT2C_GSP 0x40000000
#define IXGBE_RTTPT2C_LSP 0x80000000
/* RTTPCS Bit Masks */
#define IXGBE_RTTPCS_TPPAC 0x00000020 /* 0 Round Robin,
* 1 SP - Strict Priority
*/
#define IXGBE_RTTPCS_ARBDIS 0x00000040 /* Arbiter disable */
#define IXGBE_RTTPCS_TPRM 0x00000100 /* Transmit Recycle Mode enable */
#define IXGBE_RTTPCS_ARBD_SHIFT 22
#define IXGBE_RTTPCS_ARBD_DCB 0x4 /* Arbitration delay in DCB mode */
/* Tx/Rx packet buffer size register values and Tx threshold */
#define IXGBE_TXPBSIZE_20KB 0x00005000 /* 20KB Packet Buffer */
#define IXGBE_TXPBSIZE_40KB 0x0000A000 /* 40KB Packet Buffer */
#define IXGBE_RXPBSIZE_48KB 0x0000C000 /* 48KB Packet Buffer */
#define IXGBE_RXPBSIZE_64KB 0x00010000 /* 64KB Packet Buffer */
#define IXGBE_RXPBSIZE_80KB 0x00014000 /* 80KB Packet Buffer */
#define IXGBE_RXPBSIZE_128KB 0x00020000 /* 128KB Packet Buffer */
#define IXGBE_TXPBTHRESH_DCB 0xA /* THRESH value for DCB mode */
/* DCB hardware-specific driver APIs */
/* DCB PFC functions */
s32 ixgbe_dcb_config_pfc_82599(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_get_pfc_stats_82599(struct ixgbe_hw *hw,
struct ixgbe_hw_stats *stats,
u8 tc_count);
/* DCB traffic class stats */
s32 ixgbe_dcb_config_tc_stats_82599(struct ixgbe_hw *hw);
s32 ixgbe_dcb_get_tc_stats_82599(struct ixgbe_hw *hw,
struct ixgbe_hw_stats *stats,
u8 tc_count);
/* DCB config arbiters */
s32 ixgbe_dcb_config_tx_desc_arbiter_82599(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_config_tx_data_arbiter_82599(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
s32 ixgbe_dcb_config_rx_arbiter_82599(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *dcb_config);
/* DCB hw initialization; runs all of the config steps above in sequence */
s32 ixgbe_dcb_hw_config_82599(struct ixgbe_hw *hw,
struct ixgbe_dcb_config *config);
#endif /* _DCB_82599_CONFIG_H_ */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,29 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include "ixgbe.h"

View File

@ -0,0 +1,32 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#ifndef _IXGBE_FCOE_H
#define _IXGBE_FCOE_H
#endif /* _IXGBE_FCOE_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,107 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
/* glue for the OS independent part of ixgbe
* includes register access macros
*/
#ifndef _IXGBE_OSDEP_H_
#define _IXGBE_OSDEP_H_
#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/sched.h>
#include "kcompat.h"
/*
 * If msleep is not already a macro, wrap it so that a call made while
 * in_interrupt() is true hits BUG() instead of sleeping.  The msleep(x)
 * inside the macro body is not expanded again by the preprocessor, so it
 * resolves to whatever non-macro msleep is in scope (presumably the one
 * from <linux/delay.h>).
 */
#ifndef msleep
#define msleep(x) do { if(in_interrupt()) { \
/* Don't msleep in interrupt context! */ \
BUG(); \
} else { \
msleep(x); \
} } while (0)
#endif
#undef ASSERT
/*
 * hw_dbg - debug print helper
 * @hw: unused by either variant
 * @S:  printk format string
 * @A:  format arguments
 *
 * Prints at KERN_DEBUG when DBG is defined, otherwise expands to an empty
 * statement (arguments are not evaluated).
 */
#ifdef DBG
#define hw_dbg(hw, S, A...) printk(KERN_DEBUG S, ## A)
#else
#define hw_dbg(hw, S, A...) do {} while (0)
#endif
/*
 * IXGBE_WRITE_REG - write a 32-bit value to a device register
 * @a:     object exposing ->hw_addr
 * @reg:   register offset, added to (a)->hw_addr
 * @value: value passed to writel()
 *
 * With DBG defined, writes to the interrupt registers listed in the switch
 * are additionally logged with printk().  @reg and @value are captured in
 * locals first so that each argument is evaluated exactly once, matching
 * the non-DBG variant.
 */
#ifdef DBG
#define IXGBE_WRITE_REG(a, reg, value) do {\
	u32 _ixgbe_wr_reg = (reg); \
	u32 _ixgbe_wr_val = (value); \
	switch (_ixgbe_wr_reg) { \
	case IXGBE_EIMS: \
	case IXGBE_EIMC: \
	case IXGBE_EIAM: \
	case IXGBE_EIAC: \
	case IXGBE_EICR: \
	case IXGBE_EICS: \
		printk("%s: Reg - 0x%05X, value - 0x%08X\n", __FUNCTION__, \
		       _ixgbe_wr_reg, _ixgbe_wr_val); \
		break; \
	default: \
		break; \
	} \
	writel(_ixgbe_wr_val, ((a)->hw_addr + _ixgbe_wr_reg)); \
} while (0)
#else
#define IXGBE_WRITE_REG(a, reg, value) writel((value), ((a)->hw_addr + (reg)))
#endif
/* Read the 32-bit register at offset @reg from (a)->hw_addr. */
#define IXGBE_READ_REG(a, reg) readl((a)->hw_addr + (reg))
/*
 * Register-array accessors: @offset is an element index and is scaled by 4
 * (offset << 2) before being added to @reg.
 */
#define IXGBE_WRITE_REG_ARRAY(a, reg, offset, value) ( \
	writel((value), ((a)->hw_addr + (reg) + ((offset) << 2))))
#define IXGBE_READ_REG_ARRAY(a, reg, offset) ( \
	readl((a)->hw_addr + (reg) + ((offset) << 2)))
/*
 * Fallback writeq() for builds where it is not provided as a macro: two
 * writel() calls, low 32 bits at @addr first, then high 32 bits at
 * @addr + 4.  The pair is not atomic.
 *
 * Wrapped in do { } while (0) so it behaves as one statement (safe under an
 * unbraced if/else).  Arguments are parenthesised, and @val is widened to
 * u64 before the shift so a 32-bit argument does not shift by its full
 * width.  @val and @addr are each evaluated twice.
 */
#ifndef writeq
#define writeq(val, addr) do { \
	writel((u32)(val), (addr)); \
	writel((u32)((u64)(val) >> 32), ((addr) + 4)); \
} while (0)
#endif
#define IXGBE_WRITE_REG64(a, reg, value) writeq((value), ((a)->hw_addr + (reg)))
/* Reads IXGBE_STATUS and discards it (presumably to flush posted writes). */
#define IXGBE_WRITE_FLUSH(a) IXGBE_READ_REG(a, IXGBE_STATUS)
/* Forward declaration; only pointers to struct ixgbe_hw are needed here. */
struct ixgbe_hw;
/* PCI config-space word accessors; implemented outside this header. */
extern u16 ixgbe_read_pci_cfg_word(struct ixgbe_hw *hw, u32 reg);
extern void ixgbe_write_pci_cfg_word(struct ixgbe_hw *hw, u32 reg, u16 value);
/* Aliases under which the two accessors above are also reachable. */
#define IXGBE_READ_PCIE_WORD ixgbe_read_pci_cfg_word
#define IXGBE_WRITE_PCIE_WORD ixgbe_write_pci_cfg_word
/* NOTE: "ATTEMPS" is the identifier's actual spelling; do not "fix" it here. */
#define IXGBE_EEPROM_GRANT_ATTEMPS 100
/* Host-to-network byte order helpers (thin wrappers over htonl/htons). */
#define IXGBE_HTONL(_i) htonl(_i)
#define IXGBE_HTONS(_i) htons(_i)
#endif /* _IXGBE_OSDEP_H_ */

View File

@ -0,0 +1,964 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include <linux/types.h>
#include <linux/module.h>
#include "ixgbe.h"
/* This is the only thing that needs to be changed to adjust the
* maximum number of ports that the driver can manage.
*/
#define IXGBE_MAX_NIC 8
/* Sentinel / boolean values stored in the per-board parameter arrays. */
#define OPTION_UNSET -1
#define OPTION_DISABLED 0
#define OPTION_ENABLED 1
/* All parameters are treated the same, as an integer array of values.
* This macro just reduces the need to repeat the same declaration code
* over and over (plus this helps to avoid typo bugs).
*
* The range initializer covers indices 0..IXGBE_MAX_NIC inclusive, which
* matches the IXGBE_MAX_NIC+1 element arrays declared by IXGBE_PARAM().
*/
#define IXGBE_PARAM_INIT { [0 ... IXGBE_MAX_NIC] = OPTION_UNSET }
#ifndef module_param_array
/* Module Parameters are always initialized to -1, so that the driver
* can tell the difference between no user specified value or the
* user asking for the default value.
* The true default values are loaded in when ixgbe_check_options is called.
*
* This is a GCC extension to ANSI C.
* See the item "Labeled Elements in Initializers" in the section
* "Extensions to the C Language Family" of the GCC documentation.
*/
/* Variant for kernels without module_param_array: uses MODULE_PARM(). */
#define IXGBE_PARAM(X, desc) \
static const int __devinitdata X[IXGBE_MAX_NIC+1] = IXGBE_PARAM_INIT; \
MODULE_PARM(X, "1-" __MODULE_STRING(IXGBE_MAX_NIC) "i"); \
MODULE_PARM_DESC(X, desc);
#else
/* Variant using module_param_array_named(): besides the value array X it
* declares num_X, whose address is handed to module_param_array_named()
* (presumably filled with the number of values the user supplied - confirm
* against the kernel's moduleparam.h).
*/
#define IXGBE_PARAM(X, desc) \
static int __devinitdata X[IXGBE_MAX_NIC+1] = IXGBE_PARAM_INIT; \
static unsigned int num_##X; \
module_param_array_named(X, X, int, &num_##X, 0); \
MODULE_PARM_DESC(X, desc);
#endif
/* RXKernel - Linux TCP/IP stack RX
*
* Valid Range: 0,1
* - 0 - disabled
* - 1 - enabled
*
* Default Value: 0
*/
IXGBE_PARAM(RXKernel, "Disable(0) or enable(1) Linux TCP/IP stack RX, default 0");
/* Interrupt Type
*
* Valid Range: 0-2
* - 0 - Legacy Interrupt
* - 1 - MSI Interrupt
* - 2 - MSI-X Interrupt(s)
*
* Default Value: 2
*/
IXGBE_PARAM(InterruptType, "Change Interrupt Mode (0=Legacy, 1=MSI, 2=MSI-X), default 2");
#define IXGBE_INT_LEGACY 0
#define IXGBE_INT_MSI 1
#define IXGBE_INT_MSIX 2
#define IXGBE_DEFAULT_INT IXGBE_INT_MSIX
#if defined(CONFIG_DCA) || defined(CONFIG_DCA_MODULE)
/* DCA - Direct Cache Access (DCA) Control
*
* This option allows the device to hint to DCA enabled processors
* which CPU should have its cache warmed with the data being
* transferred over PCIe. This can increase performance by reducing
* cache misses. ixgbe hardware supports DCA for:
* tx descriptor writeback
* rx descriptor writeback
* rx data
* rx data header only (in packet split mode)
*
* enabling option 2 can cause cache thrash in some tests, particularly
* if the CPU is completely utilized
*
* Option values:
* - 0 - disables DCA
* - 1 - enables DCA
* - 2 - enables DCA with rx data included
*
* Valid Range: 0 - IXGBE_MAX_DCA. IXGBE_MAX_DCA is defined as 0 below, and
* ixgbe_check_options() uses it as the upper bound, so values 1 and 2 are
* currently rejected as out of range.
*
* Default Value: 0 (ixgbe_check_options() defaults to OPTION_DISABLED)
*/
#define IXGBE_MAX_DCA 0
IXGBE_PARAM(DCA, "Disable or enable Direct Cache Access, 0=disabled, 1=descriptor only, 2=descriptor and data");
#endif
/* RXQ - The number of RX queues with RSS distribution
*
* Valid Range: 0-16
* - 0 - Default, min(16, num_online_cpus())
* - 1-16 - sets the Desc. Q's to the specified value.
*
* NOTE(review): the bound actually applied in ixgbe_check_options() is
* IXGBE_MAX_RSS_INDICES; the "16" above assumes that constant is 16.
*
* Default Value: 0
*/
IXGBE_PARAM(RXQ, "Number of RX queues, default 0=number of cpus");
/* TXQ - The number of TX queues
*
* Valid Range: 0-16
* - 0 - Default, min(16, num_online_cpus())
* - 1-16 - sets the Desc. Q's to the specified value.
*
* NOTE(review): the bound actually applied in ixgbe_check_options() is
* IXGBE_MAX_RSS_INDICES; the "16" above assumes that constant is 16.
*
* Default Value: 0
*/
IXGBE_PARAM(TXQ, "Number of TX queues, default 0=number of cpus");
/* Interrupt Throttle Rate (interrupts/sec)
*
* Valid Range: 956-488281 (0=off, 1=dynamic)
*
* Default Value: 956 (DEFAULT_ITR). This is the value applied both when the
* parameter is not given and when an out-of-range value is rejected, so the
* description string below advertises the same number.
*/
#define DEFAULT_ITR 956
IXGBE_PARAM(InterruptThrottleRate, "Maximum interrupts per second, per vector, (956-488281), default 956");
/* Bounds used when validating a user supplied rate. */
#define MAX_ITR IXGBE_MAX_INT_RATE
#define MIN_ITR IXGBE_MIN_INT_RATE
/* Low Latency Interrupt (LLI) parameters. The whole group is compiled out
* when IXGBE_NO_LLI is defined. Each parameter is followed by the
* DEFAULT_/MAX_/MIN_ constants describing its default and valid range.
*/
#ifndef IXGBE_NO_LLI
/* LLIPort (Low Latency Interrupt TCP Port)
*
* Valid Range: 0 - 65535
*
* Default Value: 0 (disabled)
*/
IXGBE_PARAM(LLIPort, "Low Latency Interrupt TCP Port (0-65535)");
#define DEFAULT_LLIPORT 0
#define MAX_LLIPORT 0xFFFF
#define MIN_LLIPORT 0
/* LLIPush (Low Latency Interrupt on TCP Push flag)
*
* Valid Range: 0,1
*
* Default Value: 0 (disabled)
*/
IXGBE_PARAM(LLIPush, "Low Latency Interrupt on TCP Push flag (0,1)");
#define DEFAULT_LLIPUSH 0
#define MAX_LLIPUSH 1
#define MIN_LLIPUSH 0
/* LLISize (Low Latency Interrupt on Packet Size)
*
* Valid Range: 0 - 1500
*
* Default Value: 0 (disabled)
*/
IXGBE_PARAM(LLISize, "Low Latency Interrupt on Packet Size (0-1500)");
#define DEFAULT_LLISIZE 0
#define MAX_LLISIZE 1500
#define MIN_LLISIZE 0
/* LLIEType (Low Latency Interrupt Ethernet Type)
*
* Valid Range: 0 - 0x8fff
*
* Default Value: 0 (disabled)
*/
IXGBE_PARAM(LLIEType, "Low Latency Interrupt Ethernet Protocol Type");
#define DEFAULT_LLIETYPE 0
#define MAX_LLIETYPE 0x8fff
#define MIN_LLIETYPE 0
/* LLIVLANP (Low Latency Interrupt on VLAN priority threshold)
*
* Valid Range: 0 - 7
*
* Default Value: 0 (disabled)
*/
IXGBE_PARAM(LLIVLANP, "Low Latency Interrupt on VLAN priority threshold");
#define DEFAULT_LLIVLANP 0
#define MAX_LLIVLANP 7
#define MIN_LLIVLANP 0
#endif /* IXGBE_NO_LLI */
/* Rx buffer mode
*
* Valid Range: 0-2 0 = 1buf_mode_always, 1 = ps_mode_always and 2 = optimal
*
* Default Value: IXGBE_DEFAULT_RXBUFMODE, defined below as
* IXGBE_RXBUFMODE_1BUF_ALWAYS (0).
*
* NOTE(review): the description string still labels 2 as "(default)"; that
* does not match IXGBE_DEFAULT_RXBUFMODE. Confirm which one is intended
* against the option handling in ixgbe_check_options().
*/
IXGBE_PARAM(RxBufferMode, "0=1 descriptor per packet,\n"
"\t\t\t1=use packet split, multiple descriptors per jumbo frame\n"
"\t\t\t2 (default)=use 1buf mode for 1500 mtu, packet split for jumbo");
#define IXGBE_RXBUFMODE_1BUF_ALWAYS 0
#define IXGBE_RXBUFMODE_PS_ALWAYS 1
#define IXGBE_RXBUFMODE_OPTIMAL 2
#define IXGBE_DEFAULT_RXBUFMODE IXGBE_RXBUFMODE_1BUF_ALWAYS
/* Flow Director filtering mode
*
* Valid Range: 0-2 0 = off, 1 = Hashing (ATR), and 2 = perfect filters
*
* Default Value: IXGBE_DEFAULT_FDIR_FILTER, defined below as
* IXGBE_FDIR_FILTER_OFF (0). The former default of 1 (ATR hashing) is kept
* as a commented-out define.
*/
IXGBE_PARAM(FdirMode, "Flow Director filtering modes:\n"
"\t\t\t0 = Filtering off\n"
"\t\t\t1 = Signature Hashing filters (SW ATR)\n"
"\t\t\t2 = Perfect Filters");
#define IXGBE_FDIR_FILTER_OFF 0
#define IXGBE_FDIR_FILTER_HASH 1
#define IXGBE_FDIR_FILTER_PERFECT 2
/* #define IXGBE_DEFAULT_FDIR_FILTER IXGBE_FDIR_FILTER_HASH */
#define IXGBE_DEFAULT_FDIR_FILTER IXGBE_FDIR_FILTER_OFF
/* Flow Director packet buffer allocation level
*
* Valid Range: 0-2 0 = 8k hash/2k perfect, 1 = 16k hash/4k perfect,
* 2 = 32k hash/8k perfect
*
* Default Value: 0
*/
IXGBE_PARAM(FdirPballoc, "Flow Director packet buffer allocation level:\n"
"\t\t\t0 = 8k hash filters or 2k perfect filters\n"
"\t\t\t1 = 16k hash filters or 4k perfect filters\n"
"\t\t\t2 = 32k hash filters or 8k perfect filters");
#define IXGBE_FDIR_PBALLOC_64K 0
#define IXGBE_FDIR_PBALLOC_128K 1
#define IXGBE_FDIR_PBALLOC_256K 2
#define IXGBE_DEFAULT_FDIR_PBALLOC IXGBE_FDIR_PBALLOC_64K
/* Software ATR packet sample rate
*
* Valid Range: 0-100 0 = off, 1-100 = rate of Tx packet inspection
*
* Default Value: 20
*
* The constants below split that range: IXGBE_ATR_SAMPLE_RATE_OFF (0) is
* the "off" value, IXGBE_MIN/MAX_ATR_SAMPLE_RATE bound the 1-100 part.
*/
IXGBE_PARAM(AtrSampleRate, "Software ATR Tx packet sample rate");
#define IXGBE_MAX_ATR_SAMPLE_RATE 100
#define IXGBE_MIN_ATR_SAMPLE_RATE 1
#define IXGBE_ATR_SAMPLE_RATE_OFF 0
#define IXGBE_DEFAULT_ATR_SAMPLE_RATE 20
struct ixgbe_option {
enum { enable_option, range_option, list_option } type;
const char *name;
const char *err;
int def;
union {
struct { /* range_option info */
int min;
int max;
} r;
struct { /* list_option info */
int nr;
const struct ixgbe_opt_list {
int i;
char *str;
} *p;
} l;
} arg;
};
/*
 * ixgbe_validate_option - check one parameter value against its descriptor
 * @value: in/out; replaced by opt->def when unset or invalid
 * @opt:   descriptor selecting the kind of check and its arguments
 *
 * Returns 0 when the value is unset (default substituted silently) or
 * passes the check, -1 when it is rejected (default substituted and a
 * message logged).
 */
static int __devinit ixgbe_validate_option(unsigned int *value,
					   struct ixgbe_option *opt)
{
	const struct ixgbe_opt_list *entry;
	int idx;

	/* Nothing supplied by the user: take the default without logging */
	if (*value == OPTION_UNSET) {
		*value = opt->def;
		return 0;
	}

	if (opt->type == enable_option) {
		if (*value == OPTION_ENABLED) {
			printk(KERN_INFO "ixgbe: %s Enabled\n", opt->name);
			return 0;
		}
		if (*value == OPTION_DISABLED) {
			printk(KERN_INFO "ixgbe: %s Disabled\n", opt->name);
			return 0;
		}
	} else if (opt->type == range_option) {
		if (!(*value < opt->arg.r.min || *value > opt->arg.r.max)) {
			printk(KERN_INFO "ixgbe: %s set to %d\n", opt->name, *value);
			return 0;
		}
	} else if (opt->type == list_option) {
		for (idx = 0; idx < opt->arg.l.nr; idx++) {
			entry = opt->arg.l.p + idx;
			if (*value != entry->i)
				continue;
			/* An empty string means "accept without a message" */
			if (entry->str[0] != '\0')
				printk(KERN_INFO "%s\n", entry->str);
			return 0;
		}
	} else {
		BUG();
	}

	/* Every accepted case returned above; what is left is a rejection */
	printk(KERN_INFO "ixgbe: Invalid %s specified (%d), %s\n",
	       opt->name, *value, opt->err);
	*value = opt->def;

	return -1;
}
/* Number of elements in array @l (must be a true array, not a pointer). */
#define LIST_LEN(l) (sizeof(l) / sizeof((l)[0]))
/**
* ixgbe_check_options - Range Checking for Command Line Parameters
* @adapter: board private structure
*
* This routine checks all command line parameters for valid user
* input. If an invalid value is given, or if no user specified
* value exists, a default value is used. The final value is stored
* in a variable in the adapter structure.
**/
void __devinit ixgbe_check_options(struct ixgbe_adapter *adapter)
{
int bd = adapter->bd_number;
u32 *aflags = &adapter->flags;
struct ixgbe_ring_feature *feature = adapter->ring_feature;
if (bd >= IXGBE_MAX_NIC) {
printk(KERN_NOTICE
"Warning: no configuration for board #%d\n", bd);
printk(KERN_NOTICE "Using defaults for all values\n");
#ifndef module_param_array
bd = IXGBE_MAX_NIC;
#endif
}
{ /* Linux RX Stack Support */
static struct ixgbe_option opt = {
.type = enable_option,
.name = "Linux TCP/IP stack RX",
.err = "defaulting to Disabled",
.def = OPTION_DISABLED
};
#ifdef module_param_array
if (num_RXKernel > bd) {
#endif
unsigned int tmp = RXKernel[bd];
ixgbe_validate_option(&tmp, &opt);
if (tmp)
*aflags |= IXGBE_FLAG_RX_KERNEL_ENABLE;
else
*aflags &= ~IXGBE_FLAG_RX_KERNEL_ENABLE;
#ifdef module_param_array
} else {
if (opt.def == OPTION_ENABLED)
*aflags |= IXGBE_FLAG_RX_KERNEL_ENABLE;
else
*aflags &= ~IXGBE_FLAG_RX_KERNEL_ENABLE;
}
#endif
}
{ /* Interrupt Type */
unsigned int i_type;
static struct ixgbe_option opt = {
.type = range_option,
.name = "Interrupt Type",
.err =
"using default of "__MODULE_STRING(IXGBE_DEFAULT_INT),
.def = IXGBE_DEFAULT_INT,
.arg = { .r = { .min = IXGBE_INT_LEGACY,
.max = IXGBE_INT_MSIX}}
};
#ifdef module_param_array
if (num_InterruptType > bd) {
#endif
i_type = InterruptType[bd];
ixgbe_validate_option(&i_type, &opt);
switch (i_type) {
case IXGBE_INT_MSIX:
if (!(*aflags & IXGBE_FLAG_MSIX_CAPABLE))
printk(KERN_INFO
"Ignoring MSI-X setting; "
"support unavailable\n");
break;
case IXGBE_INT_MSI:
if (!(*aflags & IXGBE_FLAG_MSI_CAPABLE)) {
printk(KERN_INFO
"Ignoring MSI setting; "
"support unavailable\n");
} else {
*aflags &= ~IXGBE_FLAG_MSIX_CAPABLE;
*aflags &= ~IXGBE_FLAG_DCB_CAPABLE;
}
break;
case IXGBE_INT_LEGACY:
default:
*aflags &= ~IXGBE_FLAG_MSIX_CAPABLE;
*aflags &= ~IXGBE_FLAG_MSI_CAPABLE;
*aflags &= ~IXGBE_FLAG_DCB_CAPABLE;
break;
}
#ifdef module_param_array
} else {
*aflags |= IXGBE_FLAG_MSIX_CAPABLE;
*aflags |= IXGBE_FLAG_MSI_CAPABLE;
}
#endif
}
#if defined(CONFIG_DCA) || defined(CONFIG_DCA_MODULE)
{ /* Direct Cache Access (DCA) */
static struct ixgbe_option opt = {
.type = range_option,
.name = "Direct Cache Access (DCA)",
.err = "defaulting to Enabled",
.def = OPTION_DISABLED,
.arg = { .r = { .min = OPTION_DISABLED,
.max = IXGBE_MAX_DCA}}
};
unsigned int dca = opt.def;
#ifdef module_param_array
if (num_DCA > bd) {
#endif
dca = DCA[bd];
ixgbe_validate_option(&dca, &opt);
if (!dca)
*aflags &= ~IXGBE_FLAG_DCA_CAPABLE;
/* Check Interoperability */
if (!(*aflags & IXGBE_FLAG_DCA_CAPABLE)) {
DPRINTK(PROBE, INFO, "DCA is disabled\n");
*aflags &= ~IXGBE_FLAG_DCA_ENABLED;
}
if (dca == IXGBE_MAX_DCA) {
DPRINTK(PROBE, INFO,
"DCA enabled for rx data\n");
adapter->flags |= IXGBE_FLAG_DCA_ENABLED_DATA;
}
#ifdef module_param_array
} else {
/* make sure to clear the capability flag if the
* option is disabled by default above */
if (opt.def == OPTION_DISABLED)
*aflags &= ~IXGBE_FLAG_DCA_CAPABLE;
}
#endif
if (dca == IXGBE_MAX_DCA)
adapter->flags |= IXGBE_FLAG_DCA_ENABLED_DATA;
}
#endif /* CONFIG_DCA or CONFIG_DCA_MODULE */
{ /* # of RX queues with RSS (RXQ) */
static struct ixgbe_option opt = {
.type = range_option,
.name = "RX queues (RXQ)",
.err = "using default.",
.def = 0,
.arg = { .r = { .min = 0,
.max = IXGBE_MAX_RSS_INDICES}}
};
unsigned int rxq = RXQ[bd];
#ifdef module_param_array
if (num_RXQ > bd) {
#endif
switch (rxq) {
case 0:
/*
* Base it off num_online_cpus() with
* a hardware limit cap.
*/
rxq = min(IXGBE_MAX_RSS_INDICES,
(int)num_online_cpus());
break;
default:
ixgbe_validate_option(&rxq, &opt);
break;
}
feature[RING_F_RXQ].indices = rxq;
*aflags |= IXGBE_FLAG_RSS_ENABLED;
#ifdef module_param_array
} else {
rxq = min(IXGBE_MAX_RSS_INDICES,
(int)num_online_cpus());
feature[RING_F_RXQ].indices = rxq;
*aflags |= IXGBE_FLAG_RSS_ENABLED;
}
#endif
}
{ /* # of TX queues (TXQ) */
static struct ixgbe_option opt = {
.type = range_option,
.name = "TX queues (TXQ)",
.err = "using default.",
.def = 0,
.arg = { .r = { .min = 0,
.max = IXGBE_MAX_RSS_INDICES}}
};
unsigned int txq = TXQ[bd];
#ifdef module_param_array
if (num_TXQ > bd) {
#endif
switch (txq) {
case 0:
/*
* Base it off num_online_cpus() with
* a hardware limit cap.
*/
txq = min(IXGBE_MAX_RSS_INDICES,
(int)num_online_cpus());
break;
default:
ixgbe_validate_option(&txq, &opt);
break;
}
feature[RING_F_TXQ].indices = txq;
#ifdef module_param_array
} else {
txq = min(IXGBE_MAX_RSS_INDICES,
(int)num_online_cpus());
feature[RING_F_TXQ].indices = txq;
}
#endif
}
{ /* Interrupt Throttling Rate */
static struct ixgbe_option opt = {
.type = range_option,
.name = "Interrupt Throttling Rate (ints/sec)",
.err = "using default of "__MODULE_STRING(DEFAULT_ITR),
.def = DEFAULT_ITR,
.arg = { .r = { .min = MIN_ITR,
.max = MAX_ITR }}
};
#ifdef module_param_array
if (num_InterruptThrottleRate > bd) {
#endif
u32 eitr = InterruptThrottleRate[bd];
switch (eitr) {
case 0:
DPRINTK(PROBE, INFO, "%s turned off\n",
opt.name);
/*
* zero is a special value, we don't want to
* turn off ITR completely, just set it to an
* insane interrupt rate
*/
adapter->eitr_param = IXGBE_MAX_INT_RATE;
adapter->itr_setting = 0;
break;
case 1:
DPRINTK(PROBE, INFO, "dynamic interrupt "
"throttling enabled\n");
adapter->eitr_param = 20000;
adapter->itr_setting = 1;
break;
default:
ixgbe_validate_option(&eitr, &opt);
adapter->eitr_param = eitr;
/* the first bit is used as control */
adapter->itr_setting = eitr & ~1;
break;
}
#ifdef module_param_array
} else {
adapter->eitr_param = DEFAULT_ITR;
adapter->itr_setting = DEFAULT_ITR;
}
#endif
}
#ifndef IXGBE_NO_LLI
{ /* Low Latency Interrupt TCP Port*/
static struct ixgbe_option opt = {
.type = range_option,
.name = "Low Latency Interrupt TCP Port",
.err = "using default of "
__MODULE_STRING(DEFAULT_LLIPORT),
.def = DEFAULT_LLIPORT,
.arg = { .r = { .min = MIN_LLIPORT,
.max = MAX_LLIPORT }}
};
#ifdef module_param_array
if (num_LLIPort > bd) {
#endif
adapter->lli_port = LLIPort[bd];
if (adapter->lli_port) {
ixgbe_validate_option(&adapter->lli_port, &opt);
} else {
DPRINTK(PROBE, INFO, "%s turned off\n",
opt.name);
}
#ifdef module_param_array
} else {
adapter->lli_port = opt.def;
}
#endif
}
{ /* Low Latency Interrupt on Packet Size */
static struct ixgbe_option opt = {
.type = range_option,
.name = "Low Latency Interrupt on Packet Size",
.err = "using default of "
__MODULE_STRING(DEFAULT_LLISIZE),
.def = DEFAULT_LLISIZE,
.arg = { .r = { .min = MIN_LLISIZE,
.max = MAX_LLISIZE }}
};
#ifdef module_param_array
if (num_LLISize > bd) {
#endif
adapter->lli_size = LLISize[bd];
if (adapter->lli_size) {
ixgbe_validate_option(&adapter->lli_size, &opt);
} else {
DPRINTK(PROBE, INFO, "%s turned off\n",
opt.name);
}
#ifdef module_param_array
} else {
adapter->lli_size = opt.def;
}
#endif
}
{ /*Low Latency Interrupt on TCP Push flag*/
static struct ixgbe_option opt = {
.type = enable_option,
.name = "Low Latency Interrupt on TCP Push flag",
.err = "defaulting to Disabled",
.def = OPTION_DISABLED
};
#ifdef module_param_array
if (num_LLIPush > bd) {
#endif
unsigned int lli_push = LLIPush[bd];
ixgbe_validate_option(&lli_push, &opt);
if (lli_push)
*aflags |= IXGBE_FLAG_LLI_PUSH;
else
*aflags &= ~IXGBE_FLAG_LLI_PUSH;
#ifdef module_param_array
} else {
if (opt.def == OPTION_ENABLED)
*aflags |= IXGBE_FLAG_LLI_PUSH;
else
*aflags &= ~IXGBE_FLAG_LLI_PUSH;
}
#endif
}
{ /* Low Latency Interrupt EtherType*/
static struct ixgbe_option opt = {
.type = range_option,
.name = "Low Latency Interrupt on Ethernet Protocol Type",
.err = "using default of "
__MODULE_STRING(DEFAULT_LLIETYPE),
.def = DEFAULT_LLIETYPE,
.arg = { .r = { .min = MIN_LLIETYPE,
.max = MAX_LLIETYPE }}
};
#ifdef module_param_array
if (num_LLIEType > bd) {
#endif
adapter->lli_etype = LLIEType[bd];
if (adapter->lli_etype) {
ixgbe_validate_option(&adapter->lli_etype, &opt);
} else {
DPRINTK(PROBE, INFO, "%s turned off\n",
opt.name);
}
#ifdef module_param_array
} else {
adapter->lli_etype = opt.def;
}
#endif
}
{ /* LLI VLAN Priority */
static struct ixgbe_option opt = {
.type = range_option,
.name = "Low Latency Interrupt on VLAN priority threashold",
.err = "using default of "
__MODULE_STRING(DEFAULT_LLIVLANP),
.def = DEFAULT_LLIVLANP,
.arg = { .r = { .min = MIN_LLIVLANP,
.max = MAX_LLIVLANP }}
};
#ifdef module_param_array
if (num_LLIVLANP > bd) {
#endif
adapter->lli_vlan_pri = LLIVLANP[bd];
if (adapter->lli_vlan_pri) {
ixgbe_validate_option(&adapter->lli_vlan_pri, &opt);
} else {
DPRINTK(PROBE, INFO, "%s turned off\n",
opt.name);
}
#ifdef module_param_array
} else {
adapter->lli_vlan_pri = opt.def;
}
#endif
}
#endif /* IXGBE_NO_LLI */
{ /* Rx buffer mode */
unsigned int rx_buf_mode;
static struct ixgbe_option opt = {
.type = range_option,
.name = "Rx buffer mode",
.err = "using default of "
__MODULE_STRING(IXGBE_DEFAULT_RXBUFMODE),
.def = IXGBE_DEFAULT_RXBUFMODE,
.arg = {.r = {.min = IXGBE_RXBUFMODE_1BUF_ALWAYS,
.max = IXGBE_RXBUFMODE_OPTIMAL}}
};
#ifdef module_param_array
if (num_RxBufferMode > bd) {
#endif
rx_buf_mode = RxBufferMode[bd];
ixgbe_validate_option(&rx_buf_mode, &opt);
switch (rx_buf_mode) {
case IXGBE_RXBUFMODE_OPTIMAL:
*aflags |= IXGBE_FLAG_RX_1BUF_CAPABLE;
*aflags |= IXGBE_FLAG_RX_PS_CAPABLE;
break;
case IXGBE_RXBUFMODE_PS_ALWAYS:
*aflags |= IXGBE_FLAG_RX_PS_CAPABLE;
break;
case IXGBE_RXBUFMODE_1BUF_ALWAYS:
*aflags |= IXGBE_FLAG_RX_1BUF_CAPABLE;
default:
break;
}
#ifdef module_param_array
} else {
*aflags |= IXGBE_FLAG_RX_1BUF_CAPABLE;
*aflags |= IXGBE_FLAG_RX_PS_CAPABLE;
}
#endif
}
{ /* Flow Director filtering mode */
unsigned int fdir_filter_mode;
static struct ixgbe_option opt = {
.type = range_option,
.name = "Flow Director filtering mode",
.err = "using default of "
__MODULE_STRING(IXGBE_DEFAULT_FDIR_FILTER),
.def = IXGBE_DEFAULT_FDIR_FILTER,
.arg = {.r = {.min = IXGBE_FDIR_FILTER_OFF,
.max = IXGBE_FDIR_FILTER_PERFECT}}
};
*aflags &= ~IXGBE_FLAG_FDIR_HASH_CAPABLE;
*aflags &= ~IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
if (adapter->hw.mac.type == ixgbe_mac_82598EB)
goto no_flow_director;
#ifdef module_param_array
if (num_FdirMode > bd) {
#endif
#ifdef HAVE_TX_MQ
fdir_filter_mode = FdirMode[bd];
#else
fdir_filter_mode = IXGBE_FDIR_FILTER_OFF;
#endif /* HAVE_TX_MQ */
ixgbe_validate_option(&fdir_filter_mode, &opt);
switch (fdir_filter_mode) {
case IXGBE_FDIR_FILTER_OFF:
DPRINTK(PROBE, INFO, "Flow Director disabled\n");
break;
case IXGBE_FDIR_FILTER_HASH:
*aflags |= IXGBE_FLAG_FDIR_HASH_CAPABLE;
*aflags &= ~IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
feature[RING_F_FDIR].indices =
IXGBE_MAX_FDIR_INDICES;
DPRINTK(PROBE, INFO,
"Flow Director hash filtering enabled\n");
break;
case IXGBE_FDIR_FILTER_PERFECT:
*aflags &= ~IXGBE_FLAG_FDIR_HASH_CAPABLE;
*aflags |= IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
feature[RING_F_FDIR].indices =
IXGBE_MAX_FDIR_INDICES;
spin_lock_init(&adapter->fdir_perfect_lock);
DPRINTK(PROBE, INFO,
"Flow Director perfect filtering enabled\n");
break;
default:
break;
}
#ifdef module_param_array
} else {
#ifdef HAVE_TX_MQ
if (opt.def != IXGBE_FDIR_FILTER_OFF) {
*aflags |= IXGBE_FLAG_FDIR_HASH_CAPABLE;
feature[RING_F_FDIR].indices = IXGBE_MAX_FDIR_INDICES;
DPRINTK(PROBE, INFO,
"Flow Director hash filtering enabled\n");
} else {
#endif /* HAVE_TX_MQ */
*aflags &= ~IXGBE_FLAG_FDIR_HASH_CAPABLE;
*aflags &= ~IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
feature[RING_F_FDIR].indices = 0;
DPRINTK(PROBE, INFO,
"Flow Director hash filtering disabled\n");
#ifdef HAVE_TX_MQ
}
#endif /* HAVE_TX_MQ */
}
/* Check interoperability */
if ((*aflags & IXGBE_FLAG_FDIR_HASH_CAPABLE) ||
(*aflags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE)) {
if (!(*aflags & IXGBE_FLAG_MQ_CAPABLE)) {
DPRINTK(PROBE, INFO,
"Flow Director is not supported "
"while multiple queues are disabled. "
"Disabling Flow Director\n");
*aflags &= ~IXGBE_FLAG_FDIR_HASH_CAPABLE;
*aflags &= ~IXGBE_FLAG_FDIR_PERFECT_CAPABLE;
}
}
#endif
no_flow_director:
/* empty code line with semi-colon */ ;
}
{ /* Flow Director packet buffer allocation */
unsigned int fdir_pballoc_mode;
static struct ixgbe_option opt = {
.type = range_option,
.name = "Flow Director packet buffer allocation",
.err = "using default of "
__MODULE_STRING(IXGBE_DEFAULT_FDIR_PBALLOC),
.def = IXGBE_DEFAULT_FDIR_PBALLOC,
.arg = {.r = {.min = IXGBE_FDIR_PBALLOC_64K,
.max = IXGBE_FDIR_PBALLOC_256K}}
};
char pstring[10];
if ((adapter->hw.mac.type == ixgbe_mac_82598EB) ||
(!(*aflags & (IXGBE_FLAG_FDIR_HASH_CAPABLE |
IXGBE_FLAG_FDIR_PERFECT_CAPABLE))))
goto no_fdir_pballoc;
#ifdef module_param_array
if (num_FdirPballoc > bd) {
#endif
fdir_pballoc_mode = FdirPballoc[bd];
ixgbe_validate_option(&fdir_pballoc_mode, &opt);
switch (fdir_pballoc_mode) {
case IXGBE_FDIR_PBALLOC_64K:
adapter->fdir_pballoc = IXGBE_FDIR_PBALLOC_64K;
sprintf(pstring, "64kB");
break;
case IXGBE_FDIR_PBALLOC_128K:
adapter->fdir_pballoc = IXGBE_FDIR_PBALLOC_128K;
sprintf(pstring, "128kB");
break;
case IXGBE_FDIR_PBALLOC_256K:
adapter->fdir_pballoc = IXGBE_FDIR_PBALLOC_256K;
sprintf(pstring, "256kB");
break;
default:
break;
}
DPRINTK(PROBE, INFO,
"Flow Director allocated %s of packet buffer\n",
pstring);
#ifdef module_param_array
} else {
adapter->fdir_pballoc = opt.def;
DPRINTK(PROBE, INFO,
"Flow Director allocated 64kB of packet buffer\n");
}
#endif
no_fdir_pballoc:
/* empty code line with semi-colon */ ;
}
{ /* Flow Director ATR Tx sample packet rate */
static struct ixgbe_option opt = {
.type = range_option,
.name = "Software ATR Tx packet sample rate",
.err = "using default of "
__MODULE_STRING(IXGBE_DEFAULT_ATR_SAMPLE_RATE),
.def = IXGBE_DEFAULT_ATR_SAMPLE_RATE,
.arg = {.r = {.min = IXGBE_ATR_SAMPLE_RATE_OFF,
.max = IXGBE_MAX_ATR_SAMPLE_RATE}}
};
static const char atr_string[] =
"ATR Tx Packet sample rate set to";
adapter->atr_sample_rate = IXGBE_ATR_SAMPLE_RATE_OFF;
if (adapter->hw.mac.type == ixgbe_mac_82598EB)
goto no_fdir_sample;
/* no sample rate for perfect filtering */
if (*aflags & IXGBE_FLAG_FDIR_PERFECT_CAPABLE)
goto no_fdir_sample;
#ifdef module_param_array
if (num_AtrSampleRate > bd) {
#endif
/* Only enable the sample rate if hashing (ATR) is on */
if (*aflags & IXGBE_FLAG_FDIR_HASH_CAPABLE)
adapter->atr_sample_rate = AtrSampleRate[bd];
if (adapter->atr_sample_rate) {
ixgbe_validate_option(&adapter->atr_sample_rate,
&opt);
DPRINTK(PROBE, INFO, "%s %d\n", atr_string,
adapter->atr_sample_rate);
}
#ifdef module_param_array
} else {
/* Only enable the sample rate if hashing (ATR) is on */
if (*aflags & IXGBE_FLAG_FDIR_HASH_CAPABLE)
adapter->atr_sample_rate = opt.def;
DPRINTK(PROBE, INFO, "%s default of %d\n", atr_string,
adapter->atr_sample_rate);
}
#endif
no_fdir_sample:
/* empty code line with semi-colon */ ;
}
}

1651
io_engine/driver/ixgbe_phy.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,123 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#ifndef _IXGBE_PHY_H_
#define _IXGBE_PHY_H_
#include "ixgbe_type.h"
#define IXGBE_I2C_EEPROM_DEV_ADDR 0xA0
/* EEPROM byte offsets */
#define IXGBE_SFF_IDENTIFIER 0x0
#define IXGBE_SFF_IDENTIFIER_SFP 0x3
#define IXGBE_SFF_VENDOR_OUI_BYTE0 0x25
#define IXGBE_SFF_VENDOR_OUI_BYTE1 0x26
#define IXGBE_SFF_VENDOR_OUI_BYTE2 0x27
#define IXGBE_SFF_1GBE_COMP_CODES 0x6
#define IXGBE_SFF_10GBE_COMP_CODES 0x3
#define IXGBE_SFF_CABLE_TECHNOLOGY 0x8
/* Bitmasks */
#define IXGBE_SFF_DA_PASSIVE_CABLE 0x4
#define IXGBE_SFF_1GBASESX_CAPABLE 0x1
#define IXGBE_SFF_1GBASELX_CAPABLE 0x2
#define IXGBE_SFF_10GBASESR_CAPABLE 0x10
#define IXGBE_SFF_10GBASELR_CAPABLE 0x20
#define IXGBE_I2C_EEPROM_READ_MASK 0x100
#define IXGBE_I2C_EEPROM_STATUS_MASK 0x3
#define IXGBE_I2C_EEPROM_STATUS_NO_OPERATION 0x0
#define IXGBE_I2C_EEPROM_STATUS_PASS 0x1
#define IXGBE_I2C_EEPROM_STATUS_FAIL 0x2
#define IXGBE_I2C_EEPROM_STATUS_IN_PROGRESS 0x3
/* Bit-shift macros */
#define IXGBE_SFF_VENDOR_OUI_BYTE0_SHIFT 24
#define IXGBE_SFF_VENDOR_OUI_BYTE1_SHIFT 16
#define IXGBE_SFF_VENDOR_OUI_BYTE2_SHIFT 8
/* Vendor OUIs: format of OUI is 0x[byte0][byte1][byte2][00] */
#define IXGBE_SFF_VENDOR_OUI_TYCO 0x00407600
#define IXGBE_SFF_VENDOR_OUI_FTL 0x00906500
#define IXGBE_SFF_VENDOR_OUI_AVAGO 0x00176A00
#define IXGBE_SFF_VENDOR_OUI_INTEL 0x001B2100
/* I2C SDA and SCL timing parameters for standard mode */
#define IXGBE_I2C_T_HD_STA 4
#define IXGBE_I2C_T_LOW 5
#define IXGBE_I2C_T_HIGH 4
#define IXGBE_I2C_T_SU_STA 5
#define IXGBE_I2C_T_HD_DATA 5
#define IXGBE_I2C_T_SU_DATA 1
#define IXGBE_I2C_T_RISE 1
#define IXGBE_I2C_T_FALL 1
#define IXGBE_I2C_T_SU_STO 4
#define IXGBE_I2C_T_BUF 5
/*
 * Generic PHY entry points. All of them operate on a struct ixgbe_hw.
 * NOTE(review): the s32 results are presumably IXGBE status codes
 * (0 on success) - confirm against the definitions.
 */
s32 ixgbe_init_phy_ops_generic(struct ixgbe_hw *hw);
bool ixgbe_validate_phy_addr(struct ixgbe_hw *hw, u32 phy_addr);
enum ixgbe_phy_type ixgbe_get_phy_type_from_id(u32 phy_id);
s32 ixgbe_get_phy_id(struct ixgbe_hw *hw);
s32 ixgbe_identify_phy_generic(struct ixgbe_hw *hw);
s32 ixgbe_reset_phy_generic(struct ixgbe_hw *hw);
/* Register accessors: the read variant returns the value through @phy_data */
s32 ixgbe_read_phy_reg_generic(struct ixgbe_hw *hw, u32 reg_addr,
u32 device_type, u16 *phy_data);
s32 ixgbe_write_phy_reg_generic(struct ixgbe_hw *hw, u32 reg_addr,
u32 device_type, u16 phy_data);
s32 ixgbe_setup_phy_link_generic(struct ixgbe_hw *hw);
s32 ixgbe_setup_phy_link_speed_generic(struct ixgbe_hw *hw,
ixgbe_link_speed speed,
bool autoneg,
bool autoneg_wait_to_complete);
/* @speed and @autoneg are output parameters */
s32 ixgbe_get_copper_link_capabilities_generic(struct ixgbe_hw *hw,
ixgbe_link_speed *speed,
bool *autoneg);
/* PHY specific */
s32 ixgbe_check_phy_link_tnx(struct ixgbe_hw *hw,
ixgbe_link_speed *speed,
bool *link_up);
s32 ixgbe_setup_phy_link_tnx(struct ixgbe_hw *hw);
s32 ixgbe_get_phy_firmware_version_tnx(struct ixgbe_hw *hw,
u16 *firmware_version);
s32 ixgbe_get_phy_firmware_version_aq(struct ixgbe_hw *hw,
u16 *firmware_version);
s32 ixgbe_reset_phy_nl(struct ixgbe_hw *hw);
s32 ixgbe_identify_sfp_module_generic(struct ixgbe_hw *hw);
s32 ixgbe_get_sfp_init_sequence_offsets(struct ixgbe_hw *hw,
u16 *list_offset,
u16 *data_offset);
/* Byte-wide I2C accessors; the *_eeprom_* variants take no device address */
s32 ixgbe_read_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset,
u8 dev_addr, u8 *data);
s32 ixgbe_write_i2c_byte_generic(struct ixgbe_hw *hw, u8 byte_offset,
u8 dev_addr, u8 data);
s32 ixgbe_read_i2c_eeprom_generic(struct ixgbe_hw *hw, u8 byte_offset,
u8 *eeprom_data);
s32 ixgbe_write_i2c_eeprom_generic(struct ixgbe_hw *hw, u8 byte_offset,
u8 eeprom_data);
#endif /* _IXGBE_PHY_H_ */

View File

@ -0,0 +1,29 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include "ixgbe.h"

File diff suppressed because it is too large Load Diff

596
io_engine/driver/kcompat.c Normal file
View File

@ -0,0 +1,596 @@
/*******************************************************************************
Intel 10 Gigabit PCI Express Linux driver
Copyright(c) 1999 - 2009 Intel Corporation.
This program is free software; you can redistribute it and/or modify it
under the terms and conditions of the GNU General Public License,
version 2, as published by the Free Software Foundation.
This program is distributed in the hope it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
The full GNU General Public License is included in this distribution in
the file called "COPYING".
Contact Information:
e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
*******************************************************************************/
#include "ixgbe.h"
#include "kcompat.h"
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,4,21) )
struct sk_buff *
_kc_skb_pad(struct sk_buff *skb, int pad)
{
struct sk_buff *nskb;
/* If the skbuff is non linear tailroom is always zero.. */
if(skb_tailroom(skb) >= pad)
{
memset(skb->data+skb->len, 0, pad);
return skb;
}
nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC);
kfree_skb(skb);
if(nskb)
memset(nskb->data+nskb->len, 0, pad);
return nskb;
}
#endif /* < 2.4.21 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,4,13) )
/**************************************/
/* PCI DMA MAPPING */
#if defined(CONFIG_HIGHMEM)
#ifndef PCI_DRAM_OFFSET
#define PCI_DRAM_OFFSET 0
#endif
/*
 * _kc_pci_map_page (CONFIG_HIGHMEM variant) - derive a bus address for a page
 *
 * The address is computed from the page's index in mem_map, shifted by
 * PAGE_SHIFT, plus @offset and PCI_DRAM_OFFSET.  @dev, @size and @direction
 * are not used by this variant.
 */
u64
_kc_pci_map_page(struct pci_dev *dev, struct page *page, unsigned long offset,
		 size_t size, int direction)
{
	u64 bus_addr = (u64) (page - mem_map) << PAGE_SHIFT;

	bus_addr += offset;
	bus_addr += PCI_DRAM_OFFSET;
	return bus_addr;
}
#else /* CONFIG_HIGHMEM */
/*
 * _kc_pci_map_page (non-HIGHMEM variant) - map part of a page for DMA
 *
 * Resolves the page to its kernel virtual address, adds @offset and hands
 * the result to pci_map_single().
 */
u64
_kc_pci_map_page(struct pci_dev *dev, struct page *page, unsigned long offset,
		 size_t size, int direction)
{
	void *vaddr = (void *)page_address(page) + offset;

	return pci_map_single(dev, vaddr, size, direction);
}
#endif /* CONFIG_HIGHMEM */
/*
 * _kc_pci_unmap_page - undo a mapping created by _kc_pci_map_page()
 * @dev: PCI device the mapping belongs to
 * @dma_addr: bus address returned by the map call
 * @size: length of the mapping in bytes
 * @direction: DMA direction used when mapping
 *
 * Thin wrapper around pci_unmap_single().  The call is a plain statement:
 * a void function must not "return" an expression.
 */
void
_kc_pci_unmap_page(struct pci_dev *dev, u64 dma_addr, size_t size,
		   int direction)
{
	pci_unmap_single(dev, dma_addr, size, direction);
}
#endif /* 2.4.13 => 2.4.3 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,4,3) )
/**************************************/
/* PCI DRIVER API */
/*
 * _kc_pci_set_dma_mask - record the DMA addressing mask for a device
 * @dev: PCI device to configure
 * @mask: requested DMA address mask
 *
 * Returns 0 after storing @mask in dev->dma_mask, or -EIO when
 * pci_dma_supported() rejects the mask.
 */
int
_kc_pci_set_dma_mask(struct pci_dev *dev, dma_addr_t mask)
{
	if (pci_dma_supported(dev, mask)) {
		dev->dma_mask = mask;
		return 0;
	}

	return -EIO;
}
/*
 * _kc_pci_request_regions - reserve every I/O and memory BAR of a device
 * @dev: PCI device whose resources are reserved
 * @res_name: owner name recorded with each reservation
 *
 * Walks the six standard BARs, skipping empty ones, and reserves each I/O
 * or memory range.  Returns 0 on success.  On the first failure only the
 * BARs reserved by this call are released again and -EBUSY is returned;
 * releasing BARs that were never acquired here could free a range owned
 * by somebody else.
 */
int
_kc_pci_request_regions(struct pci_dev *dev, char *res_name)
{
	int i;

	for (i = 0; i < 6; i++) {
		if (pci_resource_len(dev, i) == 0)
			continue;

		if (pci_resource_flags(dev, i) & IORESOURCE_IO) {
			if (!request_region(pci_resource_start(dev, i),
					    pci_resource_len(dev, i),
					    res_name))
				goto err_unwind;
		} else if (pci_resource_flags(dev, i) & IORESOURCE_MEM) {
			if (!request_mem_region(pci_resource_start(dev, i),
						pci_resource_len(dev, i),
						res_name))
				goto err_unwind;
		}
	}

	return 0;

err_unwind:
	/* BAR i failed; give back BARs i-1 .. 0, which this call acquired. */
	while (--i >= 0) {
		if (pci_resource_len(dev, i) == 0)
			continue;

		if (pci_resource_flags(dev, i) & IORESOURCE_IO)
			release_region(pci_resource_start(dev, i),
				       pci_resource_len(dev, i));
		else if (pci_resource_flags(dev, i) & IORESOURCE_MEM)
			release_mem_region(pci_resource_start(dev, i),
					   pci_resource_len(dev, i));
	}

	return -EBUSY;
}
/*
 * _kc_pci_release_regions - release every I/O and memory BAR of a device
 * @dev: PCI device whose resources are released
 *
 * Empty BARs are skipped; I/O ranges go through release_region() and
 * memory ranges through release_mem_region().
 */
void
_kc_pci_release_regions(struct pci_dev *dev)
{
	int bar = 0;

	while (bar < 6) {
		if (pci_resource_len(dev, bar) != 0) {
			if (pci_resource_flags(dev, bar) & IORESOURCE_IO)
				release_region(pci_resource_start(dev, bar),
					       pci_resource_len(dev, bar));
			else if (pci_resource_flags(dev, bar) & IORESOURCE_MEM)
				release_mem_region(pci_resource_start(dev, bar),
						   pci_resource_len(dev, bar));
		}
		bar++;
	}
}
/**************************************/
/* NETWORK DRIVER API */
/*
 * _kc_alloc_etherdev - allocate and initialise an Ethernet net_device
 * @sizeof_priv: size of the driver-private area to place behind the device
 *
 * One zeroed allocation holds the net_device, the private area, IFNAMSIZ
 * spare bytes and 31 bytes of alignment slack.  When a private area is
 * requested, dev->priv points at the first 32-byte aligned address after
 * the net_device.  Returns the device after ether_setup(), or NULL if the
 * allocation fails.
 */
struct net_device *
_kc_alloc_etherdev(int sizeof_priv)
{
	struct net_device *netdev;
	int total;

	total = sizeof(*netdev) + sizeof_priv + IFNAMSIZ + 31;
	netdev = kmalloc(total, GFP_KERNEL);
	if (netdev == NULL)
		return NULL;

	memset(netdev, 0, total);

	if (sizeof_priv) {
		unsigned long priv_addr = (unsigned long)(netdev + 1) + 31;

		/* round up to the next 32-byte boundary */
		netdev->priv = (void *) (priv_addr & ~31);
	}

	netdev->name[0] = '\0';
	ether_setup(netdev);

	return netdev;
}
/*
 * _kc_is_valid_ether_addr - check a 6-byte Ethernet address for usability
 * @addr: the six address bytes
 *
 * Returns 1 when the low bit of the first byte is clear (not a group
 * address) and the address is not all zeroes; returns 0 otherwise.
 */
int
_kc_is_valid_ether_addr(u8 *addr)
{
	const char all_zero[6] = { 0, };

	if (addr[0] & 1)
		return 0;

	return memcmp(addr, all_zero, 6) != 0;
}
#endif /* 2.4.3 => 2.4.0 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,4,6) )
/*
 * _kc_pci_set_power_state - stub for kernels older than 2.4.6
 * @dev: ignored
 * @state: ignored
 *
 * Performs no power-state change and always reports success (0).
 */
int
_kc_pci_set_power_state(struct pci_dev *dev, int state)
{
	const int success = 0;

	return success;
}
/*
 * _kc_pci_enable_wake - stub for kernels older than 2.4.6
 * @pdev: ignored
 * @state: ignored
 * @enable: ignored
 *
 * Performs no wake-up configuration and always reports success (0).
 */
int
_kc_pci_enable_wake(struct pci_dev *pdev, u32 state, int enable)
{
	const int success = 0;

	return success;
}
#endif /* 2.4.6 => 2.4.3 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) )
/*
 * _kc_skb_fill_page_desc - describe one paged fragment of an skb
 * @skb: buffer whose fragment table is updated
 * @i: fragment slot to fill
 * @page: page backing the fragment
 * @off: offset of the data within @page
 * @size: length of the fragment
 *
 * Fills slot @i and sets the fragment count to @i + 1.
 */
void _kc_skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page,
			    int off, int size)
{
	skb_frag_t *slot = skb_shinfo(skb)->frags + i;

	slot->page = page;
	slot->page_offset = off;
	slot->size = size;

	skb_shinfo(skb)->nr_frags = i + 1;
}
/*
* Original Copyright:
* find_next_bit.c: fallback find next bit implementation
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
/**
 * find_next_bit - find the next set bit in a memory region
 * @addr: The address to base the search on
 * @size: The maximum size to search
 * @offset: The bitnumber to start searching at
 *
 * Returns the zero-based number of the first set bit at or after @offset,
 * or @size if no bit is set in that range.
 */
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
			    unsigned long offset)
{
	const unsigned long *p = addr + BITOP_WORD(offset);
	unsigned long result = offset & ~(BITS_PER_LONG-1);
	unsigned long tmp;

	if (offset >= size)
		return size;
	size -= result;
	offset %= BITS_PER_LONG;
	if (offset) {
		/* partial first word: ignore the bits below @offset */
		tmp = *(p++);
		tmp &= (~0UL << offset);
		if (size < BITS_PER_LONG)
			goto found_first;
		if (tmp)
			goto found_middle;
		size -= BITS_PER_LONG;
		result += BITS_PER_LONG;
	}
	/* whole words */
	while (size & ~(BITS_PER_LONG-1)) {
		if ((tmp = *(p++)))
			goto found_middle;
		result += BITS_PER_LONG;
		size -= BITS_PER_LONG;
	}
	if (!size)
		return result;
	tmp = *p;

found_first:
	/* partial last word: ignore the bits at and above @size */
	tmp &= (~0UL >> (BITS_PER_LONG - size));
	if (tmp == 0UL)		/* Are any bits set? */
		return result + size;	/* Nope. */
found_middle:
	/*
	 * tmp is non-zero here.  Add the zero-based position of its lowest
	 * set bit.  ffs() must not be used for this: it is one-based and
	 * takes an int, so it would be off by one and would drop the upper
	 * half of a 64-bit word.
	 */
	while (!(tmp & 1UL)) {
		tmp >>= 1;
		result++;
	}
	return result;
}
#endif /* 2.6.0 => 2.4.6 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) )
/*
 * _kc_kzalloc - kmalloc() followed by zeroing of the new buffer
 * @size: number of bytes to allocate
 * @flags: allocation flags passed through to kmalloc()
 *
 * Returns the zero-filled buffer, or NULL if kmalloc() fails.
 */
void *_kc_kzalloc(size_t size, int flags)
{
	void *buf = kmalloc(size, flags);

	if (!buf)
		return NULL;

	memset(buf, 0, size);
	return buf;
}
#endif /* <= 2.6.13 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) )
/*
 * _kc_netdev_alloc_skb - allocate a receive skb bound to a net_device
 * @dev: device recorded in skb->dev
 * @length: payload bytes the caller needs
 *
 * Allocates @length + 16 bytes with GFP_ATOMIC and reserves the extra
 * 16 bytes as headroom (16 == NET_PAD_SKB).  Returns NULL on allocation
 * failure.
 */
struct sk_buff *_kc_netdev_alloc_skb(struct net_device *dev,
				     unsigned int length)
{
	struct sk_buff *skb = alloc_skb(length + 16, GFP_ATOMIC);

	if (unlikely(skb == NULL))
		return NULL;

	skb_reserve(skb, 16);
	skb->dev = dev;
	return skb;
}
#endif /* <= 2.6.17 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) )
/*
 * _kc_pci_save_state - snapshot a device's PCI configuration space
 * @pdev: PCI device; its drvdata must be the adapter's net_device
 *
 * Copies the configuration space, one dword at a time, into
 * adapter->config_space, which is allocated here.  The snapshot covers
 * PCI_CONFIG_SPACE_LEN bytes, or PCIE_CONFIG_SPACE_LEN bytes when the
 * device has a PCI Express capability whose link status word can be read.
 *
 * Returns 0 on success or -ENOMEM when the buffer cannot be allocated.
 */
int _kc_pci_save_state(struct pci_dev *pdev)
{
struct net_device *netdev = pci_get_drvdata(pdev);
struct adapter_struct *adapter = netdev_priv(netdev);
int size = PCI_CONFIG_SPACE_LEN, i;
u16 pcie_cap_offset = pci_find_capability(pdev, PCI_CAP_ID_EXP);
u16 pcie_link_status;
if (pcie_cap_offset) {
/* pci_read_config_word() returning 0 means the read succeeded */
if (!pci_read_config_word(pdev,
pcie_cap_offset + PCIE_LINK_STATUS,
&pcie_link_status))
size = PCIE_CONFIG_SPACE_LEN;
}
/* NOTE(review): macro defined outside this file; it may refer to the
 * locals 'adapter' and 'size' by name - confirm before renaming them */
pci_config_space_ich8lan();
#ifdef HAVE_PCI_ERS
/* _kc_pci_restore_state() keeps the buffer in this configuration, so it
 * is only allocated when none exists yet */
if (adapter->config_space == NULL)
#else
/* _kc_pci_restore_state() frees the buffer in this configuration; a
 * buffer still present here would be overwritten by the allocation below */
WARN_ON(adapter->config_space != NULL);
#endif
adapter->config_space = kmalloc(size, GFP_KERNEL);
if (!adapter->config_space) {
printk(KERN_ERR "Out of memory in pci_save_state\n");
return -ENOMEM;
}
/* size is in bytes; config space is read in 4-byte units */
for (i = 0; i < (size / 4); i++)
pci_read_config_dword(pdev, i * 4, &adapter->config_space[i]);
return 0;
}
/*
 * _kc_pci_restore_state - write a saved snapshot back to config space
 * @pdev: PCI device; its drvdata must be the adapter's net_device
 *
 * Does nothing when no snapshot exists.  Otherwise the snapshot is written
 * back dword by dword, using the same length selection as the save path.
 * Unless HAVE_PCI_ERS is defined the snapshot buffer is freed afterwards.
 */
void _kc_pci_restore_state(struct pci_dev * pdev)
{
	struct net_device *netdev = pci_get_drvdata(pdev);
	struct adapter_struct *adapter = netdev_priv(netdev);
	/* 'adapter' and 'size' keep their names: the macro below may use them */
	int size = PCI_CONFIG_SPACE_LEN;
	int dword;
	u16 cap_pos;
	u16 link_status;

	if (adapter->config_space == NULL)
		return;

	cap_pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
	if (cap_pos) {
		if (!pci_read_config_word(pdev, cap_pos + PCIE_LINK_STATUS,
					  &link_status))
			size = PCIE_CONFIG_SPACE_LEN;
	}

	pci_config_space_ich8lan();

	for (dword = 0; dword < (size / 4); dword++)
		pci_write_config_dword(pdev, dword * 4,
				       adapter->config_space[dword]);

#ifndef HAVE_PCI_ERS
	kfree(adapter->config_space);
	adapter->config_space = NULL;
#endif
}
#ifdef HAVE_PCI_ERS
/*
 * _kc_free_netdev - release a net_device and its saved PCI config snapshot
 * @netdev: device to release
 *
 * Frees adapter->config_space when present.  With CONFIG_SYSFS, a device
 * that is not in the NETREG_UNINITIALIZED state must be NETREG_UNREGISTERED;
 * it is marked released and its class device reference is dropped.  In all
 * other cases the original allocation (netdev minus its padding) is freed
 * directly.
 */
void _kc_free_netdev(struct net_device *netdev)
{
	struct adapter_struct *adapter = netdev_priv(netdev);

	if (adapter->config_space)
		kfree(adapter->config_space);

#ifdef CONFIG_SYSFS
	if (netdev->reg_state != NETREG_UNINITIALIZED) {
		BUG_ON(netdev->reg_state != NETREG_UNREGISTERED);
		netdev->reg_state = NETREG_RELEASED;
		class_device_put(&netdev->class_dev);
		return;
	}
#endif
	kfree((char *)netdev - netdev->padded);
}
#endif
#endif /* <= 2.6.18 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) )
/*
 * ixgbe_dcb_netlink_register - stub for kernels older than 2.6.23
 *
 * Registers nothing and always returns 0.  Declared with (void) so the
 * definition is a real prototype rather than an old-style declaration
 * that accepts any arguments.
 */
int ixgbe_dcb_netlink_register(void)
{
	return 0;
}
/*
 * ixgbe_dcb_netlink_unregister - stub for kernels older than 2.6.23
 *
 * Unregisters nothing and always returns 0.  Declared with (void) so the
 * definition is a real prototype rather than an old-style declaration
 * that accepts any arguments.
 */
int ixgbe_dcb_netlink_unregister(void)
{
	return 0;
}
#endif /* < 2.6.23 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) )
#ifdef NAPI
/*
 * napi_to_poll_dev - map a napi_struct to its polling net_device
 * @napi: napi context embedded in a struct adapter_q_vector
 *
 * Returns the address of the poll_dev member of the enclosing q_vector.
 */
struct net_device *napi_to_poll_dev(struct napi_struct *napi)
{
	struct adapter_q_vector *owner;

	owner = container_of(napi, struct adapter_q_vector, napi);
	return &owner->poll_dev;
}
/*
 * __kc_adapter_clean - old-style poll entry that forwards to napi->poll
 * @netdev: polling device; netdev->priv holds the napi_struct
 * @budget: remaining budget, decremented by the work done
 *
 * Polls with min(*budget, netdev->quota) as the limit and charges the work
 * done to both counters.  Returns 1 when the limit was used up, 0 otherwise.
 */
int __kc_adapter_clean(struct net_device *netdev, int *budget)
{
	/* kcompat.h netif_napi_add puts napi struct in "fake netdev->priv" */
	struct napi_struct *napi = netdev->priv;
	int limit = min(*budget, netdev->quota);
	int done = napi->poll(napi, limit);

	*budget -= done;
	netdev->quota -= done;

	if (done >= limit)
		return 1;
	return 0;
}
#endif /* NAPI */
#endif /* <= 2.6.24 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) )
#ifdef HAVE_TX_MQ
/*
 * _kc_netif_tx_stop_all_queues - stop the device queue and all Tx subqueues
 * @netdev: device to stop
 *
 * Subqueues 0 .. adapter->num_tx_queues - 1 are stopped only when the
 * device is multiqueue.
 */
void _kc_netif_tx_stop_all_queues(struct net_device *netdev)
{
	struct adapter_struct *adapter = netdev_priv(netdev);
	int q;

	netif_stop_queue(netdev);

	if (!netif_is_multiqueue(netdev))
		return;

	for (q = 0; q < adapter->num_tx_queues; q++)
		netif_stop_subqueue(netdev, q);
}
/*
 * _kc_netif_tx_wake_all_queues - wake the device queue and all Tx subqueues
 * @netdev: device to wake
 *
 * Subqueues 0 .. adapter->num_tx_queues - 1 are woken only when the
 * device is multiqueue.
 */
void _kc_netif_tx_wake_all_queues(struct net_device *netdev)
{
	struct adapter_struct *adapter = netdev_priv(netdev);
	int q;

	netif_wake_queue(netdev);

	if (!netif_is_multiqueue(netdev))
		return;

	for (q = 0; q < adapter->num_tx_queues; q++)
		netif_wake_subqueue(netdev, q);
}
/*
 * _kc_netif_tx_start_all_queues - start the device queue and all Tx subqueues
 * @netdev: device to start
 *
 * Subqueues 0 .. adapter->num_tx_queues - 1 are started only when the
 * device is multiqueue.
 */
void _kc_netif_tx_start_all_queues(struct net_device *netdev)
{
	struct adapter_struct *adapter = netdev_priv(netdev);
	int q;

	netif_start_queue(netdev);

	if (!netif_is_multiqueue(netdev))
		return;

	for (q = 0; q < adapter->num_tx_queues; q++)
		netif_start_subqueue(netdev, q);
}
#endif /* HAVE_TX_MQ */
#endif /* < 2.6.27 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) )
/*
 * _kc_pci_prepare_to_sleep - arm wake-up and enter the suspend power state
 * @dev: PCI device to put to sleep
 *
 * The target state is the one pci_choose_state() picks for PMSG_SUSPEND.
 * Wake-up is enabled first; if the power-state change then fails, wake-up
 * is disabled again.  Returns the result of pci_set_power_state().
 */
int
_kc_pci_prepare_to_sleep(struct pci_dev *dev)
{
	pci_power_t sleep_state = pci_choose_state(dev, PMSG_SUSPEND);
	int rc;

	pci_enable_wake(dev, sleep_state, true);

	rc = pci_set_power_state(dev, sleep_state);
	if (rc != 0)
		pci_enable_wake(dev, sleep_state, false);

	return rc;
}
/*
 * _kc_pci_wake_from_d3 - enable or disable wake-up from both D3 states
 * @dev: PCI device to configure
 * @enable: true to enable wake-up, false to disable it
 *
 * Configures D3cold first and, only if that succeeds, D3hot.  Returns the
 * first non-zero result of pci_enable_wake(), or 0.
 */
int
_kc_pci_wake_from_d3(struct pci_dev *dev, bool enable)
{
	int rc = pci_enable_wake(dev, PCI_D3cold, enable);

	if (!rc)
		rc = pci_enable_wake(dev, PCI_D3hot, enable);

	return rc;
}
#endif /* < 2.6.28 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) )
/*
 * _kc_pci_disable_link_state - clear link-control bits on the parent bridge
 * @pdev: device whose upstream bridge is modified
 * @state: bits to clear in the bridge's PCI_EXP_LNKCTL register
 *
 * Does nothing when the device has no parent bridge or the bridge has no
 * PCI Express capability.
 */
void _kc_pci_disable_link_state(struct pci_dev *pdev, int state)
{
	struct pci_dev *bridge = pdev->bus->self;
	u16 lnkctl;
	int cap;

	if (!bridge)
		return;

	cap = pci_find_capability(bridge, PCI_CAP_ID_EXP);
	if (!cap)
		return;

	pci_read_config_word(bridge, cap + PCI_EXP_LNKCTL, &lnkctl);
	lnkctl &= ~state;
	pci_write_config_word(bridge, cap + PCI_EXP_LNKCTL, lnkctl);
}
#endif /* < 2.6.29 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) )
#ifdef HAVE_NETDEV_SELECT_QUEUE
#include <net/ip.h>
/* Random seed mixed into every Tx hash; filled lazily on first use below */
static u32 _kc_simple_tx_hashrnd;
/* Non-zero once the seed above has been initialised */
static u32 _kc_simple_tx_hashrnd_initialized;
/*
 * _kc_skb_tx_hash - pick a Tx queue for a packet from a flow hash
 * @dev: transmitting device; supplies real_num_tx_queues
 * @skb: packet to classify
 *
 * Hashes two address words and, for the listed transport protocols, the
 * first 32 bits following the network header.  Packets that are neither
 * IPv4 nor IPv6 always map to queue 0.  The 32-bit hash is scaled into
 * the range [0, dev->real_num_tx_queues).
 */
u16 _kc_skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
{
u32 addr1, addr2, ports;
u32 hash, ihl;
u8 ip_proto = 0;
/* NOTE(review): the lazy seed initialisation is not synchronised */
if (unlikely(!_kc_simple_tx_hashrnd_initialized)) {
get_random_bytes(&_kc_simple_tx_hashrnd, 4);
_kc_simple_tx_hashrnd_initialized = 1;
}
switch (skb->protocol) {
case htons(ETH_P_IP):
/* fragments keep ip_proto == 0, so they hash on addresses only */
if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
ip_proto = ip_hdr(skb)->protocol;
addr1 = ip_hdr(skb)->saddr;
addr2 = ip_hdr(skb)->daddr;
ihl = ip_hdr(skb)->ihl;
break;
case htons(ETH_P_IPV6):
ip_proto = ipv6_hdr(skb)->nexthdr;
/* only the last 32 bits of each IPv6 address feed the hash */
addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
/* fixed 40-byte header expressed in 32-bit words, like the IPv4 ihl */
ihl = (40 >> 2);
break;
default:
return 0;
}
switch (ip_proto) {
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_DCCP:
case IPPROTO_ESP:
case IPPROTO_AH:
case IPPROTO_SCTP:
case IPPROTO_UDPLITE:
/* first 32 bits after the network header - presumably the port pair
 * (or SPI) of these protocols */
ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
break;
default:
ports = 0;
break;
}
hash = jhash_3words(addr1, addr2, ports, _kc_simple_tx_hashrnd);
/* multiply-and-shift scales the hash to a queue index without a modulo */
return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
#endif /* HAVE_NETDEV_SELECT_QUEUE */
#endif /* < 2.6.30 */
/*****************************************************************************/
#if ( LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) )
/*
 * _kc_ethtool_op_set_flags - apply ethtool flag bits to dev->features
 * @dev: device whose feature word is updated
 * @data: requested ETH_FLAG_* bits
 * @supported: ETH_FLAG_* bits the caller allows
 *
 * Returns -EINVAL, leaving the device untouched, when @data contains a bit
 * outside @supported.  Otherwise each of LRO, NTUPLE and RXHASH that the
 * kernel headers define is set or cleared to match @data, and 0 is
 * returned.
 */
int _kc_ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
{
	unsigned long feats;

	if (data & ~supported)
		return -EINVAL;

	feats = dev->features;

#ifdef NETIF_F_LRO
	if (data & ETH_FLAG_LRO)
		feats |= NETIF_F_LRO;
	else
		feats &= ~NETIF_F_LRO;
#endif
#ifdef NETIF_F_NTUPLE
	if (data & ETH_FLAG_NTUPLE)
		feats |= NETIF_F_NTUPLE;
	else
		feats &= ~NETIF_F_NTUPLE;
#endif
#ifdef NETIF_F_RXHASH
	if (data & ETH_FLAG_RXHASH)
		feats |= NETIF_F_RXHASH;
	else
		feats &= ~NETIF_F_RXHASH;
#endif

	dev->features = feats;
	return 0;
}
#endif /* < 2.6.36 */

1961
io_engine/driver/kcompat.h Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

269
io_engine/include/ps.h Normal file
View File

@ -0,0 +1,269 @@
#ifndef _PS_H_
#define _PS_H_
/* Compile-time limits; MAX_RINGS sizes rx_rings[] in struct ps_context */
#define MAX_DEVICES 16
#define MAX_RINGS 64
/* IN: option for ps_wait(); */
#define PS_CTL_IN 0x1 /* The associated queue is available to read */
#define PS_CTL_OUT 0x2 /* The associated queue is available to write */
/* The associated queue is available to write or read */
#define PS_CTL_INOUT (PS_CTL_IN | PS_CTL_OUT)
/* OUT: return values for ps_wait() */
/* NOTE(review): the two comments below previously said "read" for SEND and
 * "write" for RECEIVE, which contradicts the names - confirm against ps_wait() */
#define PS_SEND_AVAILABLE 0x1 /* The associated queue is available to send (write) */
#define PS_RECEIVE_AVAILABLE 0x2 /* The associated queue is available to receive (read) */
/* The associated queue is available to read and write */
#define PS_ALL_AVAILABLE (PS_SEND_AVAILABLE | PS_RECEIVE_AVAILABLE)
#define PS_SEND_MIN 256
#ifdef __KERNEL__
/* Kernel-only definitions: device major number/name and per-context state */
#define PS_MAJOR 1010
#define PS_NAME "packet_shader"
/* capacity of the per-context buffer tables below */
#define MAX_BUFS (12*4)
/*
 * Per-context state kept by the kernel side.
 * NOTE(review): field roles below are read off the names and types only -
 * confirm against the code that uses them.
 */
struct ____cacheline_aligned ps_context {
struct semaphore sem;
wait_queue_head_t wq;
/* presumably the number of valid entries in rx_rings[] */
int num_attached;
struct ixgbe_ring *rx_rings[MAX_RINGS];
int next_ring;
struct ps_pkt_info *info;
/* char *buf; */
/* presumably the number of valid entries in the three tables below */
int num_bufs;
int buf_refcnt[MAX_BUFS];
/* parallel tables: kernel-side and user-side (__user) buffer pointers */
char *kbufs[MAX_BUFS];
char __user *ubufs[MAX_BUFS];
};
#else /* __KERNEL__ */
#include <string.h>
#include <stdint.h>
#include <pthread.h>
#include <linux/types.h>
/* User-space build: __user is a kernel annotation, so it expands to nothing */
#define __user
/* Fallbacks for constants that are only defined here when missing */
#ifndef IFNAMSIZ
#define IFNAMSIZ 16
#endif
#ifndef ETH_ALEN
#define ETH_ALEN 6
#endif
/*
 * ALIGN(x, a) rounds x up to a multiple of a; the mask arithmetic is only
 * correct when a is a power of two.  Uses the GNU typeof extension.
 */
#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
/*
 * Checksum over `ihl` 32-bit words starting at `iph` (x86 inline assembly).
 *
 * The words are added with carry, the 32-bit sum is folded into 16 bits and
 * then complemented (notl). Note that when ihl <= 4 the code jumps straight
 * to label 2 after loading only the first word, so the result is NOT folded
 * or complemented in that case; callers are expected to pass ihl >= 5.
 *
 * @param iph  start of the header to checksum
 * @param ihl  header length in 32-bit words
 * @return     the 16-bit checksum value
 */
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
unsigned int sum;
asm(" movl (%1), %0\n"
" subl $4, %2\n"
" jbe 2f\n"
" addl 4(%1), %0\n"
" adcl 8(%1), %0\n"
" adcl 12(%1), %0\n"
"1: adcl 16(%1), %0\n"
" lea 4(%1), %1\n"
" decl %2\n"
" jne 1b\n"
" adcl $0, %0\n"
" movl %0, %2\n"
" shrl $16, %0\n"
" addw %w2, %w0\n"
" adcl $0, %0\n"
" notl %0\n"
"2:"
/* Since the input registers which are loaded with iph and ihl
are modified, we must also specify them as outputs, or gcc
will assume they contain their original values. */
: "=r" (sum), "=r" (iph), "=r" (ihl)
: "1" (iph), "2" (ihl)
: "memory");
return (__sum16)sum;
}
#endif /* __KERNEL__ */
/*
 * Description of one network device as reported by the driver
 * (the array passed to ps_list_devices() / PS_IOC_LIST_DEVICES).
 */
struct ps_device {
char name[IFNAMSIZ];
char dev_addr[ETH_ALEN];
uint32_t ip_addr; /* network order */
/* NOTE: this is different from kernel's internal index */
int ifindex;
/* This is kernel's ifindex. */
int kifindex;
int num_rx_queues;
int num_tx_queues;
};
/* Identifies one queue of one device: (device index, queue index). */
struct ps_queue {
int ifindex;
int qidx;
};
#define MAX_PACKET_SIZE 2048
#define MAX_CHUNK_SIZE 4096
#define ENTRY_CNT 4096
#define PS_CHECKSUM_RX_UNKNOWN 0
#define PS_CHECKSUM_RX_GOOD 1
#define PS_CHECKSUM_RX_BAD 2
/* Per-packet metadata stored alongside a chunk's data buffer. */
struct ps_pkt_info {
uint32_t offset; /* byte offset of the packet inside the chunk buffer */
uint16_t len; /* packet length in bytes */
uint8_t checksum_rx; /* one of the PS_CHECKSUM_RX_* values */
};
/*
 * A batch of packets exchanged with the driver in one ioctl.
 * ps_alloc_chunk() allocates `info` (MAX_CHUNK_SIZE entries) and maps `buf`
 * (MAX_PACKET_SIZE * MAX_CHUNK_SIZE bytes).
 */
struct ps_chunk {
/* number of packets to send/recv */
int cnt;
int recv_blocking;
/*
for RX: output (where did these packets come from?)
for TX: input (which interface do you want to xmit?)
*/
struct ps_queue queue;
struct ps_pkt_info __user *info;
char __user *buf;
};
/*
 * Transmit buffer used as a ring of ENTRY_CNT packet slots
 * (see ps_alloc_chunk_buf(), ps_assign_chunk_buf(), ps_send_chunk_buf()).
 */
struct ps_chunk_buf {
uint16_t cnt; /* entries assigned but not yet sent */
uint16_t next_to_use; /* index of the next entry ps_assign_chunk_buf() hands out */
uint16_t next_to_send; /* advanced by ps_send_chunk_buf() after a send */
uint32_t next_offset; /* byte offset in buf for the next assigned packet */
struct ps_queue queue;
void __user *lock; /* user space stores a pthread_mutex_t * here */
struct ps_pkt_info __user *info;
char __user *buf;
};
/* Single packet descriptor passed to ps_slowpath_packet(). */
struct ps_packet {
int ifindex;
int len;
char __user *buf;
};
/*
 * Bit-set helpers for an interface-id set (`isp` is an lvalue such as a
 * nids_set, `id` is the bit position). Every macro argument is parenthesized
 * so that expression arguments (e.g. `a & b`, `c ? x : y`) expand correctly.
 */
#define NID_ZERO(isp) ((isp) = 0)
#define NID_SET(id, isp) ((isp) |= 1 << (id))
#define NID_CLR(id, isp) ((isp) &= ~(1 << (id)))
#define NID_ISSET(id, isp) ((isp) & (1 << (id)))
// maximum number of interface descriptor is 16
typedef uint16_t nids_set;
/* Argument of ps_select() (PS_IOC_SELECT). */
struct ps_event {
long timeout; /* NOTE(review): unit is not visible in this header -- confirm in the driver */
int qidx;
nids_set rx_nids; /* interface-id bit set, manipulated with the NID_* macros */
nids_set tx_nids; /* interface-id bit set, manipulated with the NID_* macros */
};
/* Issue an x86 `prefetcht0` instruction for the address p. */
static inline void prefetcht0(void *p)
{
asm volatile("prefetcht0 (%0)\n\t"
:
: "r" (p)
);
}
/* Issue an x86 `prefetchnta` instruction for the address p. */
static inline void prefetchnta(void *p)
{
asm volatile("prefetchnta (%0)\n\t"
:
: "r" (p)
);
}
/*
 * Copy `len` bytes from `from` to `to` in whole 64-byte units.
 *
 * The amount copied is `len` rounded UP to a multiple of 64, and at least
 * 64 bytes even when len is 0. Both buffers must therefore be valid for the
 * rounded-up size.
 */
static inline void memcpy_aligned(void *to, const void *from, size_t len)
{
	uint8_t *dst = (uint8_t *)to;
	const uint8_t *src = (const uint8_t *)from;
	size_t done = 0;

	/* always move at least one 64-byte unit, then continue until len is covered */
	do {
		memcpy(dst + done, src + done, 64);
		done += 64;
	} while (done < len);
}
#define PS_IOC_LIST_DEVICES 0
#define PS_IOC_ATTACH_RX_DEVICE 1
#define PS_IOC_DETACH_RX_DEVICE 2
#define PS_IOC_RECV_CHUNK 3
#define PS_IOC_SEND_CHUNK 4
#define PS_IOC_SLOWPATH_PACKET 5
#define PS_IOC_RECV_CHUNK_IFIDX 6
#define PS_IOC_SEND_CHUNK_BUF 7
#define PS_IOC_GET_TXENTRY 8
#define PS_IOC_SELECT 9
#ifndef __KERNEL__
/*
 * User-space handle to the packet_shader device. ps_init_handle() zeroes the
 * structure and opens the device into `fd`; the ps_recv_* and ps_send_*
 * helpers in pslib.c update the per-device counters, indexed by interface
 * index.
 */
struct ps_handle {
int fd;
uint64_t rx_chunks[MAX_DEVICES];
uint64_t rx_packets[MAX_DEVICES];
uint64_t rx_bytes[MAX_DEVICES];
uint64_t tx_chunks[MAX_DEVICES];
uint64_t tx_packets[MAX_DEVICES];
uint64_t tx_bytes[MAX_DEVICES];
void *priv;
};
int ps_list_devices(struct ps_device *devices);
int ps_init_handle(struct ps_handle *handle);
void ps_close_handle(struct ps_handle *handle);
int ps_attach_rx_device(struct ps_handle *handle, struct ps_queue *queue);
int ps_detach_rx_device(struct ps_handle *handle, struct ps_queue *queue);
int ps_alloc_chunk(struct ps_handle *handle, struct ps_chunk *chunk);
void ps_free_chunk(struct ps_chunk *chunk);
int ps_alloc_chunk_buf(struct ps_handle *handle,
int ifidx, int qidx, struct ps_chunk_buf *c_buf);
void ps_free_chunk_buf(struct ps_chunk_buf *c_buf);
char* ps_assign_chunk_buf(struct ps_chunk_buf *c_buf, int len);
int ps_recv_chunk(struct ps_handle *handle, struct ps_chunk *chunk);
int ps_recv_chunk_ifidx(struct ps_handle *handle, struct ps_chunk *chunk, int ifidx);
int ps_send_chunk(struct ps_handle *handle, struct ps_chunk *chunk);
int ps_send_chunk_buf(struct ps_handle *handle, struct ps_chunk_buf *chunk);
int ps_select(struct ps_handle *handle, struct ps_event * event);
int ps_get_txentry(struct ps_handle *handle, struct ps_queue * queue);
int ps_slowpath_packet(struct ps_handle *handle, struct ps_packet *packet);
void dump_packet(char *buf, int len);
void dump_chunk(struct ps_chunk *chunk);
int get_num_cpus();
int bind_cpu(int cpu);
uint64_t rdtsc();
#endif
#endif /* _PS_H_ */

10
io_engine/lib/Makefile Normal file
View File

@ -0,0 +1,10 @@
# Builds the user-space packet I/O library (libps.a) from pslib.c.
.PHONY: all clean

all: libps.a

# The target is named after the file it actually produces so that make can
# skip the rebuild when nothing changed; pslib.c includes ../include/ps.h.
libps.a: pslib.c ../include/ps.h
	gcc -O3 -c -g -Wall -o pslib.o pslib.c
	ar rcs libps.a pslib.o

clean:
	rm -f *.o *.a

262
io_engine/lib/pslib.c Normal file
View File

@ -0,0 +1,262 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fcntl.h>
#include <unistd.h>
#include <sched.h>
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include "../include/ps.h"
/*
 * Query the driver for its device list.
 *
 * Opens a temporary handle, issues PS_IOC_LIST_DEVICES with `devices` as the
 * output buffer and closes the handle again.
 *
 * @return the ioctl() result, or -1 if the device could not be opened.
 * NOTE(review): the required size of `devices` is not visible here
 * (presumably MAX_DEVICES entries) -- confirm against the driver.
 */
int ps_list_devices(struct ps_device *devices)
{
struct ps_handle handle;
int ret;
if (ps_init_handle(&handle))
return -1;
ret = ioctl(handle.fd, PS_IOC_LIST_DEVICES, devices);
ps_close_handle(&handle);
return ret;
}
/*
 * Reset `handle` (all counters to zero) and open /dev/packet_shader.
 *
 * @return 0 on success, -1 if open() failed (handle->fd is then -1).
 */
int ps_init_handle(struct ps_handle *handle)
{
	memset(handle, 0, sizeof(struct ps_handle));

	handle->fd = open("/dev/packet_shader", O_RDWR);

	return (handle->fd == -1) ? -1 : 0;
}
/* Close the device descriptor and mark the handle as closed (fd = -1). */
void ps_close_handle(struct ps_handle *handle)
{
close(handle->fd);
handle->fd = -1;
}
/* Issue PS_IOC_ATTACH_RX_DEVICE for `queue`; returns the ioctl() result. */
int ps_attach_rx_device(struct ps_handle *handle, struct ps_queue *queue)
{
return ioctl(handle->fd, PS_IOC_ATTACH_RX_DEVICE, queue);
}
/* Issue PS_IOC_DETACH_RX_DEVICE for `queue`; returns the ioctl() result. */
int ps_detach_rx_device(struct ps_handle *handle, struct ps_queue *queue)
{
return ioctl(handle->fd, PS_IOC_DETACH_RX_DEVICE, queue);
}
/*
 * Allocate the resources of a chunk: the per-packet info array
 * (MAX_CHUNK_SIZE entries) and the packet buffer, which is mmap()ed from the
 * device (MAX_PACKET_SIZE * MAX_CHUNK_SIZE bytes).
 *
 * On failure nothing stays allocated and chunk->info / chunk->buf are NULL.
 *
 * @return 0 on success, -1 on allocation or mapping failure.
 */
int ps_alloc_chunk(struct ps_handle *handle, struct ps_chunk *chunk)
{
	void *buf;

	memset(chunk, 0, sizeof(*chunk));

	chunk->info = malloc(sizeof(*chunk->info) * MAX_CHUNK_SIZE);
	if (!chunk->info)
		return -1;

	buf = mmap(NULL, MAX_PACKET_SIZE * MAX_CHUNK_SIZE,
			PROT_READ | PROT_WRITE, MAP_SHARED,
			handle->fd, 0);
	if (buf == MAP_FAILED) {
		/* do not leak the info array when the mapping fails */
		free(chunk->info);
		chunk->info = NULL;
		return -1;
	}

	chunk->buf = (char *)buf;
	return 0;
}
/*
 * Release what ps_alloc_chunk() set up: unmap the packet buffer, free the
 * info array, and clear both pointers.
 */
void ps_free_chunk(struct ps_chunk *chunk)
{
	struct ps_pkt_info *info = chunk->info;
	char *buf = chunk->buf;

	chunk->info = NULL;
	chunk->buf = NULL;

	free(info);
	munmap(buf, MAX_PACKET_SIZE * MAX_CHUNK_SIZE);
}
/*
 * Set up a transmit ring buffer for queue (ifidx, qidx): the info array
 * (ENTRY_CNT entries), the device-mapped packet buffer
 * (MAX_PACKET_SIZE * ENTRY_CNT bytes) and a heap-allocated mutex.
 *
 * Every failure path releases whatever was already acquired, so on error
 * c_buf holds no resources (info, buf and lock are NULL).
 *
 * @return 0 on success, -1 on failure.
 */
int ps_alloc_chunk_buf(struct ps_handle *handle,
		int ifidx, int qidx, struct ps_chunk_buf *c_buf)
{
	pthread_mutex_t *lock = NULL;
	void *buf = MAP_FAILED;

	/* also zeroes cnt, next_to_use, next_to_send and next_offset */
	memset(c_buf, 0, sizeof(*c_buf));

	c_buf->info = malloc(sizeof(*c_buf->info) * ENTRY_CNT);
	if (!c_buf->info)
		goto fail;

	buf = mmap(NULL, MAX_PACKET_SIZE * ENTRY_CNT,
			PROT_READ | PROT_WRITE, MAP_SHARED,
			handle->fd, 0);
	if (buf == MAP_FAILED)
		goto fail;

	lock = malloc(sizeof(*lock));
	if (!lock)
		goto fail;

	if (pthread_mutex_init(lock, NULL)) {
		perror("pthread_mutex_init of c_buf->lock\n");
		goto fail;
	}

	c_buf->buf = (char *)buf;
	c_buf->lock = lock;
	c_buf->queue.ifindex = ifidx;
	c_buf->queue.qidx = qidx;

	return 0;

fail:
	free(lock);
	if (buf != MAP_FAILED)
		munmap(buf, MAX_PACKET_SIZE * ENTRY_CNT);
	free(c_buf->info);
	c_buf->info = NULL;
	return -1;
}
/*
 * Release everything ps_alloc_chunk_buf() acquired: the info array, the
 * mapped packet buffer and the mutex (which is destroyed before its storage
 * is freed). All three pointers are cleared afterwards.
 */
void ps_free_chunk_buf(struct ps_chunk_buf *c_buf)
{
	free(c_buf->info);
	munmap(c_buf->buf, MAX_PACKET_SIZE * ENTRY_CNT);

	/* lock may be NULL if allocation failed before it was created */
	if (c_buf->lock) {
		pthread_mutex_destroy((pthread_mutex_t *)c_buf->lock);
		free(c_buf->lock);
	}

	c_buf->info = NULL;
	c_buf->buf = NULL;
	c_buf->lock = NULL;
}
char* ps_assign_chunk_buf(struct ps_chunk_buf *c_buf, int len) {
int w_idx;
if (c_buf->cnt >= ENTRY_CNT)
return NULL;
pthread_mutex_lock(c_buf->lock);
w_idx = c_buf->next_to_use;
c_buf->cnt++;
c_buf->info[w_idx].len = len;
c_buf->info[w_idx].offset = c_buf->next_offset;
c_buf->next_offset += (len + 63) / 64 * 64;
c_buf->next_to_use = (w_idx + 1) % ENTRY_CNT;
if(c_buf->next_to_use == 0)
c_buf->next_offset = 0;
pthread_mutex_unlock(c_buf->lock);
return c_buf->buf + c_buf->info[w_idx].offset;
}
/*
 * Receive a chunk of packets (PS_IOC_RECV_CHUNK) and, when at least one
 * packet arrived, account it in the handle's RX counters of the interface
 * the driver reported in chunk->queue.ifindex.
 *
 * @return the ioctl() result (number of packets when positive).
 */
int ps_recv_chunk(struct ps_handle *handle, struct ps_chunk *chunk)
{
	int received = ioctl(handle->fd, PS_IOC_RECV_CHUNK, chunk);
	uint64_t bytes = 0;
	int dev;
	int k;

	if (received <= 0)
		return received;

	dev = chunk->queue.ifindex;
	for (k = 0; k < received; k++)
		bytes += chunk->info[k].len;

	handle->rx_chunks[dev] += 1;
	handle->rx_packets[dev] += received;
	handle->rx_bytes[dev] += bytes;

	return received;
}
/*
 * Like ps_recv_chunk(), but first stores `ifidx` in chunk->queue.ifindex and
 * uses PS_IOC_RECV_CHUNK_IFIDX. RX counters are updated for the interface
 * index found in the chunk after the ioctl returns.
 *
 * @return the ioctl() result (number of packets when positive).
 */
int ps_recv_chunk_ifidx(struct ps_handle *handle, struct ps_chunk *chunk, int ifidx)
{
	uint64_t bytes = 0;
	int received;
	int dev;
	int k;

	chunk->queue.ifindex = ifidx;
	received = ioctl(handle->fd, PS_IOC_RECV_CHUNK_IFIDX, chunk);
	if (received <= 0)
		return received;

	dev = chunk->queue.ifindex;
	for (k = 0; k < received; k++)
		bytes += chunk->info[k].len;

	handle->rx_chunks[dev] += 1;
	handle->rx_packets[dev] += received;
	handle->rx_bytes[dev] += bytes;

	return received;
}
/*
 * Send the given chunk to the modified driver (PS_IOC_SEND_CHUNK) and, when
 * at least one packet was accepted, account it in the TX counters of
 * chunk->queue.ifindex.
 *
 * @return the ioctl() result (number of packets sent when positive).
 */
int ps_send_chunk(struct ps_handle *handle, struct ps_chunk *chunk)
{
	int sent = ioctl(handle->fd, PS_IOC_SEND_CHUNK, chunk);
	uint64_t bytes = 0;
	int dev;
	int k;

	if (sent <= 0)
		return sent;

	dev = chunk->queue.ifindex;
	for (k = 0; k < sent; k++)
		bytes += chunk->info[k].len;

	handle->tx_chunks[dev] += 1;
	handle->tx_packets[dev] += sent;
	handle->tx_bytes[dev] += bytes;

	return sent;
}
/*
 * Send the pending entries of a ring buffer to the modified driver
 * (PS_IOC_SEND_CHUNK_BUF).
 *
 * When the driver accepts `cnt` packets, the TX counters are updated and the
 * ring's read position (next_to_send) and pending count are advanced. The
 * byte counter sums the lengths of the entries that were actually sent,
 * i.e. the `cnt` ring entries starting at the old next_to_send, not
 * entries 0..cnt-1.
 *
 * @return 0 if nothing is pending, otherwise the ioctl() result.
 */
int ps_send_chunk_buf(struct ps_handle *handle, struct ps_chunk_buf *c_buf)
{
	int cnt;

	/* unlocked fast path: nothing queued */
	if (c_buf->cnt <= 0)
		return 0;

	pthread_mutex_lock(c_buf->lock);

	cnt = ioctl(handle->fd, PS_IOC_SEND_CHUNK_BUF, c_buf);
	if (cnt > 0) {
		int i;
		int ifindex = c_buf->queue.ifindex;
		int first = c_buf->next_to_send;

		handle->tx_chunks[ifindex]++;
		handle->tx_packets[ifindex] += cnt;
		for (i = 0; i < cnt; i++)
			handle->tx_bytes[ifindex] +=
				c_buf->info[(first + i) % ENTRY_CNT].len;

		c_buf->cnt -= cnt;
		c_buf->next_to_send = (first + cnt) % ENTRY_CNT;
	}

	pthread_mutex_unlock(c_buf->lock);

	return cnt;
}
/* Issue PS_IOC_SELECT with `event`; returns the ioctl() result. */
int ps_select(struct ps_handle *handle, struct ps_event * event)
{
return ioctl(handle->fd, PS_IOC_SELECT, event);
}
/*
 * Get the remaining number of tx entries in a tx ring.
 * Issues PS_IOC_GET_TXENTRY for `queue` and returns the ioctl() result.
 */
int ps_get_txentry(struct ps_handle *handle, struct ps_queue *queue)
{
return ioctl(handle->fd, PS_IOC_GET_TXENTRY, queue);
}
/* Issue PS_IOC_SLOWPATH_PACKET with `packet`; returns the ioctl() result. */
int ps_slowpath_packet(struct ps_handle *handle, struct ps_packet *packet)
{
return ioctl(handle->fd, PS_IOC_SLOWPATH_PACKET, packet);
}

73
mtcp/src/Makefile Normal file
View File

@ -0,0 +1,73 @@
# Builds the mTCP static library ($(MTCP_LIB_DIR)/$(MTCP_LIB)) from the
# sources listed in SRCS and copies the public headers into $(MTCP_HDR_DIR).
.PHONY: clean
### TARGET ###
MTCP_LIB_DIR=../lib
MTCP_LIB=libmtcp.a
MTCP_HDR_DIR=../include
MTCP_HDR = mtcp_api.h mtcp_epoll.h
### GCC ###
GCC = gcc
GCC_OPT = -m64 -Wall
# Uncomment one of the DBG_OPT lines below to enable debug output categories.
#DBG_OPT = -DDBGMSG -DDBGFUNC -DSTREAM -DSTATE -DTSTAT -DAPP -DEPOLL
#DBG_OPT = -DDBGMSG -DDBGFUNC -DSTREAM -DSTATE
#DBG_OPT += -DPKTDUMP
#DBG_OPT += -DDUMP_STREAM
#GCC_OPT += -g -DNETSTAT -DINFO -DDBGERR -DDBGCERR
# NOTE: -DNDEBUG disables every assert() in the sources.
GCC_OPT += -DNDEBUG -O3 -DNETSTAT -DINFO -DDBGERR -DDBGCERR
GCC_OPT += $(DBG_OPT)
### LIBRARIES AND INCLUDES ###
INC_DIR=./include
INC= -I$(INC_DIR)
LIBS = -lnuma -lpthread -lrt
LIBS += -lps
# PacketShader LIBRARY and HEADER
PS_DIR=../../../io_engine/io_engine-2.0.38.2
LIB_DIR = -L$(PS_DIR)/lib
INC += -I$(PS_DIR)/include
# Define HUGEPAGE only when /usr/lib/libhugetlbfs.so exists.
ifeq ($(wildcard /usr/lib/libhugetlbfs.so),)
else
GCC_OPT += -DHUGEPAGE
endif
### SOURCE CODE ###
SRCS = core.c tcp_stream.c config.c api.c eventpoll.c socket.c pipe.c \
tcp_util.c eth_in.c ip_in.c tcp_in.c eth_out.c ip_out.c tcp_out.c \
arp.c timer.c cpu.c rss.c addr_pool.c fhash.c memory_mgt.c logger.c debug.c \
tcp_rb_frag_queue.c tcp_ring_buffer.c tcp_send_buffer.c tcp_sb_queue.c tcp_stream_queue.c
OBJS = $(patsubst %.c,%.o,$(SRCS))
DEPS = $(patsubst %.c,.%.d,$(SRCS))
### GOALS ###
# Default goal: compile all objects, copy the public headers, archive the library.
default: $(OBJS) $(MTCP_HDR)
ar rvs $(MTCP_LIB_DIR)/$(MTCP_LIB) $(OBJS)
$(OBJS): %.o: %.c Makefile
$(GCC) $(GCC_OPT) $(INC) -c $< -o $@
# Per-source dependency files (.name.d) generated with -MM and included below.
$(DEPS): .%.d: %.c Makefile
$(GCC) $(GCC_OPT) $(INC) -MM $(CFLAGS) $< > $@
-include $(DEPS)
# Copies each public header from $(INC_DIR) to $(MTCP_HDR_DIR).
$(MTCP_HDR):
cp $(INC_DIR)/$@ $(MTCP_HDR_DIR)/$@
clean: clean-library
rm -f *.o *~ core
rm -f .*.d
# WARNING: removes every file in $(MTCP_LIB_DIR) and $(MTCP_HDR_DIR).
clean-library:
rm -f $(MTCP_LIB_DIR)/*
rm -f $(MTCP_HDR_DIR)/*

302
mtcp/src/addr_pool.c Normal file
View File

@ -0,0 +1,302 @@
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "addr_pool.h"
#include "rss.h"
#include "debug.h"
#define MIN_PORT (1025)
#define MAX_PORT (65535 + 1)
/*----------------------------------------------------------------------------*/
/* One (IP address, port) pair; linked into either the free or the used list. */
struct addr_entry
{
struct sockaddr_in addr;
TAILQ_ENTRY(addr_entry) addr_link;
};
/*----------------------------------------------------------------------------*/
/*
 * Per-address lookup table: addrmap[port] (port in host order) points to the
 * pool entry for that port, or is NULL when no entry was created for it.
 */
struct addr_map
{
struct addr_entry *addrmap[MAX_PORT];
};
/*----------------------------------------------------------------------------*/
/*
 * Pool of local (address, port) pairs. Entries live in `pool`, are indexed
 * through `mapper` (one addr_map per address) and sit on exactly one of
 * free_list / used_list. `lock` protects the lists and counters.
 */
struct addr_pool
{
struct addr_entry *pool; /* address pool */
struct addr_map *mapper; /* address map */
uint32_t addr_base; /* in host order */
int num_addr; /* number of addresses in use */
int num_entry;
int num_free;
int num_used;
pthread_mutex_t lock;
TAILQ_HEAD(, addr_entry) free_list;
TAILQ_HEAD(, addr_entry) used_list;
};
/*----------------------------------------------------------------------------*/
/*
 * Create an address pool covering `num_addr` consecutive IP addresses
 * starting at `addr_base` (network order), with every port in
 * [MIN_PORT, MAX_PORT) for each address. All entries start on the free list.
 *
 * @return the new pool, or NULL if num_addr is not positive or an
 *         allocation / mutex initialization fails (nothing is leaked).
 *
 * NOTE(review): num_addr * (MAX_PORT - MIN_PORT) is computed in `int`; very
 * large num_addr values would overflow -- callers are presumably expected to
 * pass small counts.
 */
addr_pool_t
CreateAddressPool(in_addr_t addr_base, int num_addr)
{
	struct addr_pool *ap;
	int num_entry;
	int i, j, cnt;
	in_addr_t addr;
	uint32_t addr_h;

	if (num_addr <= 0)
		return NULL;

	ap = (addr_pool_t)calloc(1, sizeof(struct addr_pool));
	if (!ap)
		return NULL;

	/* initialize address pool */
	num_entry = num_addr * (MAX_PORT - MIN_PORT);
	ap->pool = (struct addr_entry *)calloc(num_entry, sizeof(struct addr_entry));
	if (!ap->pool) {
		free(ap);
		return NULL;
	}

	/* initialize address map */
	ap->mapper = (struct addr_map *)calloc(num_addr, sizeof(struct addr_map));
	if (!ap->mapper) {
		free(ap->pool);
		free(ap);
		return NULL;
	}

	TAILQ_INIT(&ap->free_list);
	TAILQ_INIT(&ap->used_list);

	if (pthread_mutex_init(&ap->lock, NULL)) {
		/* release the mapper as well, it was leaked before */
		free(ap->mapper);
		free(ap->pool);
		free(ap);
		return NULL;
	}

	pthread_mutex_lock(&ap->lock);

	ap->addr_base = ntohl(addr_base);
	ap->num_addr = num_addr;

	cnt = 0;
	for (i = 0; i < num_addr; i++) {
		addr_h = ap->addr_base + i;
		addr = htonl(addr_h);
		for (j = MIN_PORT; j < MAX_PORT; j++) {
			ap->pool[cnt].addr.sin_addr.s_addr = addr;
			ap->pool[cnt].addr.sin_port = htons(j);
			ap->mapper[i].addrmap[j] = &ap->pool[cnt];

			TAILQ_INSERT_TAIL(&ap->free_list, &ap->pool[cnt], addr_link);

			if ((++cnt) >= num_entry)
				break;
		}
	}
	ap->num_entry = cnt;
	ap->num_free = cnt;
	ap->num_used = 0;

	pthread_mutex_unlock(&ap->lock);

	return ap;
}
/*----------------------------------------------------------------------------*/
addr_pool_t
CreateAddressPoolPerCore(int core, int num_queues,
in_addr_t saddr_base, int num_addr, in_addr_t daddr, in_port_t dport)
{
struct addr_pool *ap;
int num_entry;
int i, j, cnt;
in_addr_t saddr;
uint32_t saddr_h, daddr_h;
uint16_t sport_h, dport_h;
int rss_core;
ap = (addr_pool_t)calloc(1, sizeof(struct addr_pool));
if (!ap)
return NULL;
/* initialize address pool */
num_entry = (num_addr * (MAX_PORT - MIN_PORT)) / num_queues;
ap->pool = (struct addr_entry *)calloc(num_entry, sizeof(struct addr_entry));
if (!ap->pool) {
free(ap);
return NULL;
}
/* initialize address map */
ap->mapper = (struct addr_map *)calloc(num_addr, sizeof(struct addr_map));
if (!ap->mapper) {
free(ap->pool);
free(ap);
return NULL;
}
TAILQ_INIT(&ap->free_list);
TAILQ_INIT(&ap->used_list);
if (pthread_mutex_init(&ap->lock, NULL)) {
free(ap->pool);
free(ap);
return NULL;
}
pthread_mutex_lock(&ap->lock);
ap->addr_base = ntohl(saddr_base);
ap->num_addr = num_addr;
daddr_h = ntohl(daddr);
dport_h = ntohs(dport);
/* search address space to get RSS-friendly addresses */
cnt = 0;
for (i = 0; i < num_addr; i++) {
saddr_h = ap->addr_base + i;
saddr = htonl(saddr_h);
for (j = MIN_PORT; j < MAX_PORT; j++) {
if (cnt >= num_entry)
break;
sport_h = j;
rss_core = GetRSSCPUCore(daddr_h, saddr_h, dport_h, sport_h, num_queues);
if (rss_core != core)
continue;
ap->pool[cnt].addr.sin_addr.s_addr = saddr;
ap->pool[cnt].addr.sin_port = htons(sport_h);
ap->mapper[i].addrmap[j] = &ap->pool[cnt];
TAILQ_INSERT_TAIL(&ap->free_list, &ap->pool[cnt], addr_link);
cnt++;
}
}
ap->num_entry = cnt;
ap->num_free = cnt;
ap->num_used = 0;
//fprintf(stderr, "CPU %d: Created %d address entries.\n", core, cnt);
if (ap->num_entry < CONFIG.max_concurrency) {
fprintf(stderr, "[WARINING] Available # addresses (%d) is smaller than"
" the max concurrency (%d).\n",
ap->num_entry, CONFIG.max_concurrency);
}
pthread_mutex_unlock(&ap->lock);
return ap;
}
/*----------------------------------------------------------------------------*/
/*
 * Free an address pool: its entry array, its lookup map, its mutex and the
 * pool object itself. A NULL pool is ignored.
 */
void
DestroyAddressPool(addr_pool_t ap)
{
	if (ap == NULL)
		return;

	/* free(NULL) is a no-op, so no guards are needed */
	free(ap->pool);
	ap->pool = NULL;

	free(ap->mapper);
	ap->mapper = NULL;

	pthread_mutex_destroy(&ap->lock);

	free(ap);
}
/*----------------------------------------------------------------------------*/
/*
 * Take a free (address, port) pair whose RSS core, computed with the
 * destination `daddr`, equals `core`. The chosen pair is copied to *saddr
 * and moved from the free list to the used list.
 *
 * NOTE(review): GetRSSCPUCore() is called here with (local, remote)
 * argument order, whereas CreateAddressPoolPerCore() passes
 * (remote, local). Whether that is intentional depends on rss.c, which is
 * not visible here -- confirm.
 *
 * @return 0 on success, -1 on NULL arguments or when no entry matches.
 */
int
FetchAddress(addr_pool_t ap, int core, int num_queues,
		const struct sockaddr_in *daddr, struct sockaddr_in *saddr)
{
	struct addr_entry *cand;
	int status = -1;

	if (ap == NULL || daddr == NULL || saddr == NULL)
		return -1;

	pthread_mutex_lock(&ap->lock);

	/* cand is NULL after the loop when no free entry maps to this core */
	TAILQ_FOREACH(cand, &ap->free_list, addr_link) {
		int rss_core = GetRSSCPUCore(ntohl(cand->addr.sin_addr.s_addr),
				ntohl(daddr->sin_addr.s_addr), ntohs(cand->addr.sin_port),
				ntohs(daddr->sin_port), num_queues);

		if (rss_core == core)
			break;
	}

	if (cand != NULL) {
		*saddr = cand->addr;
		TAILQ_REMOVE(&ap->free_list, cand, addr_link);
		TAILQ_INSERT_TAIL(&ap->used_list, cand, addr_link);
		ap->num_free -= 1;
		ap->num_used += 1;
		status = 0;
	}

	pthread_mutex_unlock(&ap->lock);

	return status;
}
/*----------------------------------------------------------------------------*/
/*
 * Return a previously fetched (address, port) pair to the pool: the entry is
 * moved from the used list back to the free list.
 *
 * With a mapper the entry is looked up directly; the address must lie inside
 * [addr_base, addr_base + num_addr), otherwise the request is rejected
 * instead of indexing the mapper out of bounds. Without a mapper the used
 * list is searched linearly.
 *
 * NOTE(review): the mapper path does not verify that the entry is currently
 * on the used list; freeing the same address twice would corrupt the lists.
 *
 * @return 0 on success, -1 on NULL arguments or unknown address.
 */
int
FreeAddress(addr_pool_t ap, const struct sockaddr_in *addr)
{
	struct addr_entry *walk, *next;
	int ret = -1;

	if (!ap || !addr)
		return -1;

	pthread_mutex_lock(&ap->lock);

	if (ap->mapper) {
		uint32_t addr_h = ntohl(addr->sin_addr.s_addr);
		uint16_t port_h = ntohs(addr->sin_port);
		int index = addr_h - ap->addr_base;

		/* both bounds must hold (the original used ||, which is always true) */
		if (index >= 0 && index < ap->num_addr) {
			walk = ap->mapper[index].addrmap[port_h];
		} else {
			walk = NULL;
		}

	} else {
		walk = TAILQ_FIRST(&ap->used_list);
		while (walk) {
			next = TAILQ_NEXT(walk, addr_link);
			if (addr->sin_port == walk->addr.sin_port &&
					addr->sin_addr.s_addr == walk->addr.sin_addr.s_addr) {
				break;
			}

			walk = next;
		}
	}

	if (walk) {
		TAILQ_REMOVE(&ap->used_list, walk, addr_link);
		TAILQ_INSERT_TAIL(&ap->free_list, walk, addr_link);
		ap->num_free++;
		ap->num_used--;
		ret = 0;
	}

	pthread_mutex_unlock(&ap->lock);

	return ret;
}
/*----------------------------------------------------------------------------*/

1476
mtcp/src/api.c Normal file

File diff suppressed because it is too large Load Diff

336
mtcp/src/arp.c Normal file
View File

@ -0,0 +1,336 @@
#include <stdint.h>
#include <sys/types.h>
#include "mtcp.h"
#include "arp.h"
#include "eth_out.h"
#include "debug.h"
#define ARP_LEN 28
#define ARP_HEAD_LEN 8
/*----------------------------------------------------------------------------*/
/* Values for the ARP hardware-type field (ar_hrd). */
enum arp_hrd_format
{
arp_hrd_ethernet = 1
};
/*----------------------------------------------------------------------------*/
/* Values for the ARP opcode field (ar_op). */
enum arp_opcode
{
arp_op_request = 1,
arp_op_reply = 2,
};
/*----------------------------------------------------------------------------*/
/*
 * ARP message for Ethernet/IPv4 as it appears on the wire (packed, so no
 * padding is inserted). Multi-byte header fields are written with htons()
 * by ARPOutput().
 */
struct arphdr
{
uint16_t ar_hrd; /* hardware address format */
uint16_t ar_pro; /* protocol address format */
uint8_t ar_hln; /* hardware address length */
uint8_t ar_pln; /* protocol address length */
uint16_t ar_op; /* arp opcode */
uint8_t ar_sha[ETH_ALEN]; /* sender hardware address */
uint32_t ar_sip; /* sender ip address */
uint8_t ar_tha[ETH_ALEN]; /* target hardware address */
uint32_t ar_tip; /* target ip address */
} __attribute__ ((packed));
/*----------------------------------------------------------------------------*/
/* One outstanding ARP request, queued by RequestARP(). */
struct arp_queue_entry
{
uint32_t ip; /* IP address being resolved */
int nif_out; /* interface the request was sent on */
uint32_t ts_out; /* timestamp passed to RequestARP() */
TAILQ_ENTRY(arp_queue_entry) arp_link;
};
/*----------------------------------------------------------------------------*/
/* List of outstanding ARP requests. */
struct arp_manager
{
TAILQ_HEAD (, arp_queue_entry) list;
int cnt; /* NOTE(review): not updated by any code visible in this file */
};
/*----------------------------------------------------------------------------*/
struct arp_manager arpm;
/*----------------------------------------------------------------------------*/
void
DumpARPPacket(struct arphdr *arph);
/*----------------------------------------------------------------------------*/
/*
 * Allocate an empty ARP table with room for MAX_ARPENTRY entries and
 * initialize the list of pending ARP requests.
 *
 * @return 0 on success, -1 if the table could not be allocated.
 */
int
InitARPTable()
{
	struct arp_entry *table;

	CONFIG.arp.entries = 0;

	table = (struct arp_entry *)calloc(MAX_ARPENTRY, sizeof(struct arp_entry));
	CONFIG.arp.entry = table;
	if (!table) {
		perror("calloc");
		return -1;
	}

	TAILQ_INIT(&arpm.list);

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Look up the hardware address of the local interface configured with `ip`.
 *
 * @return pointer to the interface's hardware address inside CONFIG.eths,
 *         or NULL if no local interface has that IP address.
 */
unsigned char *
GetHWaddr(uint32_t ip)
{
	int idx = 0;

	while (idx < CONFIG.eths_num) {
		if (CONFIG.eths[idx].ip_addr == ip)
			return CONFIG.eths[idx].haddr;
		idx++;
	}

	return NULL;
}
/*----------------------------------------------------------------------------*/
/*
 * Find the hardware address to use for destination IP `dip` by scanning the
 * ARP table. Entries whose prefix field equals 1 are compared for an exact
 * IP match and end the search immediately; all other entries are matched by
 * mask, and the matching entry with the largest prefix wins.
 *
 * NOTE(review): treating `prefix == 1` as "exact match" looks unusual (32
 * would be the natural value, and RegisterARPEntry() stores 32) -- confirm
 * the intended encoding.
 *
 * @return pointer to the hardware address inside the table, or NULL.
 */
unsigned char *
GetDestinationHWaddr(uint32_t dip)
{
	unsigned char *best = NULL;
	int best_prefix = 0;
	int idx;

	/* Longest prefix matching */
	for (idx = 0; idx < CONFIG.arp.entries; idx++) {
		if (CONFIG.arp.entry[idx].prefix == 1) {
			if (CONFIG.arp.entry[idx].ip != dip)
				continue;
			return CONFIG.arp.entry[idx].haddr;
		}

		if ((dip & CONFIG.arp.entry[idx].ip_mask) !=
				CONFIG.arp.entry[idx].ip_masked)
			continue;

		if (CONFIG.arp.entry[idx].prefix > best_prefix) {
			best = CONFIG.arp.entry[idx].haddr;
			best_prefix = CONFIG.arp.entry[idx].prefix;
		}
	}

	return best;
}
/*----------------------------------------------------------------------------*/
/*
 * Build an ARP message with the given opcode on interface `nif`.
 *
 * The frame buffer is obtained from EthernetOutput(); the sender fields are
 * taken from CONFIG.eths[nif], the target fields from dst_ip / dst_haddr.
 *
 * @return 0 on success, -1 if dst_haddr is NULL or no buffer is available.
 */
static int
ARPOutput(struct mtcp_manager *mtcp, int nif, int opcode,
		uint32_t dst_ip, unsigned char *dst_haddr)
{
	struct arphdr *hdr;

	/* a target hardware address is mandatory */
	if (dst_haddr == NULL)
		return -1;

	/* Allocate a buffer */
	hdr = (struct arphdr *)EthernetOutput(mtcp,
			ETH_P_ARP, nif, dst_haddr, sizeof(struct arphdr));
	if (hdr == NULL)
		return -1;

	/* fixed header fields */
	hdr->ar_hrd = htons(arp_hrd_ethernet);
	hdr->ar_pro = htons(ETH_P_IP);
	hdr->ar_hln = ETH_ALEN;
	hdr->ar_pln = 4;
	hdr->ar_op = htons(opcode);

	/* sender side */
	hdr->ar_sip = CONFIG.eths[nif].ip_addr;
	memcpy(hdr->ar_sha, CONFIG.eths[nif].haddr, hdr->ar_hln);

	/* target side */
	hdr->ar_tip = dst_ip;
	memcpy(hdr->ar_tha, dst_haddr, hdr->ar_hln);

#if DBGMSG
	DumpARPPacket(hdr);
#endif

	return 0;
}
/*----------------------------------------------------------------------------*/
int
RegisterARPEntry(uint32_t ip, const unsigned char *haddr)
{
int idx = CONFIG.arp.entries;
CONFIG.arp.entry[idx].prefix = 32;
CONFIG.arp.entry[idx].ip = ip;
memcpy(CONFIG.arp.entry[idx].haddr, haddr, ETH_ALEN);
CONFIG.arp.entry[idx].ip_mask = -1;
CONFIG.arp.entry[idx].ip_masked = ip;
CONFIG.arp.entries = idx + 1;
TRACE_CONFIG("Learned new arp entry.\n");
PrintARPTable();
return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Start resolving `ip` on interface `nif` unless a request for the same IP
 * is already pending: the request is recorded in the pending list and an
 * ARP request is broadcast.
 *
 * If the bookkeeping entry cannot be allocated, nothing is sent and the
 * function returns (the original code dereferenced the NULL pointer).
 */
void
RequestARP(mtcp_manager_t mtcp, uint32_t ip, int nif, uint32_t cur_ts)
{
	struct arp_queue_entry *ent;
	unsigned char haddr[ETH_ALEN];

	/* if the arp request is in progress, return */
	TAILQ_FOREACH(ent, &arpm.list, arp_link) {
		if (ent->ip == ip)
			return;
	}

	ent = (struct arp_queue_entry *)calloc(1, sizeof(struct arp_queue_entry));
	if (!ent) {
		perror("calloc");
		return;
	}
	ent->ip = ip;
	ent->nif_out = nif;
	ent->ts_out = cur_ts;
	TAILQ_INSERT_TAIL(&arpm.list, ent, arp_link);

	/* else, broadcast arp request */
	memset(haddr, 0xFF, ETH_ALEN);
	ARPOutput(mtcp, nif, arp_op_request, ip, haddr);
}
/*----------------------------------------------------------------------------*/
/*
 * Handle an incoming ARP request: learn the sender's address if it is not
 * yet resolvable, then answer with an ARP reply on interface `nif`.
 *
 * @return always 0.
 */
static int
ProcessARPRequest(mtcp_manager_t mtcp,
		struct arphdr *arph, int nif, uint32_t cur_ts)
{
	/* register the arp entry if not exist */
	if (GetDestinationHWaddr(arph->ar_sip) == NULL)
		RegisterARPEntry(arph->ar_sip, arph->ar_sha);

	/* send arp reply */
	ARPOutput(mtcp, nif, arp_op_reply, arph->ar_sip, arph->ar_sha);

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Handle an incoming ARP reply: learn the sender's address if it is not yet
 * resolvable and drop the matching entry from the pending-request list.
 *
 * RequestARP() queues the IP being resolved; in the reply that IP is the
 * SENDER address (ar_sip), while ar_tip is our own address. The pending
 * entry is therefore matched against ar_sip.
 *
 * @return always 0.
 */
static int
ProcessARPReply(mtcp_manager_t mtcp, struct arphdr *arph, uint32_t cur_ts)
{
	unsigned char *temp;
	struct arp_queue_entry *ent;

	/* register the arp entry if not exist */
	temp = GetDestinationHWaddr(arph->ar_sip);
	if (!temp) {
		RegisterARPEntry(arph->ar_sip, arph->ar_sha);
	}

	/* remove from the arp request queue */
	TAILQ_FOREACH(ent, &arpm.list, arp_link) {
		if (ent->ip == arph->ar_sip) {
			TAILQ_REMOVE(&arpm.list, ent, arp_link);
			free(ent);
			break;
		}
	}

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Entry point for received ARP frames. `pkt_data` points at the Ethernet
 * header and `len` is the frame length in bytes.
 *
 * Frames too short to hold an Ethernet header plus a complete ARP message
 * are ignored, as are messages whose target IP is not one of the local
 * interfaces. Requests and replies are dispatched to their handlers.
 *
 * @return always TRUE.
 */
int
ProcessARPPacket(mtcp_manager_t mtcp, uint32_t cur_ts,
		const int ifidx, unsigned char *pkt_data, int len)
{
	struct arphdr *arph;
	int i;
	int to_me = FALSE;

	/* never read the ARP header beyond the end of a truncated frame */
	if (len < (int)(sizeof(struct ethhdr) + sizeof(struct arphdr)))
		return TRUE;

	arph = (struct arphdr *)(pkt_data + sizeof(struct ethhdr));

	/* process the arp messages destined to me */
	for (i = 0; i < CONFIG.eths_num; i++) {
		if (arph->ar_tip == CONFIG.eths[i].ip_addr) {
			to_me = TRUE;
			break;
		}
	}

	if (!to_me)
		return TRUE;

#if DBGMSG
	DumpARPPacket(arph);
#endif

	switch (ntohs(arph->ar_op)) {
		case arp_op_request:
			ProcessARPRequest(mtcp, arph, ifidx, cur_ts);
			break;

		case arp_op_reply:
			ProcessARPReply(mtcp, arph, cur_ts);
			break;

		default:
			break;
	}

	return TRUE;
}
/*----------------------------------------------------------------------------*/
// Publish my address
/*
 * Calls ARPOutput() once per configured interface with opcode request,
 * target IP 0 and a NULL target hardware address.
 *
 * NOTE(review): ARPOutput() returns -1 immediately when dst_haddr is NULL,
 * so as written this function does not send anything. It also passes
 * CONFIG.eths[i].ifindex as `nif`, while ARPOutput() uses `nif` as an index
 * into CONFIG.eths. Confirm the intended behavior before changing it.
 */
void
PublishARP(mtcp_manager_t mtcp)
{
int i;
for (i = 0; i < CONFIG.eths_num; i++) {
ARPOutput(mtcp, CONFIG.eths[i].ifindex, arp_op_request, 0, NULL);
}
}
/*----------------------------------------------------------------------------*/
void
PrintARPTable()
{
int i;
/* print out process start information */
TRACE_CONFIG("ARP Table:\n");
for (i = 0; i < CONFIG.arp.entries; i++) {
uint8_t *da = (uint8_t *)&CONFIG.arp.entry[i].ip;
TRACE_CONFIG("IP addr: %u.%u.%u.%u, "
"dst_hwaddr: %02X:%02X:%02X:%02X:%02X:%02X\n",
da[0], da[1], da[2], da[3],
CONFIG.arp.entry[i].haddr[0],
CONFIG.arp.entry[i].haddr[1],
CONFIG.arp.entry[i].haddr[2],
CONFIG.arp.entry[i].haddr[3],
CONFIG.arp.entry[i].haddr[4],
CONFIG.arp.entry[i].haddr[5]);
}
if (CONFIG.arp.entries == 0)
TRACE_CONFIG("(blank)\n");
TRACE_CONFIG("----------------------------------------------------------"
"-----------------------\n");
}
/*----------------------------------------------------------------------------*/
/*
 * Debug helper: print the header fields, sender and target addresses of an
 * ARP message to stderr.
 */
void
DumpARPPacket(struct arphdr *arph)
{
	uint8_t *sip = (uint8_t *)&arph->ar_sip;
	uint8_t *tip = (uint8_t *)&arph->ar_tip;
	uint8_t *sha = arph->ar_sha;
	uint8_t *tha = arph->ar_tha;

	fprintf(stderr, "ARP header: \n");
	fprintf(stderr, "Hareware type: %d (len: %d), "
			"protocol type: %d (len: %d), opcode: %d\n",
			ntohs(arph->ar_hrd), arph->ar_hln,
			ntohs(arph->ar_pro), arph->ar_pln, ntohs(arph->ar_op));

	fprintf(stderr, "Sender IP: %u.%u.%u.%u, "
			"haddr: %02X:%02X:%02X:%02X:%02X:%02X\n",
			sip[0], sip[1], sip[2], sip[3],
			sha[0], sha[1], sha[2], sha[3], sha[4], sha[5]);

	fprintf(stderr, "Target IP: %u.%u.%u.%u, "
			"haddr: %02X:%02X:%02X:%02X:%02X:%02X\n",
			tip[0], tip[1], tip[2], tip[3],
			tha[0], tha[1], tha[2], tha[3], tha[4], tha[5]);
}
/*----------------------------------------------------------------------------*/

686
mtcp/src/config.c Normal file
View File

@ -0,0 +1,686 @@
#include <stdlib.h>
#include <assert.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <unistd.h>
#include <ctype.h>
#include "mtcp.h"
#include "config.h"
#include "tcp_in.h"
#include "arp.h"
#include "debug.h"
#define MAX_OPTLINE_LEN 1024
#define MAX_PROCLINE_LEN 1024
static const char *route_file = "config/route.conf";
static const char *arp_file = "config/arp.conf";
/*----------------------------------------------------------------------------*/
/*
 * Parse a decimal integer from the start of `value` (leading whitespace is
 * skipped by strtol; trailing characters such as a newline are ignored).
 *
 * errno is cleared before the conversion so that a stale error from an
 * earlier call cannot make a valid number look invalid.
 *
 * @return the parsed value, or -1 if there are no digits or the value does
 *         not fit in an int.
 */
static int
GetIntValue(char* value)
{
	char *end = NULL;
	long parsed;

	errno = 0;
	parsed = strtol(value, &end, 10);
	if (errno == EINVAL || errno == ERANGE)
		return -1;

	/* no digits were consumed */
	if (end == value)
		return -1;

	/* value does not survive the narrowing to int */
	if ((long)(int)parsed != parsed)
		return -1;

	return (int)parsed;
}
/*----------------------------------------------------------------------------*/
/*
 * Build a netmask from a prefix length. The mask is assembled byte by byte
 * in memory order (first byte = most significant octet of the mask), so the
 * returned value is laid out like a network-order address.
 *
 * Prefix lengths <= 0 give 0; lengths >= 32 give all ones.
 */
static inline uint32_t
MaskFromPrefix(int prefix)
{
	uint32_t mask = 0;
	uint8_t *octet = (uint8_t *)&mask;
	int idx;

	for (idx = 0; idx < 4; idx++) {
		/* number of leading one-bits that fall into this octet */
		int bits = prefix - idx * 8;

		if (bits <= 0)
			break;
		if (bits > 8)
			bits = 8;

		octet[idx] = (uint8_t)(0xFF << (8 - bits));
	}

	return mask;
}
/*----------------------------------------------------------------------------*/
static void
EnrollRouteTableEntry(char *optstr)
{
char *daddr_s;
char *prefix;
char *dev;
int ifidx;
int ridx;
int i;
daddr_s = strtok(optstr, "/");
prefix = strtok(NULL, " ");
dev = strtok(NULL, "\n");
assert(daddr_s != NULL);
assert(prefix != NULL);
assert(dev != NULL);
ifidx = -1;
for (i = 0; i < num_devices; i++) {
if (strcmp(dev, devices[i].name) != 0)
continue;
ifidx = devices[i].ifindex;
break;
}
if (ifidx == -1) {
TRACE_CONFIG("Interface %s does not exist!\n", dev);
exit(4);
}
ridx = CONFIG.routes++;
CONFIG.rtable[ridx].daddr = inet_addr(daddr_s);
CONFIG.rtable[ridx].prefix = atoi(prefix);
if (CONFIG.rtable[ridx].prefix > 32 || CONFIG.rtable[ridx].prefix < 0) {
TRACE_CONFIG("Prefix length should be between 0 - 32.\n");
exit(4);
}
CONFIG.rtable[ridx].mask = MaskFromPrefix(CONFIG.rtable[ridx].prefix);
CONFIG.rtable[ridx].masked =
CONFIG.rtable[ridx].daddr & CONFIG.rtable[ridx].mask;
CONFIG.rtable[ridx].nif = ifidx;
}
/*----------------------------------------------------------------------------*/
/*
 * Load additional static routes from `route_file`.
 *
 * The file is read line by line. Lines starting with '#' are skipped and a
 * '#' later in a line cuts the line there. A line starting with "ROUTES" is
 * followed by the number of entries; that many subsequent lines are handed
 * to EnrollRouteTableEntry() (lines starting with '#' inside the list do not
 * count).
 *
 * @return 0 after the file was processed, -1 if it could not be opened.
 */
int
SetRoutingTableFromFile()
{
#define ROUTES "ROUTES"
FILE *fc;
char optstr[MAX_OPTLINE_LEN];
int i;
TRACE_CONFIG("Loading routing configurations from : %s\n", route_file);
fc = fopen(route_file, "r");
if (fc == NULL) {
perror("fopen");
TRACE_CONFIG("Skip loading static routing table\n");
return -1;
}
while (1) {
char *iscomment;
int num;
if (fgets(optstr, MAX_OPTLINE_LEN, fc) == NULL)
break;
//skip comment
iscomment = strchr(optstr, '#');
if (iscomment == optstr)
continue;
if (iscomment != NULL)
*iscomment = 0;
if (!strncmp(optstr, ROUTES, sizeof(ROUTES) - 1)) {
/* sizeof(ROUTES) counts the terminating NUL, so this also skips the
one separator character that follows the keyword */
num = GetIntValue(optstr + sizeof(ROUTES));
/* a missing or non-positive count ends parsing of the whole file */
if (num <= 0)
break;
for (i = 0; i < num; i++) {
if (fgets(optstr, MAX_OPTLINE_LEN, fc) == NULL)
break;
if (*optstr == '#') {
/* comment lines do not consume one of the `num` entries */
i -= 1;
continue;
}
EnrollRouteTableEntry(optstr);
}
}
}
fclose(fc);
return 0;
}
/*----------------------------------------------------------------------------*/
void
PrintRoutingTable()
{
int i;
uint8_t *da;
uint8_t *m;
uint8_t *md;
/* print out process start information */
TRACE_CONFIG("Routes:\n");
for (i = 0; i < CONFIG.routes; i++) {
da = (uint8_t *)&CONFIG.rtable[i].daddr;
m = (uint8_t *)&CONFIG.rtable[i].mask;
md = (uint8_t *)&CONFIG.rtable[i].masked;
TRACE_CONFIG("Destination: %u.%u.%u.%u/%d, Mask: %u.%u.%u.%u, "
"Masked: %u.%u.%u.%u, Route: xge%d\n",
da[0], da[1], da[2], da[3], CONFIG.rtable[i].prefix,
m[0], m[1], m[2], m[3], md[0], md[1], md[2], md[3],
CONFIG.rtable[i].nif);
}
if (CONFIG.routes == 0)
TRACE_CONFIG("(blank)\n");
TRACE_CONFIG("----------------------------------------------------------"
"-----------------------\n");
}
/*----------------------------------------------------------------------------*/
/*
 * Parse a colon-separated MAC address ("aa:bb:cc:dd:ee:ff") into `haddr`
 * (ETH_ALEN bytes). `haddr_str` is modified by strtok().
 *
 * The process exits with status 4 when the address has too many or too few
 * octets, or when an octet is not a hexadecimal number in 0..FF (the result
 * of sscanf() used to be ignored, leaving the value undefined).
 */
static void
ParseMACAddress(unsigned char *haddr, char *haddr_str)
{
	int i;
	char *str;
	unsigned int temp;

	str = strtok(haddr_str, ":");
	i = 0;
	while (str != NULL) {
		if (i >= ETH_ALEN) {
			TRACE_CONFIG("MAC address length exceeds %d!\n", ETH_ALEN);
			exit(4);
		}
		if (sscanf(str, "%x", &temp) != 1 || temp > 0xFF) {
			TRACE_CONFIG("Invalid MAC address octet: %s\n", str);
			exit(4);
		}
		haddr[i++] = (unsigned char)temp;
		str = strtok(NULL, ":");
	}
	if (i < ETH_ALEN) {
		TRACE_CONFIG("MAC address length is less than %d!\n", ETH_ALEN);
		exit(4);
	}
}
/*----------------------------------------------------------------------------*/
/*
 * Convert a dotted-quad string to a network-order IPv4 address.
 *
 * @param ip_addr  receives the address, or 0 on failure
 * @param ip_str   string to parse; may be NULL
 * @return 0 on success, -1 if ip_str is NULL or inet_addr() rejects it
 *         (which includes the string "255.255.255.255").
 */
static int
ParseIPAddress(uint32_t *ip_addr, char *ip_str)
{
	uint32_t parsed;

	if (!ip_str) {
		*ip_addr = 0;
		return -1;
	}

	parsed = inet_addr(ip_str);
	if (parsed == INADDR_NONE) {
		TRACE_CONFIG("IP address is not valid %s\n", ip_str);
		*ip_addr = 0;
		return -1;
	}

	*ip_addr = parsed;
	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Build the routing table: one default route per configured interface
 * (its own subnet), followed by the static routes from the routing file.
 *
 * The output interface of each default route is taken from CONFIG.eths[i],
 * the same entry that provides the address and netmask. (devices[i] may
 * describe a different interface when SetInterfaceInfo() skipped a device.)
 *
 * The prefix length is derived from the position of the highest set bit of
 * the netmask value; a netmask of 0 yields prefix 0.
 * NOTE(review): the netmask is stored in network byte order, so this
 * presumably equals the prefix length only on little-endian hosts with a
 * contiguous mask -- confirm if other platforms matter.
 *
 * Exits the process if the table cannot be allocated.
 *
 * @return always 0.
 */
int
SetRoutingTable()
{
	int i, ridx;
	unsigned int c;

	CONFIG.routes = 0;

	CONFIG.rtable = (struct route_table *)
			calloc(MAX_DEVICES, sizeof(struct route_table));
	if (!CONFIG.rtable)
		exit(EXIT_FAILURE);

	/* set default routing table */
	for (i = 0; i < CONFIG.eths_num; i ++) {

		ridx = CONFIG.routes++;
		CONFIG.rtable[ridx].daddr = CONFIG.eths[i].ip_addr & CONFIG.eths[i].netmask;

		CONFIG.rtable[ridx].prefix = 0;
		for (c = CONFIG.eths[i].netmask; c != 0; c >>= 1)
			CONFIG.rtable[ridx].prefix++;

		CONFIG.rtable[ridx].mask = CONFIG.eths[i].netmask;
		CONFIG.rtable[ridx].masked = CONFIG.rtable[ridx].daddr;
		CONFIG.rtable[ridx].nif = CONFIG.eths[i].ifindex;
	}

	/* set additional routing table */
	SetRoutingTableFromFile();

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Count the NIC queues by counting the lines of /proc/interrupts that
 * contain the keyword "xge0-rx".
 *
 * @return the number of matching lines, or -1 if /proc/interrupts cannot be
 *         opened.
 */
int
GetNumQueues()
{
	char line[MAX_PROCLINE_LEN];
	int matches = 0;
	FILE *fp = fopen("/proc/interrupts", "r");

	if (fp == NULL) {
		TRACE_CONFIG("Failed to read data from /proc/interrupts!\n");
		return -1;
	}

	/* count number of NIC queues from /proc/interrupts */
	while (fgets(line, MAX_PROCLINE_LEN, fp) != NULL) {
		/* "xge0-rx" is the keyword for counting queues */
		if (strstr(line, "xge0-rx") != NULL)
			matches += 1;
	}

	fclose(fp);

	return matches;
}
/*----------------------------------------------------------------------------*/
int
SetInterfaceInfo()
{
struct ifreq ifr;
int eidx = 0;
int i, j;
TRACE_CONFIG("Loading interface setting\n");
CONFIG.eths = (struct eth_table *)
calloc(MAX_DEVICES, sizeof(struct eth_table));
if (!CONFIG.eths)
exit(EXIT_FAILURE);
// Create socket
int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
if (sock == -1) {
perror("socket");
}
for (i = 0; i < num_devices; i++) {
strcpy(ifr.ifr_name, devices[i].name);
//getting interface information
if (ioctl(sock, SIOCGIFFLAGS, &ifr) == 0) {
// Setting informations
eidx = CONFIG.eths_num++;
strcpy(CONFIG.eths[eidx].dev_name, ifr.ifr_name);
CONFIG.eths[eidx].ifindex = devices[i].ifindex;
//geting address
if (ioctl(sock, SIOCGIFADDR, &ifr) == 0 ) {
struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr;
CONFIG.eths[eidx].ip_addr = *(uint32_t *)&sin;
}
if (ioctl(sock, SIOCGIFHWADDR, &ifr) == 0 ) {
for (j = 0; j < 6; j ++) {
CONFIG.eths[eidx].haddr[j] = ifr.ifr_addr.sa_data[j];
}
}
/* Net MASK */
if (ioctl(sock, SIOCGIFNETMASK, &ifr) == 0) {
struct in_addr sin = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr;
CONFIG.eths[eidx].netmask = *(uint32_t *)&sin;
}
// add to attached devices
for (j = 0; j < num_devices_attached; j++) {
if (devices_attached[j] == devices[i].ifindex) {
break;
}
}
devices_attached[num_devices_attached] = devices[i].ifindex;
num_devices_attached++;
} else {
perror("SIOCGIFFLAGS");
}
}
num_queues = GetNumQueues();
if (num_queues <= 0) {
TRACE_CONFIG("Failed to find NIC queues!\n");
return -1;
}
if (num_queues > num_cpus) {
TRACE_CONFIG("Too many NIC queues available.\n");
return -1;
}
return 0;
}
/*----------------------------------------------------------------------------*/
void
PrintInterfaceInfo()
{
int i;
/* print out process start information */
TRACE_CONFIG("Interfaces:\n");
for (i = 0; i < CONFIG.eths_num; i++) {
uint8_t *da = (uint8_t *)&CONFIG.eths[i].ip_addr;
uint8_t *nm = (uint8_t *)&CONFIG.eths[i].netmask;
TRACE_CONFIG("name: %s, ifindex: %d, "
"hwaddr: %02X:%02X:%02X:%02X:%02X:%02X, "
"ipaddr: %u.%u.%u.%u, "
"netmask: %u.%u.%u.%u\n",
CONFIG.eths[i].dev_name,
CONFIG.eths[i].ifindex,
CONFIG.eths[i].haddr[0],
CONFIG.eths[i].haddr[1],
CONFIG.eths[i].haddr[2],
CONFIG.eths[i].haddr[3],
CONFIG.eths[i].haddr[4],
CONFIG.eths[i].haddr[5],
da[0], da[1], da[2], da[3],
nm[0], nm[1], nm[2], nm[3]);
}
TRACE_CONFIG("Number of NIC queues: %d\n", num_queues);
TRACE_CONFIG("----------------------------------------------------------"
"-----------------------\n");
}
/*----------------------------------------------------------------------------*/
/*
 * Parse one static ARP line of the form "<ip>/<prefix> <mac>" and append it
 * to CONFIG.arp.
 *
 * optstr is tokenized in place by strtok() and is therefore modified.
 *
 * Lines with a missing field, a prefix outside 0..32 or an unparsable IP
 * address are reported and skipped without consuming a table slot. A
 * malformed MAC address terminates the process inside ParseMACAddress().
 *
 * NOTE(review): CONFIG.arp.entries is not checked against the capacity of
 * CONFIG.arp.entry; that capacity is not visible from this function.
 */
static void
EnrollARPTableEntry(char *optstr)
{
	char *dip_s;		/* destination IP string */
	char *prefix_s;		/* IP prefix string */
	char *daddr_s;		/* destination MAC string */
	int prefix;
	uint32_t dip;
	uint32_t dip_mask;
	int idx;

	dip_s = strtok(optstr, "/");
	prefix_s = strtok(NULL, " ");
	daddr_s = strtok(NULL, "\n");

	/* Checked explicitly rather than with assert(): assertions vanish
	 * under NDEBUG and the strings below would then be NULL. */
	if (dip_s == NULL || prefix_s == NULL || daddr_s == NULL) {
		TRACE_CONFIG("Malformed ARP entry, "
				"expected <ip>/<prefix> <mac>.\n");
		return;
	}

	prefix = atoi(prefix_s);
	if (prefix > 32 || prefix < 0) {
		TRACE_CONFIG("Prefix length should be between 0 - 32.\n");
		return;
	}

	/* ParseIPAddress() already logs the failure */
	if (ParseIPAddress(&dip, dip_s) < 0)
		return;

	/* claim the slot only once the IP and prefix are known to be valid */
	idx = CONFIG.arp.entries++;

	CONFIG.arp.entry[idx].prefix = prefix;
	CONFIG.arp.entry[idx].ip = dip;
	ParseMACAddress(CONFIG.arp.entry[idx].haddr, daddr_s);

	dip_mask = MaskFromPrefix(prefix);
	CONFIG.arp.entry[idx].ip_mask = dip_mask;
	CONFIG.arp.entry[idx].ip_masked = CONFIG.arp.entry[idx].ip & dip_mask;
}
/*----------------------------------------------------------------------------*/
/*
 * Load the static ARP table from arp_file.
 *
 * '#' starts a comment; leading and trailing whitespace is stripped and
 * blank lines are ignored. The first "ARP_ENTRY" line gives the number of
 * entries that follow; every other line is handed to EnrollARPTableEntry().
 *
 * Returns 0 on success and -1 if the file cannot be opened. A non-positive
 * or missing entry count, or more entry lines than announced, terminates
 * the process with exit(-1).
 */
int
LoadARPTable()
{
#define ARP_ENTRY "ARP_ENTRY"

	FILE *fc;
	char optstr[MAX_OPTLINE_LEN];
	int numEntry = 0;
	int hasNumEntry = 0;

	TRACE_CONFIG("Loading ARP table from : %s\n", arp_file);

	InitARPTable();

	fc = fopen(arp_file, "r");
	if (fc == NULL) {
		perror("fopen");
		TRACE_CONFIG("Skip loading static ARP table\n");
		return -1;
	}

	while (fgets(optstr, MAX_OPTLINE_LEN, fc) != NULL) {
		char *p = optstr;
		char *temp;

		/* skip comment */
		if ((temp = strchr(p, '#')) != NULL)
			*temp = 0;

		/* remove leading and trailing spaces; the end pointer has to
		 * walk backwards or only one character gets stripped */
		while (*p && isspace((unsigned char)*p))
			p++;
		temp = p + strlen(p);
		while (temp > p && isspace((unsigned char)temp[-1]))
			*--temp = 0;
		if (*p == 0) /* nothing more to process? */
			continue;

		if (!hasNumEntry &&
				strncmp(p, ARP_ENTRY, sizeof(ARP_ENTRY) - 1) == 0) {
			/* The value starts one separator character after the
			 * keyword; a bare "ARP_ENTRY" has no value, and reading
			 * at p + sizeof(ARP_ENTRY) would pass the terminator. */
			if (p[sizeof(ARP_ENTRY) - 1] != '\0')
				numEntry = GetIntValue(p + sizeof(ARP_ENTRY));
			else
				numEntry = 0;
			if (numEntry <= 0) {
				fprintf(stderr, "Wrong entry in arp.conf: %s\n", p);
				exit(-1);
			}
			hasNumEntry = 1;
		} else {
			if (numEntry <= 0) {
				fprintf(stderr,
						"Error in arp.conf: more entries than "
						"are specifed, entry=%s\n", p);
				exit(-1);
			}
			EnrollARPTableEntry(p);
			numEntry--;
		}
	}

	fclose(fc);
	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Parse one "name = value" configuration line and store the value in
 * CONFIG.
 *
 * line is copied into a local buffer before tokenizing, so the caller's
 * string is left intact. Name and value are separated by spaces, tabs
 * and/or '='.
 *
 * Returns 0 when the option was recognized and accepted, -1 on a missing
 * name/value, an out-of-range value, or an unknown option name.
 */
static int
ParseConfiguration(char *line)
{
	char optstr[MAX_OPTLINE_LEN];
	char *p, *q;

	/* strncpy() does not terminate the destination when the source fills
	 * it, and optstr is otherwise uninitialized. */
	strncpy(optstr, line, MAX_OPTLINE_LEN - 1);
	optstr[MAX_OPTLINE_LEN - 1] = '\0';

	p = strtok(optstr, " \t=");
	if (p == NULL) {
		TRACE_CONFIG("No option name found for the line: %s\n", line);
		return -1;
	}

	q = strtok(NULL, " \t=");
	if (q == NULL) {
		TRACE_CONFIG("No option value found for the line: %s\n", line);
		return -1;
	}

	if (strcmp(p, "num_cores") == 0) {
		CONFIG.num_cores = atoi(q);
		if (CONFIG.num_cores <= 0) {
			TRACE_CONFIG("Number of cores should be larger than 0.\n");
			return -1;
		}
		if (CONFIG.num_cores > num_cpus) {
			TRACE_CONFIG("Number of cores should be smaller than "
					"# physical CPU cores.\n");
			return -1;
		}
	} else if (strcmp(p, "max_concurrency") == 0) {
		/* NOTE(review): 0 is accepted although the message says
		 * "larger than 0" -- confirm which one is intended. */
		CONFIG.max_concurrency = atoi(q);
		if (CONFIG.max_concurrency < 0) {
			TRACE_CONFIG("The maximum concurrency should be larger than 0.\n");
			return -1;
		}
	} else if (strcmp(p, "max_num_buffers") == 0) {
		CONFIG.max_num_buffers = atoi(q);
		if (CONFIG.max_num_buffers < 0) {
			TRACE_CONFIG("The maximum # buffers should be larger than 0.\n");
			return -1;
		}
	} else if (strcmp(p, "rcvbuf") == 0) {
		CONFIG.rcvbuf_size = atoi(q);
		if (CONFIG.rcvbuf_size < 64) {
			TRACE_CONFIG("Receive buffer size should be larger than 64.\n");
			return -1;
		}
	} else if (strcmp(p, "sndbuf") == 0) {
		CONFIG.sndbuf_size = atoi(q);
		if (CONFIG.sndbuf_size < 64) {
			TRACE_CONFIG("Send buffer size should be larger than 64.\n");
			return -1;
		}
	} else if (strcmp(p, "tcp_timeout") == 0) {
		/* given in seconds, stored in TIME_TICK units; values <= 0 are
		 * stored unconverted */
		CONFIG.tcp_timeout = atoi(q);
		if (CONFIG.tcp_timeout > 0) {
			CONFIG.tcp_timeout = SEC_TO_USEC(CONFIG.tcp_timeout) / TIME_TICK;
		}
	} else if (strcmp(p, "tcp_timewait") == 0) {
		CONFIG.tcp_timewait = atoi(q);
		if (CONFIG.tcp_timewait > 0) {
			CONFIG.tcp_timewait = SEC_TO_USEC(CONFIG.tcp_timewait) / TIME_TICK;
		}
	} else if (strcmp(p, "stat_print") == 0) {
		int i;

		/* enable statistics for every interface with this name;
		 * an unknown name is silently ignored */
		for (i = 0; i < CONFIG.eths_num; i++) {
			if (strcmp(CONFIG.eths[i].dev_name, q) == 0) {
				CONFIG.eths[i].stat_print = TRUE;
			}
		}
	} else {
		TRACE_CONFIG("Unknown option type: %s\n", line);
		return -1;
	}

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Load the mTCP configuration from fname.
 *
 * Defaults are installed first; each non-blank, non-comment line of the
 * file is then passed to ParseConfiguration(). '#' starts a comment, and
 * leading and trailing whitespace is stripped.
 *
 * Returns 0 on success, -1 if the file cannot be opened or a line is
 * rejected by ParseConfiguration(). The file is closed on every path.
 */
int
LoadConfiguration(char *fname)
{
	FILE *fp;
	char optstr[MAX_OPTLINE_LEN];
	int ret = 0;

	TRACE_CONFIG("----------------------------------------------------------"
			"-----------------------\n");
	TRACE_CONFIG("Loading mtcp configuration from : %s\n", fname);

	fp = fopen(fname, "r");
	if (fp == NULL) {
		perror("fopen");
		TRACE_CONFIG("Failed to load configuration file: %s\n", fname);
		return -1;
	}

	/* set default configuration */
	CONFIG.num_cores = num_cpus;
	CONFIG.max_concurrency = 100000;
	CONFIG.max_num_buffers = 100000;
	CONFIG.rcvbuf_size = 8192;
	CONFIG.sndbuf_size = 8192;
	CONFIG.tcp_timeout = TCP_TIMEOUT;
	CONFIG.tcp_timewait = TCP_TIMEWAIT;

	while (fgets(optstr, MAX_OPTLINE_LEN, fp) != NULL) {
		char *p = optstr;
		char *temp;

		/* skip comment */
		if ((temp = strchr(p, '#')) != NULL)
			*temp = 0;

		/* remove leading and trailing spaces; the end pointer has to
		 * walk backwards or only one character gets stripped */
		while (*p && isspace((unsigned char)*p))
			p++;
		temp = p + strlen(p);
		while (temp > p && isspace((unsigned char)temp[-1]))
			*--temp = 0;
		if (*p == 0) /* nothing more to process? */
			continue;

		if (ParseConfiguration(p) < 0) {
			ret = -1;
			break;
		}
	}

	fclose(fp);
	return ret;
}
/*----------------------------------------------------------------------------*/
/*
 * Log the effective configuration held in CONFIG: core counts, per-core
 * limits, buffer sizes, TCP timeout/timewait (converted back to seconds)
 * and the interfaces that have stat_print set.
 */
void
PrintConfiguration()
{
	int n;

	TRACE_CONFIG("Configurations:\n");
	TRACE_CONFIG("Number of CPU cores available: %d\n", num_cpus);
	TRACE_CONFIG("Number of CPU cores to use: %d\n", CONFIG.num_cores);
	TRACE_CONFIG("Maximum number of concurrency per core: %d\n",
			CONFIG.max_concurrency);
	TRACE_CONFIG("Maximum number of preallocated buffers per core: %d\n",
			CONFIG.max_num_buffers);
	TRACE_CONFIG("Receive buffer size: %d\n", CONFIG.rcvbuf_size);
	TRACE_CONFIG("Send buffer size: %d\n", CONFIG.sndbuf_size);

	/* a non-positive timeout means the timeout check is off */
	if (CONFIG.tcp_timeout <= 0) {
		TRACE_CONFIG("TCP timeout check disabled.\n");
	} else {
		TRACE_CONFIG("TCP timeout seconds: %d\n",
				USEC_TO_SEC(CONFIG.tcp_timeout * TIME_TICK));
	}
	TRACE_CONFIG("TCP timewait seconds: %d\n",
			USEC_TO_SEC(CONFIG.tcp_timewait * TIME_TICK));

	TRACE_CONFIG("NICs to print statistics:");
	for (n = 0; n < CONFIG.eths_num; n++) {
		if (!CONFIG.eths[n].stat_print)
			continue;
		TRACE_CONFIG(" %s", CONFIG.eths[n].dev_name);
	}
	TRACE_CONFIG("\n");
	TRACE_CONFIG("----------------------------------------------------------"
			"-----------------------\n");
}
/*----------------------------------------------------------------------------*/

1730
mtcp/src/core.c Normal file

File diff suppressed because it is too large Load Diff

76
mtcp/src/cpu.c Normal file
View File

@ -0,0 +1,76 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <numa.h>
#include <sched.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <assert.h>
#define MAX_FILE_NAME 1024
/*----------------------------------------------------------------------------*/
/*
 * Return the value of sysconf(_SC_NPROCESSORS_ONLN), converted to int.
 * A sysconf() failure value is passed through unchanged.
 */
int
GetNumCPUs()
{
	long online = sysconf(_SC_NPROCESSORS_ONLN);

	return online;
}
/*----------------------------------------------------------------------------*/
/*
 * Return the result of the raw gettid system call for the calling thread.
 */
pid_t
Gettid()
{
	long tid = syscall(__NR_gettid);

	return tid;
}
/*----------------------------------------------------------------------------*/
/*
 * Pin the calling thread to the given CPU and, when more than one NUMA node
 * exists, bind its memory allocations to the node whose number is read from
 * the CPU's physical_package_id in sysfs.
 *
 * cpu - zero-based CPU index; must be below GetNumCPUs().
 *
 * Returns the result of sched_setaffinity() on success. Returns -1 with
 * errno = EINVAL for an out-of-range cpu, errno = EFAULT when the sysfs
 * topology file cannot be opened or parsed, and errno = ENOMEM when the
 * node bitmask cannot be allocated.
 */
int
mtcp_core_affinitize(int cpu)
{
	cpu_set_t cpus;
	struct bitmask *bmask;
	FILE *fp;
	char sysfname[MAX_FILE_NAME];
	int phy_id;
	int matched;
	size_t n;
	int ret;

	n = GetNumCPUs();

	if (cpu < 0 || cpu >= (int) n) {
		/* errno values are positive */
		errno = EINVAL;
		return -1;
	}

	CPU_ZERO(&cpus);
	CPU_SET((unsigned)cpu, &cpus);

	ret = sched_setaffinity(Gettid(), sizeof(cpus), &cpus);

	if (numa_max_node() == 0)
		return ret;

	/* read physical id of the core from sys information; done before the
	 * bitmask is allocated so that failures here have nothing to free */
	snprintf(sysfname, MAX_FILE_NAME - 1,
			"/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);
	fp = fopen(sysfname, "r");
	if (!fp) {
		perror(sysfname);
		errno = EFAULT;
		return -1;
	}
	matched = fscanf(fp, "%d", &phy_id);
	fclose(fp);
	if (matched != 1) {
		/* phy_id would be uninitialized */
		errno = EFAULT;
		return -1;
	}

	bmask = numa_bitmask_alloc(n);
	if (!bmask) {
		errno = ENOMEM;
		return -1;
	}

	numa_bitmask_setbit(bmask, phy_id);
	numa_set_membind(bmask);
	numa_bitmask_free(bmask);

	return ret;
}

269
mtcp/src/debug.c Normal file
View File

@ -0,0 +1,269 @@
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdarg.h>
#include "debug.h"
#include "tcp_in.h"
#include "logger.h"
/*----------------------------------------------------------------------------*/
/*
 * Hand the manager's current write buffer (if any) to the logger via
 * EnqueueJobBuffer() and write a single byte to mtcp->sp_fd
 * (presumably to wake the logger thread -- confirm against the reader of
 * that descriptor). A failed write is reported but not propagated.
 */
void flush_log_data(mtcp_manager_t mtcp)
{
	int written;

	if (!mtcp->w_buffer)
		return;

	EnqueueJobBuffer(mtcp->logger, mtcp->w_buffer);
	written = write(mtcp->sp_fd, "A", 1);
	if (written == 1)
		return;

	TRACE_INFO("Failed to flush logs in the buffer.\n");
	perror("write() for pipe");
}
/*----------------------------------------------------------------------------*/
/*
 * printf-style logging into the manager's current log buffer.
 *
 * mtcp    - manager owning the write buffer and logger.
 * f_idx   - destination stream recorded in a freshly dequeued buffer;
 *           must not be NULL.
 * _Format - printf format string, followed by its arguments.
 *
 * At most PRINT_LIMIT - 1 characters are stored per call; longer output is
 * truncated. When the current buffer cannot hold another PRINT_LIMIT bytes
 * it is flushed and a free buffer is dequeued. The whole operation runs
 * under mtcp->logger->mutex.
 */
void
thread_printf(mtcp_manager_t mtcp, FILE* f_idx, const char* _Format, ...)
{
	va_list argptr;
#define PRINT_LIMIT 4096
	int len;
	log_buff *wbuf;

	assert(f_idx != NULL);

	va_start(argptr, _Format);

	pthread_mutex_lock(&mtcp->logger->mutex);
	wbuf = mtcp->w_buffer;
	if (wbuf && (wbuf->buff_len + PRINT_LIMIT > LOG_BUFF_SIZE)) {
		flush_log_data(mtcp);
		wbuf = NULL;
	}

	if (!wbuf) {
		wbuf = DequeueFreeBuffer(mtcp->logger);
		assert(wbuf);
		wbuf->buff_len = 0;
		wbuf->tid = mtcp->ctx->cpu;
		wbuf->fid = f_idx;
		mtcp->w_buffer = wbuf;
	}

	len = vsnprintf(wbuf->buff + wbuf->buff_len, PRINT_LIMIT, _Format, argptr);
	/* vsnprintf() returns the length the full output would have had, or a
	 * negative value on error; only what was really stored may be added,
	 * otherwise buff_len drifts past the written data. */
	if (len >= PRINT_LIMIT)
		len = PRINT_LIMIT - 1;
	if (len > 0)
		wbuf->buff_len += len;
	pthread_mutex_unlock(&mtcp->logger->mutex);

	va_end(argptr);
}
/*----------------------------------------------------------------------------*/
/*
 * Log a one-line summary of an Ethernet frame through thread_printf().
 *
 * mtcp    - manager whose log_fp and cur_ts are used for the log line.
 * buf     - start of the Ethernet header.
 * len     - value printed in the trailing "len=" field.
 * step    - caller-supplied tag printed first.
 * ifindex - interface index; a negative value is printed as "?".
 *
 * Non-IPv4 frames get only the MAC addresses and the EtherType. IPv4
 * frames get addresses, ports (TCP/UDP), IP id, TTL, a checksum diagnosis
 * and, for TCP, flags, sequence numbers and window. The headers are read
 * without checking them against len.
 */
void
DumpPacket(mtcp_manager_t mtcp, char *buf, int len, char *step, int ifindex)
{
	struct ethhdr *ethh = (struct ethhdr *)buf;
	struct iphdr *iph;
	struct udphdr *udph;
	struct tcphdr *tcph;
	void *l4;
	uint8_t *t;
	int has_ports;

	if (ifindex < 0)
		thread_printf(mtcp, mtcp->log_fp, "%s ? %u", step, mtcp->cur_ts);
	else
		thread_printf(mtcp, mtcp->log_fp, "%s %d %u", step, ifindex, mtcp->cur_ts);

	if (ntohs(ethh->h_proto) != ETH_P_IP) {
		thread_printf(mtcp, mtcp->log_fp, "%02X:%02X:%02X:%02X:%02X:%02X -> %02X:%02X:%02X:%02X:%02X:%02X ",
				ethh->h_source[0], ethh->h_source[1], ethh->h_source[2],
				ethh->h_source[3], ethh->h_source[4], ethh->h_source[5],
				ethh->h_dest[0], ethh->h_dest[1], ethh->h_dest[2],
				ethh->h_dest[3], ethh->h_dest[4], ethh->h_dest[5]);
		thread_printf(mtcp, mtcp->log_fp, "protocol %04hx ", ntohs(ethh->h_proto));
		goto done;
	}

	thread_printf(mtcp, mtcp->log_fp, " ");

	/* the transport header starts ihl 32-bit words after the IP header */
	iph = (struct iphdr *)(ethh + 1);
	l4 = (uint32_t *)iph + iph->ihl;
	udph = (struct udphdr *)l4;
	tcph = (struct tcphdr *)l4;
	has_ports = (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP);

	t = (uint8_t *)&iph->saddr;
	thread_printf(mtcp, mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]);
	if (has_ports)
		thread_printf(mtcp, mtcp->log_fp, "(%d)", ntohs(udph->source));

	thread_printf(mtcp, mtcp->log_fp, " -> ");

	t = (uint8_t *)&iph->daddr;
	thread_printf(mtcp, mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]);
	if (has_ports)
		thread_printf(mtcp, mtcp->log_fp, "(%d)", ntohs(udph->dest));

	thread_printf(mtcp, mtcp->log_fp, " IP_ID=%d", ntohs(iph->id));
	thread_printf(mtcp, mtcp->log_fp, " TTL=%d ", iph->ttl);

	if (ip_fast_csum(iph, iph->ihl)) {
		/* recompute with the checksum field zeroed, then restore it */
		__sum16 org_csum = iph->check;
		__sum16 correct_csum;

		iph->check = 0;
		correct_csum = ip_fast_csum(iph, iph->ihl);
		thread_printf(mtcp, mtcp->log_fp, "(bad checksum %04x should be %04x) ",
				ntohs(org_csum), ntohs(correct_csum));
		iph->check = org_csum;
	}

	if (iph->protocol == IPPROTO_TCP) {
		thread_printf(mtcp, mtcp->log_fp, "TCP ");

		if (tcph->syn)
			thread_printf(mtcp, mtcp->log_fp, "S ");
		if (tcph->fin)
			thread_printf(mtcp, mtcp->log_fp, "F ");
		if (tcph->ack)
			thread_printf(mtcp, mtcp->log_fp, "A ");
		if (tcph->rst)
			thread_printf(mtcp, mtcp->log_fp, "R ");

		thread_printf(mtcp, mtcp->log_fp, "seq %u ", ntohl(tcph->seq));
		if (tcph->ack)
			thread_printf(mtcp, mtcp->log_fp, "ack %u ", ntohl(tcph->ack_seq));
		thread_printf(mtcp, mtcp->log_fp, "WDW=%u ", ntohs(tcph->window));
	} else if (iph->protocol == IPPROTO_UDP) {
		thread_printf(mtcp, mtcp->log_fp, "UDP ");
	} else {
		thread_printf(mtcp, mtcp->log_fp, "protocol %d ", iph->protocol);
	}

done:
	thread_printf(mtcp, mtcp->log_fp, "len=%d\n", len);
}
/*----------------------------------------------------------------------------*/
/*
 * Log a one-line summary of an IPv4 packet through thread_printf():
 * addresses, ports (TCP/UDP), IP id, TTL, a "(bad checksum)" marker when
 * ip_fast_csum() is non-zero, TCP flags/sequence numbers/window, and the
 * caller-supplied len. The headers are read without checking them
 * against len.
 */
void
DumpIPPacket(mtcp_manager_t mtcp, const struct iphdr *iph, int len)
{
	struct udphdr *udph;
	struct tcphdr *tcph;
	void *l4;
	uint8_t *t;
	int has_ports;

	/* the transport header starts ihl 32-bit words after the IP header */
	l4 = (uint32_t *)iph + iph->ihl;
	udph = (struct udphdr *)l4;
	tcph = (struct tcphdr *)l4;
	has_ports = (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP);

	t = (uint8_t *)&iph->saddr;
	thread_printf(mtcp, mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]);
	if (has_ports)
		thread_printf(mtcp, mtcp->log_fp, "(%d)", ntohs(udph->source));

	thread_printf(mtcp, mtcp->log_fp, " -> ");

	t = (uint8_t *)&iph->daddr;
	thread_printf(mtcp, mtcp->log_fp, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]);
	if (has_ports)
		thread_printf(mtcp, mtcp->log_fp, "(%d)", ntohs(udph->dest));

	thread_printf(mtcp, mtcp->log_fp, " IP_ID=%d", ntohs(iph->id));
	thread_printf(mtcp, mtcp->log_fp, " TTL=%d ", iph->ttl);

	if (ip_fast_csum(iph, iph->ihl)) {
		thread_printf(mtcp, mtcp->log_fp, "(bad checksum) ");
	}

	if (iph->protocol == IPPROTO_TCP) {
		thread_printf(mtcp, mtcp->log_fp, "TCP ");

		if (tcph->syn)
			thread_printf(mtcp, mtcp->log_fp, "S ");
		if (tcph->fin)
			thread_printf(mtcp, mtcp->log_fp, "F ");
		if (tcph->ack)
			thread_printf(mtcp, mtcp->log_fp, "A ");
		if (tcph->rst)
			thread_printf(mtcp, mtcp->log_fp, "R ");

		thread_printf(mtcp, mtcp->log_fp, "seq %u ", ntohl(tcph->seq));
		if (tcph->ack)
			thread_printf(mtcp, mtcp->log_fp, "ack %u ", ntohl(tcph->ack_seq));
		thread_printf(mtcp, mtcp->log_fp, "WDW=%u ", ntohs(tcph->window));
	} else if (iph->protocol == IPPROTO_UDP) {
		thread_printf(mtcp, mtcp->log_fp, "UDP ");
	} else {
		thread_printf(mtcp, mtcp->log_fp, "protocol %d ", iph->protocol);
	}

	thread_printf(mtcp, mtcp->log_fp, "len=%d\n", len);
}
/*----------------------------------------------------------------------------*/
/*
 * Write a one-line summary of an IPv4 packet to fout: addresses, ports
 * (TCP/UDP), IP id, TTL, a "(bad checksum)" marker when ip_fast_csum() is
 * non-zero, TCP flags/sequence numbers/window, and the caller-supplied
 * len. The headers are read without checking them against len.
 */
void
DumpIPPacketToFile(FILE *fout, const struct iphdr *iph, int len)
{
	struct udphdr *udph;
	struct tcphdr *tcph;
	void *l4;
	uint8_t *t;
	int has_ports;

	/* the transport header starts ihl 32-bit words after the IP header */
	l4 = (uint32_t *)iph + iph->ihl;
	udph = (struct udphdr *)l4;
	tcph = (struct tcphdr *)l4;
	has_ports = (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP);

	t = (uint8_t *)&iph->saddr;
	fprintf(fout, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]);
	if (has_ports)
		fprintf(fout, "(%d)", ntohs(udph->source));

	fprintf(fout, " -> ");

	t = (uint8_t *)&iph->daddr;
	fprintf(fout, "%u.%u.%u.%u", t[0], t[1], t[2], t[3]);
	if (has_ports)
		fprintf(fout, "(%d)", ntohs(udph->dest));

	fprintf(fout, " IP_ID=%d", ntohs(iph->id));
	fprintf(fout, " TTL=%d ", iph->ttl);

	if (ip_fast_csum(iph, iph->ihl)) {
		fprintf(fout, "(bad checksum) ");
	}

	if (iph->protocol == IPPROTO_TCP) {
		fprintf(fout, "TCP ");

		if (tcph->syn)
			fprintf(fout, "S ");
		if (tcph->fin)
			fprintf(fout, "F ");
		if (tcph->ack)
			fprintf(fout, "A ");
		if (tcph->rst)
			fprintf(fout, "R ");

		fprintf(fout, "seq %u ", ntohl(tcph->seq));
		if (tcph->ack)
			fprintf(fout, "ack %u ", ntohl(tcph->ack_seq));
		fprintf(fout, "WDW=%u ", ntohs(tcph->window));
	} else if (iph->protocol == IPPROTO_UDP) {
		fprintf(fout, "UDP ");
	} else {
		fprintf(fout, "protocol %d ", iph->protocol);
	}

	fprintf(fout, "len=%d\n", len);
}

60
mtcp/src/eth_in.c Normal file
View File

@ -0,0 +1,60 @@
#include "ps.h"
#include "ip_in.h"
#include "eth_in.h"
#include "arp.h"
#include "debug.h"
/*----------------------------------------------------------------------------*/
/*
 * Dispatch one received Ethernet frame by EtherType.
 *
 * IPv4 frames go to ProcessIPv4Packet() and its result is returned (a
 * negative result is counted as an rx error when NETSTAT is defined).
 * ARP frames go to ProcessARPPacket(); every other frame is passed to
 * ps_slowpath_packet(). Both of those paths return TRUE.
 */
int
ProcessPacket(mtcp_manager_t mtcp, const int ifidx,
		uint32_t cur_ts, unsigned char *pkt_data, int len)
{
	struct ethhdr *ethh = (struct ethhdr *)pkt_data;
	u_short ip_proto = ntohs(ethh->h_proto);
	int ret;

#ifdef PKTDUMP
	DumpPacket(mtcp, (char *)pkt_data, len, "IN", ifidx);
#endif

#ifdef NETSTAT
	mtcp->nstat.rx_packets[ifidx]++;
	/* NOTE(review): 24 is presumably per-frame wire overhead -- confirm */
	mtcp->nstat.rx_bytes[ifidx] += len + 24;
#endif /* NETSTAT */

#if 0
	/* ignore mac address which is not for current interface */
	int i;
	for (i = 0; i < 6; i ++) {
		if (ethh->h_dest[i] != CONFIG.eths[ifidx].haddr[i]) {
			return FALSE;
		}
	}
#endif

	if (ip_proto == ETH_P_ARP) {
		ProcessARPPacket(mtcp, cur_ts, ifidx, pkt_data, len);
		return TRUE;
	}

	if (ip_proto != ETH_P_IP) {
		/* neither IPv4 nor ARP: hand the frame to the slow path */
		struct ps_packet packet;

		packet.ifindex = ifidx;
		packet.len = len;
		packet.buf = (char *)pkt_data;
		ps_slowpath_packet(mtcp->ctx->handle, &packet);
		return TRUE;
	}

	/* process ipv4 packet */
	ret = ProcessIPv4Packet(mtcp, cur_ts, ifidx, pkt_data, len);

#ifdef NETSTAT
	if (ret < 0) {
		mtcp->nstat.rx_errors[ifidx]++;
	}
#endif

	return ret;
}

263
mtcp/src/eth_out.c Normal file
View File

@ -0,0 +1,263 @@
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <linux/if_ether.h>
#include <linux/tcp.h>
#include <netinet/ip.h>
#include "mtcp.h"
#include "arp.h"
#include "eth_out.h"
#include "debug.h"
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif
#ifndef ERROR
#define ERROR (-1)
#endif
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
#define MAX_WINDOW_SIZE 65535
/*----------------------------------------------------------------------------*/
/* Policy passed as `method` to GetWriteBuffer() for the case where the
 * per-interface send chunk is full: BUF_RET_MAYBE makes it return NULL
 * right away, BUF_RET_ALWAYS makes it try FlushWriteBuffer() first. */
enum ETH_BUFFER_RETURN {BUF_RET_MAYBE, BUF_RET_ALWAYS};
/*----------------------------------------------------------------------------*/
#if !(E_PSIO || USE_CHUNK_BUF)
/*
 * Allocate and reset one send chunk per interface (ETH_NUM of them) in
 * w_chunk. Each chunk is tagged with its interface index and starts empty
 * and non-blocking. Exits the process if ps_alloc_chunk() fails.
 */
inline void
InitWriteChunks(struct ps_handle* handle, struct ps_chunk *w_chunk)
{
	int nif;

	for (nif = 0; nif < ETH_NUM; nif++) {
		struct ps_chunk *chunk = &w_chunk[nif];

		if (ps_alloc_chunk(handle, chunk) != 0) {
			perror("ps_alloc_chunk");
			exit(1);
		}
		chunk->queue.ifindex = nif;
		chunk->recv_blocking = 0;
		chunk->cnt = 0;
	}
}
/*----------------------------------------------------------------------------*/
/*
 * Send the packets queued in ctx->w_chunk[ifidx] with ps_send_chunk().
 *
 * Returns the ps_send_chunk() result: negative on failure, otherwise the
 * number of packets sent (possibly 0). Returns 0 when nothing is queued.
 * If only part of the chunk was sent, the bookkeeping of the unsent packets
 * is moved to the front of the chunk so that a later call retries them.
 */
int
FlushWriteBuffer(struct mtcp_thread_context* ctx, int ifidx)
{
int ret = 0;
struct ps_chunk* w_chunk = ctx->w_chunk;
mtcp_manager_t mtcp = ctx->mtcp_manager;
int i;
int drop = 0;
/* NOTE(review): ctx has already been dereferenced by the initializers
 * above, so the ctx assertion cannot catch a NULL ctx. */
assert(ctx != NULL);
assert(w_chunk != NULL);
if (w_chunk[ifidx].cnt > 0) {
STAT_COUNT(mtcp->runstat.rounds_tx_try);
ret = ps_send_chunk(ctx->handle, &w_chunk[ifidx]);
/* number of queued packets that were not sent */
drop = ctx->w_chunk[ifidx].cnt - ret;
if (ret < 0) {
TRACE_ERROR("ps_send_chunk failed to send chunks, %d:%d\n",
ifidx, w_chunk[ifidx].cnt);
return ret;
} else {
#ifdef NETSTAT
mtcp->nstat.tx_packets[ifidx] += ret;
#endif /* NETSTAT */
for (i = 0; i < ret; i++) {
#ifdef PKTDUMP
DumpPacket(mtcp,
w_chunk[ifidx].buf + w_chunk[ifidx].info[i].offset,
w_chunk[ifidx].info[i].len, "OUT", ifidx);
#endif /* PKTDUMP */
#ifdef NETSTAT
mtcp->nstat.tx_bytes[ifidx] += w_chunk[ifidx].info[i].len + 24;
#endif /* NETSTAT */
}
#ifdef NETSTAT
if (ret != w_chunk[ifidx].cnt) {
mtcp->nstat.tx_drops[ifidx] += (w_chunk[ifidx].cnt - ret);
}
#endif /* NETSTAT */
/* nothing was sent: leave the chunk untouched for a later retry */
if (ret == 0) {
return ret;
}
}
#ifdef PKTDUMP
thread_printf(mtcp, mtcp->log_fp, "sent chunks, ret: %d (tries: %d)\n",
ret, w_chunk[ifidx].cnt);
thread_printf(mtcp, mtcp->log_fp, "======================================"
"======================================================"
"====================\n\n");
#endif /* PKTDUMP */
if (drop > 0) {
/* Partial send: shift the len/offset records of the unsent packets to
 * the front. Only the records move; the payload bytes stay where they
 * are in the chunk buffer. */
ctx->w_chunk[ifidx].cnt = drop;
for (i = 0; i < drop; i++) {
ctx->w_chunk[ifidx].info[i].len =
ctx->w_chunk[ifidx].info[ret + i].len;
ctx->w_chunk[ifidx].info[i].offset =
ctx->w_chunk[ifidx].info[ret + i].offset;
}
/* next write offset: end of the last remaining packet, rounded up to
 * a multiple of 64 bytes */
ctx->w_off[ifidx] = ctx->w_chunk[ifidx].info[drop - 1].offset +
(ctx->w_chunk[ifidx].info[drop - 1].len + 63) / 64 * 64;
ctx->w_cur_idx[ifidx] += ret;
} else {
/* everything was sent: reset the chunk to empty */
ctx->w_chunk[ifidx].cnt = 0;
ctx->w_off[ifidx] = 0;
ctx->w_cur_idx[ifidx] = 0;
}
}
return ret;
}
/*----------------------------------------------------------------------------*/
/*
 * Reserve room for one outgoing packet of len bytes in the send chunk of
 * interface ifidx and return a pointer to it.
 *
 * method selects what happens when the chunk is full: BUF_RET_MAYBE gives
 * up, BUF_RET_ALWAYS tries FlushWriteBuffer() first and gives up only if
 * that sends nothing.
 *
 * Returns NULL for an out-of-range ifidx or when no room could be made.
 * Each reservation advances the write offset by len rounded up to a
 * multiple of 64 bytes.
 */
static inline char *
GetWriteBuffer(struct mtcp_thread_context *ctx, int method, int ifidx, int len)
{
	struct ps_chunk *w_chunk = ctx->w_chunk;
	uint32_t *w_off = ctx->w_off;
	struct ps_chunk *chunk;
	int slot;

	assert(w_chunk != NULL);
	assert(w_off != NULL);

	if (ifidx < 0 || ifidx >= CONFIG.eths_num)
		return NULL;

	chunk = &w_chunk[ifidx];

	//pthread_mutex_lock(&ctx->send_lock);

	if (ctx->w_cur_idx[ifidx] + chunk->cnt >= MAX_SEND_PCK_CHUNK) {
		switch (method) {
		case BUF_RET_MAYBE:
			return NULL;
		case BUF_RET_ALWAYS:
			if (FlushWriteBuffer(ctx, ifidx) <= 0)
				return NULL;
			break;
		default:
			assert(0);
			break;
		}
	}

	assert(ctx->w_cur_idx[ifidx] + chunk->cnt < MAX_SEND_PCK_CHUNK);
	assert(w_off[ifidx] < MAX_PACKET_SIZE * MAX_CHUNK_SIZE);

	slot = chunk->cnt++;
	chunk->info[slot].len = len;
	chunk->info[slot].offset = w_off[ifidx];
	w_off[ifidx] += (len + 63) / 64 * 64;

	//pthread_mutex_unlock(&ctx->send_lock);

	return (chunk->buf + chunk->info[slot].offset);
}
/*----------------------------------------------------------------------------*/
#else /* E_PSIO */
/*
 * Send the packets pending in the chunk buffer of interface nif with
 * ps_send_chunk_buf().
 *
 * Returns 0 when nothing is pending, otherwise the ps_send_chunk_buf()
 * result (negative on failure, which is logged).
 */
int
FlushSendChunkBuf(mtcp_manager_t mtcp, int nif)
{
	struct ps_chunk_buf *c_buf = &mtcp->ctx->w_chunk_buf[nif];
	int pending;
	int sent;
	int idx;
	int k;

	if (!c_buf)
		return -1;

	pending = c_buf->cnt;
	if (pending <= 0)
		return 0;

	STAT_COUNT(mtcp->runstat.rounds_tx_try);

	/* remember where sending starts so the sent entries can be walked */
	idx = c_buf->next_to_send;
	sent = ps_send_chunk_buf(mtcp->ctx->handle, c_buf);

	for (k = 0; k < sent; k++) {
#ifdef NETSTAT
		mtcp->nstat.tx_bytes[nif] += c_buf->info[idx].len + 24;
#endif
#if PKTDUMP
		DumpPacket(mtcp, c_buf->buf + c_buf->info[idx].offset,
				c_buf->info[idx].len, "OUT", nif);
#endif
		idx = (idx + 1) % ENTRY_CNT;
	}

	if (sent < 0) {
		TRACE_ERROR("ps_send_chunk_buf failed. "
				"ret: %d, error: %s\n", sent, strerror(errno));
	}
#ifdef NETSTAT
	else {
		mtcp->nstat.tx_packets[nif] += sent;
	}
#endif

	return sent;
}
#endif /* E_PSIO */
/*----------------------------------------------------------------------------*/
/*
 * Reserve space for an outgoing frame on interface nif and fill in its
 * Ethernet header.
 *
 * h_proto   - EtherType in host byte order.
 * nif       - index into CONFIG.eths; its haddr becomes the source MAC.
 * dst_haddr - destination MAC address (ETH_ALEN bytes).
 * iplen     - payload length; ETHERNET_HEADER_LEN is added for the buffer
 *             request.
 *
 * Returns a pointer to the first byte after the Ethernet header, or NULL
 * when no write buffer is available.
 */
uint8_t *
EthernetOutput(struct mtcp_manager *mtcp, uint16_t h_proto,
		int nif, unsigned char* dst_haddr, uint16_t iplen)
{
	char *frame;
	struct ethhdr *hdr;
	int k;

#if E_PSIO || USE_CHUNK_BUF
	struct ps_chunk_buf *c_buf = &mtcp->ctx->w_chunk_buf[nif];

	frame = ps_assign_chunk_buf(c_buf, iplen + ETHERNET_HEADER_LEN);
#else
	frame = GetWriteBuffer(mtcp->ctx,
			BUF_RET_MAYBE, nif, iplen + ETHERNET_HEADER_LEN);
#endif
	if (frame == NULL)
		return NULL;

	hdr = (struct ethhdr *)frame;
	for (k = 0; k < ETH_ALEN; k++) {
		hdr->h_source[k] = CONFIG.eths[nif].haddr[k];
		hdr->h_dest[k] = dst_haddr[k];
	}
	hdr->h_proto = htons(h_proto);

	return (uint8_t *)(hdr + 1);
}
/*----------------------------------------------------------------------------*/

580
mtcp/src/eventpoll.c Normal file
View File

@ -0,0 +1,580 @@
#include <sys/queue.h>
#include <unistd.h>
#include <time.h>
#include <signal.h>
#include <assert.h>
#include "mtcp.h"
#include "tcp_stream.h"
#include "eventpoll.h"
#include "tcp_in.h"
#include "pipe.h"
#include "debug.h"
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
#define SPIN_BEFORE_SLEEP FALSE
#define SPIN_THRESH 10000000
/*----------------------------------------------------------------------------*/
char *event_str[] = {"NONE", "IN", "PRI", "OUT", "ERR", "HUP", "RDHUP"};
/*----------------------------------------------------------------------------*/
/*
 * Map a single MTCP_EPOLL* event value to its name in event_str.
 *
 * Only exact single-event values are recognized; any other value trips an
 * assertion and, when assertions are compiled out, yields NULL.
 */
char *
EventToString(uint32_t event)
{
	int idx;

	switch (event) {
	case MTCP_EPOLLNONE:
		idx = 0;
		break;
	case MTCP_EPOLLIN:
		idx = 1;
		break;
	case MTCP_EPOLLPRI:
		idx = 2;
		break;
	case MTCP_EPOLLOUT:
		idx = 3;
		break;
	case MTCP_EPOLLERR:
		idx = 4;
		break;
	case MTCP_EPOLLHUP:
		idx = 5;
		break;
	case MTCP_EPOLLRDHUP:
		idx = 6;
		break;
	default:
		assert(0);
		return NULL;
	}

	return event_str[idx];
}
/*----------------------------------------------------------------------------*/
/*
 * Allocate an empty event queue with room for `size` events.
 *
 * Returns the new queue, or NULL if either allocation fails (nothing is
 * leaked in that case). Release it with DestroyEventQueue().
 */
struct event_queue *
CreateEventQueue(int size)
{
	struct event_queue *queue;

	queue = (struct event_queue *)calloc(1, sizeof(struct event_queue));
	if (queue == NULL)
		return NULL;

	queue->events = (struct mtcp_epoll_event_int *)
			calloc(size, sizeof(struct mtcp_epoll_event_int));
	if (queue->events == NULL) {
		free(queue);
		return NULL;
	}

	queue->size = size;
	queue->start = 0;
	queue->end = 0;
	queue->num_events = 0;

	return queue;
}
/*----------------------------------------------------------------------------*/
/*
 * Release an event queue and its event array. eq itself must not be NULL.
 */
void
DestroyEventQueue(struct event_queue *eq)
{
	/* free(NULL) is a no-op, so the array needs no guard */
	free(eq->events);
	free(eq);
}
/*----------------------------------------------------------------------------*/
/*
 * Create an epoll instance for the given mTCP context.
 *
 * size - capacity of each of the three internal event queues; must be > 0.
 *
 * Returns the id of the new epoll socket, or -1 on failure (errno = EINVAL
 * for a non-positive size, ENFILE when no socket is available). On every
 * failure path all resources acquired so far are released again, and the
 * epoll structure is published in mtcp->ep and the socket only once it is
 * fully initialized.
 */
int
mtcp_epoll_create(mctx_t mctx, int size)
{
	mtcp_manager_t mtcp = g_mtcp[mctx->cpu];
	struct mtcp_epoll *ep;
	socket_map_t epsocket;

	if (size <= 0) {
		errno = EINVAL;
		return -1;
	}

	epsocket = AllocateSocket(mctx, MTCP_SOCK_EPOLL, FALSE);
	if (!epsocket) {
		errno = ENFILE;
		return -1;
	}

	ep = (struct mtcp_epoll *)calloc(1, sizeof(struct mtcp_epoll));
	if (!ep)
		goto err_socket;

	/* create event queues */
	ep->usr_queue = CreateEventQueue(size);
	if (!ep->usr_queue)
		goto err_ep;

	ep->usr_shadow_queue = CreateEventQueue(size);
	if (!ep->usr_shadow_queue)
		goto err_usr_queue;

	ep->mtcp_queue = CreateEventQueue(size);
	if (!ep->mtcp_queue)
		goto err_shadow_queue;

	if (pthread_mutex_init(&ep->epoll_lock, NULL))
		goto err_mtcp_queue;

	if (pthread_cond_init(&ep->epoll_cond, NULL))
		goto err_mutex;

	/* NOTE(review): ep->size is never assigned in this function, so this
	 * presumably logs 0 -- confirm whether `size` was meant. */
	TRACE_EPOLL("epoll structure of size %d created.\n", ep->size);

	mtcp->ep = ep;
	epsocket->ep = ep;

	return epsocket->id;

err_mutex:
	pthread_mutex_destroy(&ep->epoll_lock);
err_mtcp_queue:
	DestroyEventQueue(ep->mtcp_queue);
err_shadow_queue:
	DestroyEventQueue(ep->usr_shadow_queue);
err_usr_queue:
	DestroyEventQueue(ep->usr_queue);
err_ep:
	free(ep);
err_socket:
	FreeSocket(mctx, epsocket->id, FALSE);
	return -1;
}
/*----------------------------------------------------------------------------*/
/*
 * Tear down the epoll instance attached to socket epid.
 *
 * Returns 0 on success; -1 if the manager cannot be found, epid is out of
 * range (errno = EBADF) or the socket has no epoll structure
 * (errno = EINVAL).
 *
 * The structure is detached and a waiter is signalled while the lock is
 * held; only afterwards are the synchronization objects destroyed and the
 * memory released, so nothing here touches ep after it has been freed.
 * NOTE(review): a thread woken by the signal may still be using ep when it
 * is freed -- confirm how waiters are expected to be quiesced.
 */
int
CloseEpollSocket(mctx_t mctx, int epid)
{
	mtcp_manager_t mtcp;
	struct mtcp_epoll *ep;

	mtcp = GetMTCPManager(mctx);
	if (!mtcp) {
		return -1;
	}

	/* same range check mtcp_epoll_ctl() applies before indexing smap */
	if (epid < 0 || epid >= CONFIG.max_concurrency) {
		errno = EBADF;
		return -1;
	}

	ep = mtcp->smap[epid].ep;
	if (!ep) {
		errno = EINVAL;
		return -1;
	}

	pthread_mutex_lock(&ep->epoll_lock);
	mtcp->ep = NULL;
	mtcp->smap[epid].ep = NULL;
	pthread_cond_signal(&ep->epoll_cond);
	pthread_mutex_unlock(&ep->epoll_lock);

	pthread_cond_destroy(&ep->epoll_cond);
	pthread_mutex_destroy(&ep->epoll_lock);

	DestroyEventQueue(ep->usr_queue);
	DestroyEventQueue(ep->usr_shadow_queue);
	DestroyEventQueue(ep->mtcp_queue);
	free(ep);

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Queue the events that are already pending on a stream socket at the time
 * it is (re)registered with epoll.
 *
 * A read event is queued when MTCP_EPOLLIN is requested and the receive
 * buffer holds merged data or the stream is in CLOSE_WAIT. A write event is
 * queued when MTCP_EPOLLOUT is requested, there is no send buffer or it
 * holds less than snd_wnd, and no write event is recorded in
 * socket->events yet.
 *
 * Returns 0, or -1 when the socket has no stream or the stream has not
 * reached TCP_ST_ESTABLISHED.
 */
static int
RaisePendingStreamEvents(mtcp_manager_t mtcp,
		struct mtcp_epoll *ep, socket_map_t socket)
{
	tcp_stream *stream = socket->stream;

	if (!stream || stream->state < TCP_ST_ESTABLISHED)
		return -1;

	TRACE_EPOLL("Stream %d at state %s\n",
			stream->id, TCPStateToString(stream));

	/* payload that arrived before the epoll registration */
	if (socket->epoll & MTCP_EPOLLIN) {
		struct tcp_recv_vars *rcvvar = stream->rcvvar;
		int has_data = (rcvvar->rcvbuf && rcvvar->rcvbuf->merged_len > 0);

		if (has_data) {
			TRACE_EPOLL("Socket %d: Has existing payloads\n", socket->id);
			AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN);
		} else if (stream->state == TCP_ST_CLOSE_WAIT) {
			TRACE_EPOLL("Socket %d: Waiting for close\n", socket->id);
			AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN);
		}
	}

	/* same thing for the write event */
	if (socket->epoll & MTCP_EPOLLOUT) {
		struct tcp_send_vars *sndvar = stream->sndvar;
		int writable = (!sndvar->sndbuf ||
				sndvar->sndbuf->len < sndvar->snd_wnd);

		if (writable && !(socket->events & MTCP_EPOLLOUT)) {
			TRACE_EPOLL("Socket %d: Adding write event\n", socket->id);
			AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT);
		}
	}

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Add, modify or remove the epoll registration of a socket.
 *
 * epid   - id of the epoll socket.
 * op     - MTCP_EPOLL_CTL_ADD, MTCP_EPOLL_CTL_MOD or MTCP_EPOLL_CTL_DEL.
 * sockid - id of the socket being (un)registered.
 * event  - requested events and user data; may be NULL only for DEL.
 *
 * Returns 0 on success and -1 on failure with errno set to EBADF (id out of
 * range or unused epoll socket), EINVAL (not an epoll socket, missing epoll
 * structure or missing event), EEXIST (ADD on a registered socket) or
 * ENOENT (MOD/DEL on an unregistered socket). Any other op value is
 * accepted and ignored.
 *
 * MTCP_EPOLLERR and MTCP_EPOLLHUP are always added to the requested events.
 */
int
mtcp_epoll_ctl(mctx_t mctx, int epid,
		int op, int sockid, struct mtcp_epoll_event *event)
{
	mtcp_manager_t mtcp;
	struct mtcp_epoll *ep;
	socket_map_t socket;
	uint32_t events;

	mtcp = GetMTCPManager(mctx);
	if (!mtcp) {
		return -1;
	}

	if (epid < 0 || epid >= CONFIG.max_concurrency) {
		TRACE_API("Epoll id %d out of range.\n", epid);
		errno = EBADF;
		return -1;
	}

	if (sockid < 0 || sockid >= CONFIG.max_concurrency) {
		TRACE_API("Socket id %d out of range.\n", sockid);
		errno = EBADF;
		return -1;
	}

	if (mtcp->smap[epid].socktype == MTCP_SOCK_UNUSED) {
		errno = EBADF;
		return -1;
	}

	if (mtcp->smap[epid].socktype != MTCP_SOCK_EPOLL) {
		errno = EINVAL;
		return -1;
	}

	ep = mtcp->smap[epid].ep;
	if (!ep || (!event && op != MTCP_EPOLL_CTL_DEL)) {
		errno = EINVAL;
		return -1;
	}
	socket = &mtcp->smap[sockid];

	if (op == MTCP_EPOLL_CTL_ADD) {
		if (socket->epoll) {
			errno = EEXIST;
			return -1;
		}

		/* EPOLLERR and EPOLLHUP are registered as default */
		events = event->events;
		events |= (MTCP_EPOLLERR | MTCP_EPOLLHUP);
		socket->ep_data = event->data;
		socket->epoll = events;

		TRACE_EPOLL("Adding epoll socket %d(type %d) ET: %u, IN: %u, OUT: %u\n",
				socket->id, socket->socktype, socket->epoll & MTCP_EPOLLET,
				socket->epoll & MTCP_EPOLLIN, socket->epoll & MTCP_EPOLLOUT);

		if (socket->socktype == MTCP_SOCK_STREAM) {
			RaisePendingStreamEvents(mtcp, ep, socket);
		} else if (socket->socktype == MTCP_SOCK_PIPE) {
			RaisePendingPipeEvents(mctx, epid, sockid);
		}

	} else if (op == MTCP_EPOLL_CTL_MOD) {
		if (!socket->epoll) {
			/* ep->epoll_lock is never taken in this function, so
			 * it must not be unlocked on this error path */
			errno = ENOENT;
			return -1;
		}

		events = event->events;
		events |= (MTCP_EPOLLERR | MTCP_EPOLLHUP);
		socket->ep_data = event->data;
		socket->epoll = events;

		if (socket->socktype == MTCP_SOCK_STREAM) {
			RaisePendingStreamEvents(mtcp, ep, socket);
		} else if (socket->socktype == MTCP_SOCK_PIPE) {
			RaisePendingPipeEvents(mctx, epid, sockid);
		}

	} else if (op == MTCP_EPOLL_CTL_DEL) {
		if (!socket->epoll) {
			errno = ENOENT;
			return -1;
		}

		socket->epoll = MTCP_EPOLLNONE;
	}

	return 0;
}
/*----------------------------------------------------------------------------*/
int
mtcp_epoll_wait(mctx_t mctx, int epid,
struct mtcp_epoll_event *events, int maxevents, int timeout)
{
mtcp_manager_t mtcp;
struct mtcp_epoll *ep;
struct event_queue *eq;
struct event_queue *eq_shadow;
socket_map_t event_socket;
int validity;
int i, cnt, ret;
int num_events;
mtcp = GetMTCPManager(mctx);
if (!mtcp) {
return -1;
}
if (epid < 0 || epid >= CONFIG.max_concurrency) {
TRACE_API("Epoll id %d out of range.\n", epid);
errno = EBADF;
return -1;
}
if (mtcp->smap[epid].socktype == MTCP_SOCK_UNUSED) {
errno = EBADF;
return -1;
}
if (mtcp->smap[epid].socktype != MTCP_SOCK_EPOLL) {
errno = EINVAL;
return -1;
}
ep = mtcp->smap[epid].ep;
if (!ep || !events || maxevents <= 0) {
errno = EINVAL;
return -1;
}
ep->stat.calls++;
#if SPIN_BEFORE_SLEEP
int spin = 0;
while (ep->num_events == 0 && spin < SPIN_THRESH) {
spin++;
}
#endif /* SPIN_BEFORE_SLEEP */
if (pthread_mutex_lock(&ep->epoll_lock)) {
if (errno == EDEADLK)
perror("mtcp_epoll_wait: epoll_lock blocked\n");
assert(0);
}
wait:
eq = ep->usr_queue;
eq_shadow = ep->usr_shadow_queue;
/* wait until event occurs */
while (eq->num_events == 0 && eq_shadow->num_events == 0 && timeout != 0) {
#if INTR_SLEEPING_MTCP
/* signal to mtcp thread if it is sleeping */
if (mtcp->wakeup_flag && mtcp->is_sleeping) {
pthread_kill(mtcp->ctx->thread, SIGUSR1);
}
#endif
ep->stat.waits++;
ep->waiting = TRUE;
if (timeout > 0) {
struct timespec deadline;
clock_gettime(CLOCK_REALTIME, &deadline);
if (timeout > 1000) {
int sec;
sec = timeout / 1000;
deadline.tv_sec += sec;
timeout -= sec * 1000;
}
if (deadline.tv_nsec >= 1000000000) {
deadline.tv_sec++;
deadline.tv_nsec -= 1000000000;
}
//deadline.tv_sec = mtcp->cur_tv.tv_sec;
//deadline.tv_nsec = (mtcp->cur_tv.tv_usec + timeout * 1000) * 1000;
ret = pthread_cond_timedwait(&ep->epoll_cond,
&ep->epoll_lock, &deadline);
if (ret && ret != ETIMEDOUT) {
/* errno set by pthread_cond_timedwait() */
pthread_mutex_unlock(&ep->epoll_lock);
TRACE_ERROR("pthread_cond_timedwait failed. ret: %d, error: %s\n",
ret, strerror(errno));
return -1;
}
timeout = 0;
} else if (timeout < 0) {
ret = pthread_cond_wait(&ep->epoll_cond, &ep->epoll_lock);
if (ret) {
/* errno set by pthread_cond_wait() */
pthread_mutex_unlock(&ep->epoll_lock);
TRACE_ERROR("pthread_cond_wait failed. ret: %d, error: %s\n",
ret, strerror(errno));
return -1;
}
}
ep->waiting = FALSE;
if (mtcp->ctx->done || mtcp->ctx->exit || mtcp->ctx->interrupt) {
mtcp->ctx->interrupt = FALSE;
//ret = pthread_cond_signal(&ep->epoll_cond);
pthread_mutex_unlock(&ep->epoll_lock);
errno = EINTR;
return -1;
}
}
/* fetch events from the user event queue */
cnt = 0;
num_events = eq->num_events;
for (i = 0; i < num_events && cnt < maxevents; i++) {
event_socket = &mtcp->smap[eq->events[eq->start].sockid];
validity = TRUE;
if (event_socket->socktype == MTCP_SOCK_UNUSED)
validity = FALSE;
if (!(event_socket->epoll & eq->events[eq->start].ev.events))
validity = FALSE;
if (!(event_socket->events & eq->events[eq->start].ev.events))
validity = FALSE;
if (validity) {
events[cnt++] = eq->events[eq->start].ev;
assert(eq->events[eq->start].sockid >= 0);
TRACE_EPOLL("Socket %d: Handled event. event: %s, "
"start: %u, end: %u, num: %u\n",
event_socket->id,
EventToString(eq->events[eq->start].ev.events),
eq->start, eq->end, eq->num_events);
ep->stat.handled++;
} else {
TRACE_EPOLL("Socket %d: event %s invalidated.\n",
eq->events[eq->start].sockid,
EventToString(eq->events[eq->start].ev.events));
ep->stat.invalidated++;
}
event_socket->events &= (~eq->events[eq->start].ev.events);
eq->start++;
eq->num_events--;
if (eq->start >= eq->size) {
eq->start = 0;
}
}
/* fetch eventes from user shadow event queue */
eq = ep->usr_shadow_queue;
num_events = eq->num_events;
for (i = 0; i < num_events && cnt < maxevents; i++) {
event_socket = &mtcp->smap[eq->events[eq->start].sockid];
validity = TRUE;
if (event_socket->socktype == MTCP_SOCK_UNUSED)
validity = FALSE;
if (!(event_socket->epoll & eq->events[eq->start].ev.events))
validity = FALSE;
if (!(event_socket->events & eq->events[eq->start].ev.events))
validity = FALSE;
if (validity) {
events[cnt++] = eq->events[eq->start].ev;
assert(eq->events[eq->start].sockid >= 0);
TRACE_EPOLL("Socket %d: Handled event. event: %s, "
"start: %u, end: %u, num: %u\n",
event_socket->id,
EventToString(eq->events[eq->start].ev.events),
eq->start, eq->end, eq->num_events);
ep->stat.handled++;
} else {
TRACE_EPOLL("Socket %d: event %s invalidated.\n",
eq->events[eq->start].sockid,
EventToString(eq->events[eq->start].ev.events));
ep->stat.invalidated++;
}
event_socket->events &= (~eq->events[eq->start].ev.events);
eq->start++;
eq->num_events--;
if (eq->start >= eq->size) {
eq->start = 0;
}
}
if (cnt == 0 && timeout != 0)
goto wait;
pthread_mutex_unlock(&ep->epoll_lock);
return cnt;
}
/*----------------------------------------------------------------------------*/
/*
 * Append `event` for `socket` to one of the epoll instance's queues.
 *
 * queue_type selects the destination (MTCP_EVENT_QUEUE, USR_EVENT_QUEUE or
 * USR_SHADOW_EVENT_QUEUE). Only USR_EVENT_QUEUE is modified under
 * ep->epoll_lock by this function.
 *
 * Returns 0 when the event was queued or was already pending on the socket,
 * -1 on bad arguments, unknown queue type, or a full queue.
 */
inline int
AddEpollEvent(struct mtcp_epoll *ep,
		int queue_type, socket_map_t socket, uint32_t event)
{
	struct event_queue *target;
	struct mtcp_epoll_event_int *entry;
	int locked;
	int index;

	if (!ep || !socket || !event)
		return -1;

	ep->stat.issued++;

	/* the same event is already pending for this socket */
	if (socket->events & event) {
		return 0;
	}

	switch (queue_type) {
	case MTCP_EVENT_QUEUE:
		target = ep->mtcp_queue;
		break;
	case USR_EVENT_QUEUE:
		target = ep->usr_queue;
		pthread_mutex_lock(&ep->epoll_lock);
		break;
	case USR_SHADOW_EVENT_QUEUE:
		target = ep->usr_shadow_queue;
		break;
	default:
		TRACE_ERROR("Non-existing event queue type!\n");
		return -1;
	}
	locked = (queue_type == USR_EVENT_QUEUE);

	if (target->num_events >= target->size) {
		TRACE_ERROR("Exceeded epoll event queue! num_events: %d, size: %d\n",
				target->num_events, target->size);
		if (locked)
			pthread_mutex_unlock(&ep->epoll_lock);
		return -1;
	}

	/* claim the tail slot and mark the event pending on the socket */
	index = target->end++;
	socket->events |= event;

	entry = &target->events[index];
	entry->sockid = socket->id;
	entry->ev.events = event;
	entry->ev.data = socket->ep_data;

	/* the tail index wraps to 0 at the queue size */
	if (target->end >= target->size) {
		target->end = 0;
	}
	target->num_events++;

#if 0
	TRACE_EPOLL("Socket %d New event: %s, start: %u, end: %u, num: %u\n",
			ep->events[index].sockid,
			EventToString(ep->events[index].ev.events),
			ep->start, ep->end, ep->num_events);
#endif

	if (locked)
		pthread_mutex_unlock(&ep->epoll_lock);

	ep->stat.registered++;

	return 0;
}

122
mtcp/src/fhash.c Normal file
View File

@ -0,0 +1,122 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/queue.h>
#include "debug.h"
#include "fhash.h"
/*----------------------------------------------------------------------------*/
/*
 * Allocate a zero-initialized hash table and install its callbacks.
 *
 * hashfn : maps a stream to a bin index
 * eqfn   : returns non-zero when two streams are considered equal
 *
 * Every bin's list head is initialized empty. Returns NULL if the allocation
 * fails. Release the table with DestroyHashtable().
 */
struct hashtable *
CreateHashtable(unsigned int (*hashfn) (const tcp_stream *), // key function
		int (*eqfn) (const tcp_stream*,
			const tcp_stream *)) // equality
{
	struct hashtable *table;
	int bin;

	table = calloc(1, sizeof(*table));
	if (table == NULL) {
		TRACE_ERROR("calloc: CreateHashtable");
		return NULL;
	}

	table->hashfn = hashfn;
	table->eqfn = eqfn;

	/* start with an empty list in each bin */
	for (bin = 0; bin < NUM_BINS; bin++) {
		TAILQ_INIT(&table->ht_table[bin]);
	}

	return table;
}
/*----------------------------------------------------------------------------*/
/*
 * Release a table obtained from CreateHashtable(). Only the table itself is
 * freed; the streams linked into it are not touched. NULL is accepted.
 */
void
DestroyHashtable(struct hashtable *ht)
{
	free(ht);
}
/*----------------------------------------------------------------------------*/
/*
 * Insert `item` into the table.
 *
 * With STATIC_TABLE enabled, the first free slot of the bin's fixed array is
 * used and item->ht_idx records that slot. Otherwise (or when the array is
 * full) the item is appended to the bin's list and item->ht_idx is set to
 * TCP_AR_CNT to mark it as list-resident. Always returns 0.
 */
int
HTInsert(struct hashtable *ht, tcp_stream *item)
{
	/* create an entry*/
	int idx;
#if STATIC_TABLE
	int i;
#endif

	assert(ht);
	/* NOTE(review): confirm this bound matches the declared width of
	 * ht_count in fhash.h */
	assert(ht->ht_count <= 65535);

	idx = ht->hashfn(item);
	assert(idx >=0 && idx < NUM_BINS);

#if STATIC_TABLE
	for (i = 0; i < TCP_AR_CNT; i++) {
		// insert into empty array slot
		if (!ht->ht_array[idx][i]) {
			ht->ht_array[idx][i] = item;
			item->ht_idx = i;
			ht->ht_count++;
			return 0;
		}
	}

	/* array slots exhausted: fall back to the per-bin list */
	TRACE_INFO("[WARNING] HTInsert() cnt: %d!!\n", TCP_AR_CNT);
#endif

	TAILQ_INSERT_TAIL(&ht->ht_table[idx], item, rcvvar->he_link);

	item->ht_idx = TCP_AR_CNT;
	ht->ht_count++;

	return 0;
}
/*----------------------------------------------------------------------------*/
/*
 * Unlink `item` from the table and return it.
 *
 * With STATIC_TABLE enabled, an item whose ht_idx is below TCP_AR_CNT lives
 * in the bin's fixed array and its slot is cleared; any other item is removed
 * from the bin's list. The item must currently be in the table.
 */
void*
HTRemove(struct hashtable *ht, tcp_stream *item)
{
	hash_bucket_head *head;
	int idx = ht->hashfn(item);

#if STATIC_TABLE
	if (item->ht_idx < TCP_AR_CNT) {
		assert(ht->ht_array[idx][item->ht_idx]);
		ht->ht_array[idx][item->ht_idx] = NULL;
	} else {
#endif
		head = &ht->ht_table[idx];
		TAILQ_REMOVE(head, item, rcvvar->he_link);
#if STATIC_TABLE
	}
#endif

	ht->ht_count--;
	return (item);
}
/*----------------------------------------------------------------------------*/
/*
 * Look up the stored stream that compares equal to `item` according to the
 * table's eqfn. Returns the stored stream, or NULL when there is no match.
 *
 * With STATIC_TABLE enabled, the bin's fixed array is scanned before its list.
 */
tcp_stream*
HTSearch(struct hashtable *ht, const tcp_stream *item)
{
	int idx;
	tcp_stream *walk;
	hash_bucket_head *head;
#if STATIC_TABLE
	int i;
#endif

	idx = ht->hashfn(item);

#if STATIC_TABLE
	for (i = 0; i < TCP_AR_CNT; i++) {
		if (ht->ht_array[idx][i]) {
			if (ht->eqfn(ht->ht_array[idx][i], item))
				return ht->ht_array[idx][i];
		}
	}
#endif

	/* reuse the bin index instead of hashing the item a second time */
	head = &ht->ht_table[idx];
	TAILQ_FOREACH(walk, head, rcvvar->he_link) {
		if (ht->eqfn(walk, item))
			return walk;
	}

	return NULL;
}
/*----------------------------------------------------------------------------*/

View File

@ -0,0 +1,37 @@
#ifndef __ADDR_POOL_H_
#define __ADDR_POOL_H_
#include <netinet/in.h>
#include <sys/queue.h>
/*----------------------------------------------------------------------------*/
typedef struct addr_pool *addr_pool_t;
/*----------------------------------------------------------------------------*/
/* CreateAddressPool() */
/* Create address pool for given address range. */
/* addr_base: the base address in network order. */
/* num_addr: number of addresses to use as source IP */
/*----------------------------------------------------------------------------*/
addr_pool_t
CreateAddressPool(in_addr_t addr_base, int num_addr);
/*----------------------------------------------------------------------------*/
/* CreateAddressPoolPerCore() */
/* Create address pool only for the given core number. */
/* All addresses and port numbers should be in network order. */
/*----------------------------------------------------------------------------*/
addr_pool_t
CreateAddressPoolPerCore(int core, int num_queues,
in_addr_t saddr_base, int num_addr, in_addr_t daddr, in_port_t dport);
/*----------------------------------------------------------------------------*/
void
DestroyAddressPool(addr_pool_t ap);
/*----------------------------------------------------------------------------*/
int
FetchAddress(addr_pool_t ap, int core, int num_queues,
const struct sockaddr_in *daddr, struct sockaddr_in *saddr);
/*----------------------------------------------------------------------------*/
int
FreeAddress(addr_pool_t ap, const struct sockaddr_in *addr);
/*----------------------------------------------------------------------------*/
#endif /* __ADDR_POOL_H_ */

28
mtcp/src/include/arp.h Normal file
View File

@ -0,0 +1,28 @@
#ifndef __ARP_H_
#define __ARP_H_
#define MAX_ARPENTRY 1024
int
InitARPTable();
unsigned char *
GetHWaddr(uint32_t ip);
unsigned char *
GetDestinationHWaddr(uint32_t dip);
void
RequestARP(mtcp_manager_t mtcp, uint32_t ip, int nif, uint32_t cur_ts);
int
ProcessARPPacket(mtcp_manager_t mtcp, uint32_t cur_ts,
const int ifidx, unsigned char* pkt_data, int len);
void
PublishARP(mtcp_manager_t mtcp);
void
PrintARPTable();
#endif /* __ARP_H_ */

43
mtcp/src/include/config.h Normal file
View File

@ -0,0 +1,43 @@
#ifndef __CONFIG_H_
#define __CONFIG_H_
#include "ps.h"
/* Global configuration state shared by the stack.
 * NOTE(review): these are definitions, not extern declarations, so every
 * translation unit that includes config.h emits its own tentative definition.
 * That links only when the toolchain merges common symbols (e.g. -fcommon).
 * Consider `extern` here plus a single definition in one .c file. */
int num_cpus;
int num_queues;
int num_devices;
struct ps_device devices[MAX_DEVICES];
int num_devices_attached;
int devices_attached[MAX_DEVICES];
int
LoadConfiguration(char *fname);
/* set configurations from the configured
interface information */
int
SetInterfaceInfo();
/* set configurations from the files */
int
SetRoutingTable();
int
LoadARPTable();
/* print the current configuration */
void
PrintConfiguration();
void
PrintInterfaceInfo();
void
PrintRoutingTable();
/* set socket modes */
int
SetSocketMode(int8_t socket_mode);
#endif /* __CONFIG_H_ */

6
mtcp/src/include/cpu.h Normal file
View File

@ -0,0 +1,6 @@
#ifndef __CPU_H_
#define __CPU_H_
int GetNumCPUs();
#endif /* __CPU_H_ */

228
mtcp/src/include/debug.h Normal file
View File

@ -0,0 +1,228 @@
#ifndef __DEBUG_H_
#define __DEBUG_H_
#include <errno.h>
#include <stdio.h>
#include <assert.h>
#include "mtcp.h"
#include "tcp_in.h"
/*
 * Core trace macros. Each is compiled in by defining the symbol tested in its
 * #ifdef; when disabled it expands to (void)0 and its arguments are not
 * evaluated.
 *
 * When enabled, the macros below expand to a brace block rather than an
 * expression, so `if (x) TRACE_ERROR(...); else ...` does not compile --
 * wrap such uses in braces.
 *
 * TRACE_TEMP, CTRACE_ERROR and TRACE_DBG reference a variable named `mtcp`
 * that must be in scope at the point of use.
 */
#ifdef DBGTEMP
#define TRACE_TEMP(f, m...) { \
fprintf(stderr, "[CPU %d][%10s:%4d] " f, mtcp->ctx->cpu, \
__FUNCTION__, __LINE__, ##m); \
}
#else
#define TRACE_TEMP(f, m...) (void)0
#endif /* DBGTEMP*/
/* error message to stderr, prefixed with function name and line */
#ifdef DBGERR
#define TRACE_ERROR(f, m...) { \
fprintf(stderr, "[%10s:%4d] " f, __FUNCTION__, __LINE__, ##m); \
}
#else
#define TRACE_ERROR(f, m...) (void)0
#endif /* DBGERR */
/* like TRACE_ERROR, additionally prefixed with the CPU of the mtcp context */
#ifdef DBGCERR
#define CTRACE_ERROR(f, m...) { \
fprintf(stderr, "[CPU %d][%10s:%4d] " f, mtcp->ctx->cpu, __FUNCTION__, __LINE__, ##m); \
}
#else
#define CTRACE_ERROR(f, m...) (void)0
#endif /* DBGCERR */
/* debug message routed through thread_printf() to mtcp->log_fp */
#ifdef DBGMSG
#define TRACE_DBG(f, m...) {\
thread_printf(mtcp, mtcp->log_fp, "[%10s:%4d] " \
f, __FUNCTION__, __LINE__, ##m); \
}
#else
#define TRACE_DBG(f, m...) (void)0
#endif /* DBGMSG */
/* informational message to stderr */
#ifdef INFO
#define TRACE_INFO(f, m...) { \
fprintf(stderr, "[%10s:%4d] " f,__FUNCTION__, __LINE__, ##m); \
}
#else
#define TRACE_INFO(f, m...) (void)0
#endif /* INFO */
/* always enabled; prints to stderr without any prefix */
#define TRACE_CONFIG(f, m...) fprintf(stderr, f, ##m)
/*
 * Per-subsystem trace macros. Defining the symbol tested by each #ifdef
 * compiles that category in; otherwise the macro expands to (void)0.
 * TRACE_LOG forwards to TRACE_INFO; all others forward to TRACE_FUNC with a
 * category tag as the first argument. TRACE_LOOP also prints `cur_ts`, so a
 * variable of that name must be in scope where it is used.
 */
#ifdef DBGLOG
#define TRACE_LOG(f, m...) TRACE_INFO(f, ##m)
#else
#define TRACE_LOG(f, m...) (void)0
#endif
#ifdef STREAM
#define TRACE_STREAM(f, m...) TRACE_FUNC("STREAM", f, ##m)
#else
#define TRACE_STREAM(f, m...) (void)0
#endif
#ifdef STATE
#define TRACE_STATE(f, m...) TRACE_FUNC("STATE", f, ##m)
#else
#define TRACE_STATE(f, m...) (void)0
#endif
#ifdef SNDBUF
#define TRACE_SNDBUF(f, m...) TRACE_FUNC("SNDBUF", f, ##m)
#else
#define TRACE_SNDBUF(f, m...) (void)0
#endif
#ifdef RCVBUF
#define TRACE_RCVBUF(f, m...) TRACE_FUNC("RCVBUF", f, ##m)
#else
#define TRACE_RCVBUF(f, m...) (void)0
#endif
#ifdef CLWND
#define TRACE_CLWND(f, m...) TRACE_FUNC("CLWND", f, ##m)
#else
#define TRACE_CLWND(f, m...) (void)0
#endif
#ifdef LOSS
#define TRACE_LOSS(f, m...) TRACE_FUNC("LOSS", f, ##m)
#else
#define TRACE_LOSS(f, m...) (void)0
#endif
#ifdef SACK
#define TRACE_SACK(f, m...) TRACE_FUNC("SACK", f, ##m)
#else
#define TRACE_SACK(f, m...) (void)0
#endif
#ifdef TSTAMP
#define TRACE_TSTAMP(f, m...) TRACE_FUNC("TSTAMP", f, ##m)
#else
#define TRACE_TSTAMP(f, m...) (void)0
#endif
#ifdef RTT
#define TRACE_RTT(f, m...) TRACE_FUNC("RTT", f, ##m)
#else
#define TRACE_RTT(f, m...) (void)0
#endif
#ifdef RTO
#define TRACE_RTO(f, m...) TRACE_FUNC("RTO", f, ##m)
#else
#define TRACE_RTO(f, m...) (void)0
#endif
#ifdef CONG
#define TRACE_CONG(f, m...) TRACE_FUNC("CONG", f, ##m)
#else
#define TRACE_CONG(f, m...) (void)0
#endif
#ifdef EPOLL
#define TRACE_EPOLL(f, m...) TRACE_FUNC("EPOLL", f, ##m)
#else
#define TRACE_EPOLL(f, m...) (void)0
#endif
#ifdef FSTAT
#define TRACE_FSTAT(f, m...) TRACE_FUNC("FSTAT", f, ##m)
#else
#define TRACE_FSTAT(f, m...) (void)0
#endif
#ifdef APP
#define TRACE_APP(f, m...) TRACE_FUNC("APP", f, ##m)
#else
#define TRACE_APP(f, m...) (void)0
#endif
#ifdef DBGFIN
#define TRACE_FIN(f, m...) TRACE_FUNC("FIN", f, ##m)
#else
#define TRACE_FIN(f, m...) (void)0
#endif
#ifdef TSTAT
#define TRACE_TSTAT(f, m...) TRACE_FUNC("TSTAT", f, ##m)
#else
#define TRACE_TSTAT(f, m...) (void)0
#endif
#ifdef LOOP
#define TRACE_LOOP(f, m...) TRACE_FUNC("LOOP", "ts: %u, "f, cur_ts, ##m)
#else
#define TRACE_LOOP(f, m...) (void)0
#endif
#ifdef ROUND
#define TRACE_ROUND(f, m...) TRACE_FUNC("ROUND", f, ##m)
#else
#define TRACE_ROUND(f, m...) (void)0
#endif
#ifdef SELECT
#define TRACE_SELECT(f, m...) TRACE_FUNC("SELECT", f, ##m)
#else
#define TRACE_SELECT(f, m...) (void)0
#endif
#ifdef API
#define TRACE_API(f, m...) TRACE_FUNC("API", f, ##m)
#else
#define TRACE_API(f, m...) (void)0
#endif
/*
 * TRACE_FUNC(n, f, ...): back end of the per-subsystem trace macros.
 *   n : category tag string printed in the prefix
 *   f : printf-style format string (must be a literal; it is concatenated)
 * When DBGFUNC is defined the message goes through thread_printf() to
 * mtcp->log_fp, so a variable named `mtcp` must be in scope. Otherwise the
 * macro expands to (void)0 and none of its arguments are evaluated.
 * Both variants take the same parameter list.
 */
#ifdef DBGFUNC
#define TRACE_FUNC(n, f, m...) { \
	thread_printf(mtcp, mtcp->log_fp, "[%6s: %10s:%4d] " \
			f, n, __FUNCTION__, __LINE__, ##m); \
	}
#else
#define TRACE_FUNC(n, f, m...) (void)0
#endif /* DBGFUNC */
void
DumpPacket(mtcp_manager_t mtcp, char *buf, int len, char *step, int ifindex);
void
DumpIPPacket(mtcp_manager_t mtcp, const struct iphdr *iph, int len);
void
DumpIPPacketToFile(FILE *fout, const struct iphdr *iph, int len);
void
flush_log_data(mtcp_manager_t mtcp);
void
thread_printf(mtcp_manager_t mtcp, FILE* f_idx, const char* _Format, ...);
#endif /* __DEBUG_H_ */

10
mtcp/src/include/eth_in.h Normal file
View File

@ -0,0 +1,10 @@
#ifndef __ETH_IN_H_
#define __ETH_IN_H_
#include "mtcp.h"
int
ProcessPacket(mtcp_manager_t mtcp, const int ifidx,
uint32_t cur_ts, unsigned char *pkt_data, int len);
#endif /* __ETH_IN_H_ */

View File

@ -0,0 +1,30 @@
#ifndef __ETH_OUT_H_
#define __ETH_OUT_H_
#include <stdint.h>
#include "mtcp.h"
#include "tcp_stream.h"
#include "ps.h"
#define MAX_SEND_PCK_CHUNK 64
#if !(E_PSIO || USE_CHUNK_BUF)
inline void
InitWriteChunks(struct ps_handle* handle, struct ps_chunk *w_chunk);
int
FlushWriteBuffer(struct mtcp_thread_context *ctx, int ifidx);
#else
int
FlushSendChunkBuf(mtcp_manager_t mtcp, int nif);
#endif
uint8_t *
EthernetOutput(struct mtcp_manager *mtcp, uint16_t h_proto,
int nif, unsigned char* dst_haddr, uint16_t iplen);
#endif /* __ETH_OUT_H_ */

View File

@ -0,0 +1,60 @@
#ifndef __EVENTPOLL_H_
#define __EVENTPOLL_H_
#include "mtcp_api.h"
#include "mtcp_epoll.h"
/*----------------------------------------------------------------------------*/
/* Per-epoll-instance counters maintained by eventpoll.c. */
struct mtcp_epoll_stat
{
/* mtcp_epoll_wait() calls that passed argument validation */
uint64_t calls;
/* iterations of the blocking loop inside mtcp_epoll_wait() */
uint64_t waits;
/* not updated in the code visible alongside this header -- TODO confirm use */
uint64_t wakes;
/* AddEpollEvent() calls with valid arguments, queued or not */
uint64_t issued;
/* events actually appended to a queue by AddEpollEvent() */
uint64_t registered;
/* queued events dropped as stale while draining in mtcp_epoll_wait() */
uint64_t invalidated;
/* events copied out to the caller by mtcp_epoll_wait() */
uint64_t handled;
};
/*----------------------------------------------------------------------------*/
/* Internal queue entry: the user-visible event plus the id of the socket that
 * raised it (used to index the socket map when the event is drained). */
struct mtcp_epoll_event_int
{
struct mtcp_epoll_event ev;
int sockid;
};
/*----------------------------------------------------------------------------*/
/* Selects the destination queue in AddEpollEvent(). Only USR_EVENT_QUEUE is
 * modified under the epoll lock by AddEpollEvent() itself. */
enum event_queue_type
{
USR_EVENT_QUEUE = 0,
USR_SHADOW_EVENT_QUEUE = 1,
MTCP_EVENT_QUEUE = 2
};
/*----------------------------------------------------------------------------*/
/* Fixed-capacity circular queue of events: entries are appended at `end` and
 * consumed from `start`; both indices wrap to 0 when they reach `size`. */
struct event_queue
{
struct mtcp_epoll_event_int *events;
int start; // starting index
int end; // ending index
int size; // max size
int num_events; // number of events
};
/*----------------------------------------------------------------------------*/
/* State of one epoll instance. */
struct mtcp_epoll
{
/* drained by mtcp_epoll_wait(); filled by AddEpollEvent() under epoll_lock */
struct event_queue *usr_queue;
/* drained by mtcp_epoll_wait() after usr_queue; receives the events raised
 * at registration time by mtcp_epoll_ctl() */
struct event_queue *usr_shadow_queue;
/* target of MTCP_EVENT_QUEUE; its consumer is not visible here */
struct event_queue *mtcp_queue;
/* TRUE while mtcp_epoll_wait() is blocked on epoll_cond */
uint8_t waiting;
struct mtcp_epoll_stat stat;
/* mtcp_epoll_wait() sleeps on epoll_cond with epoll_lock held */
pthread_cond_t epoll_cond;
pthread_mutex_t epoll_lock;
};
/*----------------------------------------------------------------------------*/
int
CloseEpollSocket(mctx_t mctx, int epid);
#endif /* __EVENTPOLL_H_ */

42
mtcp/src/include/fhash.h Normal file
View File

@ -0,0 +1,42 @@
#ifndef __FHASH_H_
#define __FHASH_H_
#include <sys/queue.h>
#include "tcp_stream.h"
#define NUM_BINS (131072) /* 131072 (128 Ki) hash bins per table */
/* array slots per bin when STATIC_TABLE is enabled; also stored in ht_idx to
 * mark an entry that lives in the bin's list instead */
#define TCP_AR_CNT (3)
/* FALSE compiles out the fixed-array code paths in fhash.c */
#define STATIC_TABLE FALSE
/* List head of one hash bin. The two members use the field names of a
 * <sys/queue.h> TAILQ head, so the TAILQ_* macros in fhash.c operate on it
 * directly. */
typedef struct hash_bucket_head {
tcp_stream *tqh_first;
tcp_stream **tqh_last;
} hash_bucket_head;
/* hashtable structure */
struct hashtable {
uint8_t ht_count ; // count for # entry
#if STATIC_TABLE
tcp_stream* ht_array[NUM_BINS][TCP_AR_CNT];
#endif
hash_bucket_head ht_table[NUM_BINS];
// functions
unsigned int (*hashfn) (const tcp_stream *);
int (*eqfn) (const tcp_stream *, const tcp_stream *);
};
/*functions for hashtable*/
struct hashtable *CreateHashtable(unsigned int (*hashfn) (const tcp_stream*),
int (*eqfn) (const tcp_stream*,
const tcp_stream *));
void DestroyHashtable(struct hashtable *ht);
int HTInsert(struct hashtable *ht, tcp_stream *);
void* HTRemove(struct hashtable *ht, tcp_stream *);
tcp_stream* HTSearch(struct hashtable *ht, const tcp_stream *);
#endif /* __FHASH_H_ */

10
mtcp/src/include/ip_in.h Normal file
View File

@ -0,0 +1,10 @@
#ifndef __IP_IN_H_
#define __IP_IN_H_
#include "mtcp.h"
int
ProcessIPv4Packet(mtcp_manager_t mtcp, uint32_t cur_ts,
const int ifidx, unsigned char* pkt_data, int len);
#endif /* __IP_IN_H_ */

20
mtcp/src/include/ip_out.h Normal file
View File

@ -0,0 +1,20 @@
#ifndef __IP_OUT_H_
#define __IP_OUT_H_
#include <stdint.h>
#include "tcp_stream.h"
inline int
GetOutputInterface(uint32_t daddr);
void
ForwardIPv4Packet(mtcp_manager_t mtcp, int nif_in, char *buf, int len);
uint8_t *
IPOutputStandalone(struct mtcp_manager *mtcp,
uint16_t ip_id, uint32_t saddr, uint32_t daddr, uint16_t tcplen);
uint8_t *
IPOutput(struct mtcp_manager *mtcp, tcp_stream *stream, uint16_t tcplen);
#endif /* __IP_OUT_H_ */

47
mtcp/src/include/logger.h Normal file
View File

@ -0,0 +1,47 @@
#ifndef __LOGGER_H_
#define __LOGGER_H_
#include <stdint.h>
#define LOG_BUFF_SIZE (256*1024)
#define NUM_LOG_BUFF (100)
/* States stored in log_thread_context.state (presumably -- confirm).
 * NOTE(review): besides the enumerators, this statement defines a variable
 * named log_thread_state in every translation unit that includes logger.h.
 * If only the constants are wanted, the trailing identifier should go. */
enum {
IDLE_LOGT,
ACTIVE_LOGT
} log_thread_state;
/* One log buffer, linkable into the queues of log_thread_context.
 * NOTE(review): uses FILE and TAILQ_ENTRY, but this header includes only
 * <stdint.h>; it relies on includers pulling in <stdio.h> and <sys/queue.h>. */
typedef struct log_buff
{
int tid;
FILE* fid;
/* presumably the number of valid bytes in buff -- confirm */
int buff_len;
char buff[LOG_BUFF_SIZE];
TAILQ_ENTRY(log_buff) buff_link;
} log_buff;
/* State of a logging thread: its identity, a descriptor pair, and two queues
 * of log_buff entries with their counters and mutexes.
 * NOTE(review): pthread_t/pthread_mutex_t require <pthread.h>, which this
 * header does not include itself. */
typedef struct log_thread_context {
pthread_t thread;
int cpu;
int done;
int sp_fd;
int pair_sp_fd;
int free_buff_cnt;
int job_buff_cnt;
uint8_t state;
/* presumably mutex guards working_queue and free_mutex guards free_queue --
 * confirm against logger.c */
pthread_mutex_t mutex;
pthread_mutex_t free_mutex;
TAILQ_HEAD(, log_buff) working_queue;
TAILQ_HEAD(, log_buff) free_queue;
} log_thread_context;
log_buff* DequeueFreeBuffer (log_thread_context *ctx);
void EnqueueJobBuffer(log_thread_context *ctx, log_buff* working_bp);
void InitLogThreadContext (log_thread_context *ctx, int cpu);
void *ThreadLogMain(void* arg);
#endif /* __LOGGER_H_ */

View File

@ -0,0 +1,23 @@
#ifndef __MEMORY_MGT_H_
#define __MEMORY_MGT_H_
struct mem_pool;
typedef struct mem_pool* mem_pool_t;
/* create a memory pool with a chunk size and total size
and return the pointer to the memory pool */
mem_pool_t MPCreate(int chunk_size, size_t total_size, int is_hugepage);
/* allocate one chunk */
void *MPAllocateChunk(mem_pool_t mp);
/* free one chunk */
void MPFreeChunk(mem_pool_t mp, void *p);
/* destroy the memory pool */
void MPDestroy(mem_pool_t mp);
/* return the number of free chunks */
int MPGetFreeChunks(mem_pool_t mp);
#endif /* __MEMORY_MGT_H_ */

306
mtcp/src/include/mtcp.h Normal file
View File

@ -0,0 +1,306 @@
#ifndef __MTCP_H_
#define __MTCP_H_
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/queue.h>
#include <pthread.h>
#include "memory_mgt.h"
#include "tcp_ring_buffer.h"
#include "tcp_send_buffer.h"
#include "tcp_stream_queue.h"
#include "socket.h"
#include "mtcp_api.h"
#include "eventpoll.h"
#include "addr_pool.h"
#include "ps.h"
#include "logger.h"
#include "stat.h"
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef FALSE
#define FALSE (0)
#endif
#ifndef ERROR
#define ERROR (-1)
#endif
/* sizes the g_mtcp[] array declared at the end of this header */
#define MAX_CPUS 16
#define ETHERNET_HEADER_LEN 14 // sizeof(struct ethhdr)
#define IP_HEADER_LEN 20 // sizeof(struct iphdr)
#define TCP_HEADER_LEN 20 // sizeof(struct tcphdr)
#define TOTAL_TCP_HEADER_LEN 54 // total header length (14 + 20 + 20)
/* configurations */
#define BACKLOG_SIZE (10*1024)
#define MAX_PKT_SIZE (2*1024)
/* sizes the per-interface arrays (n_sender[], w_chunk_buf[], ...) */
#define ETH_NUM 4
#define TCP_OPT_TIMESTAMP_ENABLED TRUE
#define TCP_OPT_SACK_ENABLED FALSE
/* E_PSIO || USE_CHUNK_BUF selects the chunk-buffer send path (eth_out.h and
 * struct mtcp_thread_context) */
#define E_PSIO TRUE
#define USE_CHUNK_BUF FALSE
/* adds per-queue locks to struct mtcp_thread_context when TRUE */
#define LOCK_STREAM_QUEUE FALSE
/* chooses pthread spinlocks over mutexes for the SBUF_* and queue locks */
#define USE_SPIN_LOCK TRUE
/* lets mtcp_epoll_wait() send SIGUSR1 to a sleeping mtcp thread */
#define INTR_SLEEPING_MTCP TRUE
#define PROMISCUOUS_MODE TRUE
/* adds the rcv_br/snd_br lists to struct mtcp_manager when TRUE */
#define BLOCKING_SUPPORT FALSE
/*----------------------------------------------------------------------------*/
/* Statistics */
#ifdef NETSTAT
#define NETSTAT_PERTHREAD TRUE
#define NETSTAT_TOTAL TRUE
#endif /* NETSTAT */
#define RTM_STAT FALSE
/*----------------------------------------------------------------------------*/
/* Lock definitions for socket buffer */
/*
 * Socket-buffer lock wrappers, backed by a pthread spinlock when
 * USE_SPIN_LOCK is set and by a pthread mutex otherwise.
 *
 * SBUF_LOCK_INIT(lock, errmsg, action): initialize *lock; on failure print
 *   a perror() message (errmsg must be a string literal, it is concatenated)
 *   and execute `action` (e.g. `return -1`). The macro is an `if` statement,
 *   not an expression; the parameter list must not be followed by ';' in the
 *   #define, otherwise every expansion starts with an empty statement.
 * SBUF_LOCK_DESTROY / SBUF_LOCK / SBUF_UNLOCK: expressions returning the
 *   result of the underlying pthread call.
 */
#if USE_SPIN_LOCK
#define SBUF_LOCK_INIT(lock, errmsg, action) \
	if (pthread_spin_init(lock, PTHREAD_PROCESS_PRIVATE)) { \
		perror("pthread_spin_init" errmsg); \
		action; \
	}
#define SBUF_LOCK_DESTROY(lock) pthread_spin_destroy(lock)
#define SBUF_LOCK(lock) pthread_spin_lock(lock)
#define SBUF_UNLOCK(lock) pthread_spin_unlock(lock)
#else
#define SBUF_LOCK_INIT(lock, errmsg, action) \
	if (pthread_mutex_init(lock, NULL)) { \
		perror("pthread_mutex_init" errmsg); \
		action; \
	}
#define SBUF_LOCK_DESTROY(lock) pthread_mutex_destroy(lock)
#define SBUF_LOCK(lock) pthread_mutex_lock(lock)
#define SBUF_UNLOCK(lock) pthread_mutex_unlock(lock)
#endif /* USE_SPIN_LOCK */
/*----------------------------------------------------------------------------*/
/* One configured network interface (see mtcp_config.eths / eths_num). */
struct eth_table
{
char dev_name[128];
int ifindex;
int stat_print;
/* hardware address, ETH_ALEN bytes */
unsigned char haddr[ETH_ALEN];
/* byte order of netmask and ip_addr is not visible here -- TODO confirm */
uint32_t netmask;
// unsigned char dst_haddr[ETH_ALEN];
uint32_t ip_addr;
};
/*----------------------------------------------------------------------------*/
/* One routing table entry (see mtcp_config.rtable / routes). */
struct route_table
{
uint32_t daddr;
uint32_t mask;
/* presumably daddr & mask, precomputed for lookups -- confirm */
uint32_t masked;
int prefix;
/* presumably the index of the outgoing interface -- confirm */
int nif;
};
/*----------------------------------------------------------------------------*/
/* One ARP table entry: an IP (with prefix/mask) and its hardware address. */
struct arp_entry
{
uint32_t ip;
int8_t prefix;
uint32_t ip_mask;
/* presumably ip & ip_mask, precomputed for lookups -- confirm */
uint32_t ip_masked;
unsigned char haddr[ETH_ALEN];
};
/*----------------------------------------------------------------------------*/
/* ARP table embedded in mtcp_config: `entry` points to the entries and
 * `entries` is presumably how many are in use -- confirm. */
struct arp_table
{
struct arp_entry *entry;
int entries;
};
/*----------------------------------------------------------------------------*/
/* Process-wide configuration; the single instance is the global CONFIG
 * declared at the end of this header. The last seven fields have the same
 * names and order as struct mtcp_conf in mtcp_api.h. */
struct mtcp_config
{
/* socket mode */
int8_t socket_mode;
/* network interface config */
struct eth_table *eths;
int eths_num;
/* route config */
struct route_table *rtable; // routing table
int routes; // # of entries
/* arp config */
struct arp_table arp;
int num_cores;
/* upper bound used to range-check epoll and socket ids in eventpoll.c */
int max_concurrency;
int max_num_buffers;
int rcvbuf_size;
int sndbuf_size;
int tcp_timewait;
int tcp_timeout;
};
/*----------------------------------------------------------------------------*/
/* Object behind the public mctx_t handle (typedef in mtcp_api.h); it carries
 * only a CPU index. */
struct mtcp_context
{
int cpu;
};
/*----------------------------------------------------------------------------*/
/* Per-interface sender state: three stream lists (control, send, ack), each
 * with a separately kept element count. */
struct mtcp_sender
{
int ifidx;
TAILQ_HEAD (control_head, tcp_stream) control_list;
TAILQ_HEAD (send_head, tcp_stream) send_list;
TAILQ_HEAD (ack_head, tcp_stream) ack_list;
int control_list_cnt;
int send_list_cnt;
int ack_list_cnt;
};
/*----------------------------------------------------------------------------*/
/* State of one mTCP instance; instances are reachable through the g_mtcp[]
 * array declared at the end of this header and through GetMTCPManager(). */
struct mtcp_manager
{
mem_pool_t flow_pool; /* memory pool for tcp_stream */
mem_pool_t rv_pool; /* memory pool for recv variables */
mem_pool_t sv_pool; /* memory pool for send variables */
mem_pool_t mv_pool; /* memory pool for monitor variables */
//mem_pool_t socket_pool;
sb_manager_t rbm_snd;
rb_manager_t rbm_rcv;
struct hashtable *tcp_flow_table;
uint32_t s_index:24; /* stream index */
/* socket map, indexed by socket id and by epoll id (see eventpoll.c) */
socket_map_t smap;
TAILQ_HEAD (, socket_map) free_smap;
addr_pool_t ap; /* address pool */
uint32_t g_id; /* id space in a thread */
uint32_t flow_cnt; /* number of concurrent flows */
struct mtcp_thread_context* ctx;
/* variables related to logger */
int sp_fd;
log_thread_context* logger;
log_buff* w_buffer;
FILE *log_fp;
/* variables related to event */
struct mtcp_epoll *ep;
uint32_t ts_last_event;
struct tcp_listener *listener;
stream_queue_t connectq; /* streams need to connect */
stream_queue_t sendq; /* streams need to send data */
stream_queue_t ackq; /* streams need to send ack */
stream_queue_t closeq; /* streams need to close */
stream_queue_int *closeq_int; /* internally maintained closeq */
stream_queue_t resetq; /* streams need to reset */
stream_queue_int *resetq_int; /* internally maintained resetq */
stream_queue_t destroyq; /* streams need to be destroyed */
struct mtcp_sender *g_sender;
struct mtcp_sender *n_sender[ETH_NUM];
/* lists related to timeout */
struct rto_hashstore* rto_store;
TAILQ_HEAD (timewait_head, tcp_stream) timewait_list;
TAILQ_HEAD (timeout_head, tcp_stream) timeout_list;
int rto_list_cnt;
int timewait_list_cnt;
int timeout_list_cnt;
#if BLOCKING_SUPPORT
TAILQ_HEAD (rcv_br_head, tcp_stream) rcv_br_list;
TAILQ_HEAD (snd_br_head, tcp_stream) snd_br_list;
int rcv_br_list_cnt;
int snd_br_list_cnt;
#endif
uint32_t cur_ts;
/* both flags are read by mtcp_epoll_wait() to decide whether to signal the
 * mtcp thread with SIGUSR1 */
int wakeup_flag;
int is_sleeping;
/* statistics */
struct bcast_stat bstat;
struct timeout_stat tstat;
#ifdef NETSTAT
struct net_stat nstat;
struct net_stat p_nstat;
uint32_t p_nstat_ts;
struct run_stat runstat;
struct run_stat p_runstat;
struct time_stat rtstat;
#endif /* NETSTAT */
};
/*----------------------------------------------------------------------------*/
typedef struct mtcp_manager* mtcp_manager_t;
/*----------------------------------------------------------------------------*/
mtcp_manager_t
GetMTCPManager(mctx_t mctx);
/*----------------------------------------------------------------------------*/
/* Per-thread context of an mTCP instance: thread identity, control flags,
 * the packet I/O handle, send buffers per interface, and locks. */
struct mtcp_thread_context
{
int cpu;
/* target of the SIGUSR1 wake-up sent from mtcp_epoll_wait() */
pthread_t thread;
/* any of these being set makes mtcp_epoll_wait() return -1 with EINTR;
 * `interrupt` is cleared by mtcp_epoll_wait() when it does so */
uint8_t done:1,
exit:1,
interrupt:1;
struct ps_handle *handle;
struct mtcp_manager* mtcp_manager;
#if E_PSIO || USE_CHUNK_BUF
struct ps_chunk_buf w_chunk_buf[ETH_NUM];
#else
struct ps_chunk w_chunk[ETH_NUM];
uint32_t w_off[ETH_NUM];
int16_t w_cur_idx[ETH_NUM];
#endif
pthread_mutex_t smap_lock;
pthread_mutex_t flow_pool_lock;
pthread_mutex_t socket_pool_lock;
/* optional per-queue locks; their type follows USE_SPIN_LOCK */
#if LOCK_STREAM_QUEUE
#if USE_SPIN_LOCK
pthread_spinlock_t connect_lock;
pthread_spinlock_t close_lock;
pthread_spinlock_t reset_lock;
pthread_spinlock_t sendq_lock;
pthread_spinlock_t ackq_lock;
pthread_spinlock_t destroyq_lock;
#else
pthread_mutex_t connect_lock;
pthread_mutex_t close_lock;
pthread_mutex_t reset_lock;
pthread_mutex_t sendq_lock;
pthread_mutex_t ackq_lock;
pthread_mutex_t destroyq_lock;
#endif /* USE_SPIN_LOCK */
#endif /* LOCK_STREAM_QUEUE */
};
/*----------------------------------------------------------------------------*/
typedef struct mtcp_thread_context* mtcp_thread_context_t;
/*----------------------------------------------------------------------------*/
/* Process-wide state: one manager slot per CPU, the global configuration,
 * and an address pool.
 * NOTE(review): these are definitions in a header, so every translation unit
 * that includes mtcp.h emits its own tentative definition; this links only
 * when the toolchain merges common symbols (e.g. -fcommon). Consider `extern`
 * here plus a single definition in one .c file. */
struct mtcp_manager *g_mtcp[MAX_CPUS];
struct mtcp_config CONFIG;
addr_pool_t ap;
/*----------------------------------------------------------------------------*/
#endif /* __MTCP_H_ */

132
mtcp/src/include/mtcp_api.h Normal file
View File

@ -0,0 +1,132 @@
#ifndef __MTCP_API_H_
#define __MTCP_API_H_
#include <stdint.h>
#include <netinet/in.h>
#include <sys/uio.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Type tag of a socket-map slot. MTCP_SOCK_UNUSED is the first enumerator
 * (value 0) and marks a free slot. The epoll code rejects ids whose slot is
 * not MTCP_SOCK_EPOLL, and raises registration-time events only for
 * MTCP_SOCK_STREAM and MTCP_SOCK_PIPE sockets. */
enum socket_type
{
MTCP_SOCK_UNUSED,
MTCP_SOCK_STREAM,
MTCP_SOCK_PROXY,
MTCP_SOCK_LISTENER,
MTCP_SOCK_EPOLL,
MTCP_SOCK_PIPE,
};
/* Configuration values exchanged through mtcp_getconf() / mtcp_setconf().
 * Units of the size and time fields are not visible in this header --
 * TODO confirm against the implementation. */
struct mtcp_conf
{
int num_cores;
int max_concurrency;
int max_num_buffers;
int rcvbuf_size;
int sndbuf_size;
int tcp_timewait;
int tcp_timeout;
};
typedef struct mtcp_context *mctx_t;
int
mtcp_init(char *config_file);
void
mtcp_destroy();
int
mtcp_getconf(struct mtcp_conf *conf);
int
mtcp_setconf(const struct mtcp_conf *conf);
int
mtcp_core_affinitize(int cpu);
mctx_t
mtcp_create_context(int cpu);
void
mtcp_destroy_context(mctx_t mctx);
typedef void (*mtcp_sighandler_t)(int);
mtcp_sighandler_t
mtcp_register_signal(int signum, mtcp_sighandler_t handler);
/* Create a pipe inside context mctx; the two ids are stored in pipeid[0..1]. */
int
mtcp_pipe(mctx_t mctx, int pipeid[2]);
/*
 * Socket option accessors; the parameter lists follow getsockopt(2) and
 * setsockopt(2) with the context handle prepended.
 * NOTE(review): the set of supported levels/options is not visible here.
 */
int
mtcp_getsockopt(mctx_t mctx, int sockid, int level,
int optname, void *optval, socklen_t *optlen);
int
mtcp_setsockopt(mctx_t mctx, int sockid, int level,
int optname, const void *optval, socklen_t optlen);
/* Presumably switches sockid to non-blocking mode (cf. MTCP_NONBLOCK) -- confirm. */
int
mtcp_setsock_nonblock(mctx_t mctx, int sockid);
/* mtcp_socket_ioctl: similar to ioctl,
but only FIONREAD is supported currently */
int
mtcp_socket_ioctl(mctx_t mctx, int sockid, int request, void *argp);
/*
 * Socket lifecycle calls. Each takes the context handle first; the remaining
 * parameters follow the BSD socket call of the same name.
 * NOTE(review): return/errno conventions are not visible in this header.
 */
int
mtcp_socket(mctx_t mctx, int domain, int type, int protocol);
int
mtcp_bind(mctx_t mctx, int sockid,
const struct sockaddr *addr, socklen_t addrlen);
int
mtcp_listen(mctx_t mctx, int sockid, int backlog);
int
mtcp_accept(mctx_t mctx, int sockid, struct sockaddr *addr, socklen_t *addrlen);
/*
 * Presumably prepares source addresses (num_addr of them starting at
 * saddr_base) for connections towards daddr:dport -- confirm.
 * NOTE(review): dport is typed in_addr_t, not in_port_t; looks unintended.
 */
int
mtcp_init_rss(mctx_t mctx, in_addr_t saddr_base, int num_addr,
in_addr_t daddr, in_addr_t dport);
int
mtcp_connect(mctx_t mctx, int sockid,
const struct sockaddr *addr, socklen_t addrlen);
int
mtcp_close(mctx_t mctx, int sockid);
/* NOTE(review): presumably closes the connection abortively (reset) -- confirm. */
int
mtcp_abort(mctx_t mctx, int sockid);
/*
 * Data transfer calls; the parameters follow read(2)/readv(2)/write(2)/
 * writev(2) with the context handle prepended and int lengths/counts.
 * NOTE(review): return conventions are not visible in this header.
 */
int
mtcp_read(mctx_t mctx, int sockid, char *buf, int len);
/* readv should work atomically */
int
mtcp_readv(mctx_t mctx, int sockid, struct iovec *iov, int numIOV);
int
mtcp_write(mctx_t mctx, int sockid, char *buf, int len);
/* writev should work atomically */
int
mtcp_writev(mctx_t mctx, int sockid, struct iovec *iov, int numIOV);
/* mtcp_delete is compiled out. */
#if 0
int
mtcp_delete(mctx_t mctx, int sockid, int len);
#endif
#ifdef __cplusplus
};
#endif
#endif /* __MTCP_API_H_ */

View File

@ -0,0 +1,69 @@
#ifndef __MTCP_EPOLL_H_
#define __MTCP_EPOLL_H_
#include "mtcp_api.h"
#ifdef __cplusplus
extern "C" {
#endif
/*----------------------------------------------------------------------------*/
/* Operation codes for the `op` argument of mtcp_epoll_ctl(). */
enum mtcp_epoll_op
{
MTCP_EPOLL_CTL_ADD = 1,
MTCP_EPOLL_CTL_DEL = 2,
MTCP_EPOLL_CTL_MOD = 3,
};
/*----------------------------------------------------------------------------*/
/*
 * Event bits; they are combined into the 32-bit `events` mask of
 * struct mtcp_epoll_event.
 * NOTE(review): the values appear to mirror the Linux EPOLL* constants --
 * confirm before relying on interchangeability.
 */
enum mtcp_event_type
{
	MTCP_EPOLLNONE		= 0x000,
	MTCP_EPOLLIN		= 0x001,
	MTCP_EPOLLPRI		= 0x002,
	MTCP_EPOLLOUT		= 0x004,
	MTCP_EPOLLRDNORM	= 0x040,
	MTCP_EPOLLRDBAND	= 0x080,
	MTCP_EPOLLWRNORM	= 0x100,
	MTCP_EPOLLWRBAND	= 0x200,
	MTCP_EPOLLMSG		= 0x400,
	MTCP_EPOLLERR		= 0x008,
	MTCP_EPOLLHUP		= 0x010,
	MTCP_EPOLLRDHUP		= 0x2000,
	MTCP_EPOLLONESHOT	= (1 << 30),
	/*
	 * Unsigned literal on purpose: (1 << 31) shifts a one into the sign
	 * bit of a signed int, which is undefined behavior. The bit pattern
	 * (0x80000000) is unchanged when stored in a uint32_t mask.
	 */
	MTCP_EPOLLET		= (1u << 31)
};
/*----------------------------------------------------------------------------*/
/*
 * User data attached to an epoll registration. The members overlay each
 * other, so only the one that was last written is meaningful.
 */
typedef union mtcp_epoll_data
{
void *ptr;
int sockid;
uint32_t u32;
uint64_t u64;
} mtcp_epoll_data_t;
/*----------------------------------------------------------------------------*/
/* One epoll event: a bit mask of MTCP_EPOLL* values plus the attached user data. */
struct mtcp_epoll_event
{
uint32_t events;
mtcp_epoll_data_t data;
};
/*----------------------------------------------------------------------------*/
/*
 * Create an epoll object in context mctx.
 * NOTE(review): the meaning of `size` and the return value are not visible here.
 */
int
mtcp_epoll_create(mctx_t mctx, int size);
/*----------------------------------------------------------------------------*/
/*
 * Apply `op` (an MTCP_EPOLL_CTL_* value) for socket `sockid` on epoll object
 * `epid`, using the registration described by *event.
 */
int
mtcp_epoll_ctl(mctx_t mctx, int epid,
int op, int sockid, struct mtcp_epoll_event *event);
/*----------------------------------------------------------------------------*/
/*
 * Collect events from epoll object `epid` into events[0..maxevents-1].
 * NOTE(review): the unit of `timeout` and the return convention presumably
 * follow epoll_wait(2) -- confirm.
 */
int
mtcp_epoll_wait(mctx_t mctx, int epid,
struct mtcp_epoll_event *events, int maxevents, int timeout);
/*----------------------------------------------------------------------------*/
/*
 * Return a textual name for `event`.
 * NOTE(review): ownership/lifetime of the returned string is not visible here.
 */
char *
EventToString(uint32_t event);
/*----------------------------------------------------------------------------*/
#ifdef __cplusplus
};
#endif
#endif /* __MTCP_EPOLL_H_ */

18
mtcp/src/include/pipe.h Normal file
View File

@ -0,0 +1,18 @@
#ifndef __MTCP_PIPE_H_
#define __MTCP_PIPE_H_
#include <mtcp_api.h>
/*
 * Internal pipe operations behind mtcp_pipe().
 * NOTE(review): return conventions are not visible in this header.
 */
/* Read up to len bytes from pipe `pipeid` into buf. */
int
PipeRead(mctx_t mctx, int pipeid, char *buf, int len);
/* Write up to len bytes from buf to pipe `pipeid`. */
int
PipeWrite(mctx_t mctx, int pipeid, char *buf, int len);
/* Presumably re-posts events pending on `pipeid` to epoll object `epid` -- confirm. */
int
RaisePendingPipeEvents(mctx_t mctx, int epid, int pipeid);
/* Close pipe `pipeid`. */
int
PipeClose(mctx_t mctx, int pipeid);
#endif /* __MTCP_PIPE_H_ */

10
mtcp/src/include/rss.h Normal file
View File

@ -0,0 +1,10 @@
#ifndef __RSS_H_
#define __RSS_H_
#include <netinet/in.h>
/*
 * Map a flow's addresses and ports to an int result, given num_queues.
 * Presumably the result is the index of the core/queue that receive-side
 * scaling selects for the flow -- confirm against the implementation.
 */
/* sip, dip, sp, dp: in network byte order */
int GetRSSCPUCore(in_addr_t sip, in_addr_t dip,
in_port_t sp, in_port_t dp, int num_queues);
#endif /* __RSS_H_ */

61
mtcp/src/include/socket.h Normal file
View File

@ -0,0 +1,61 @@
#ifndef __SOCKET_H_
#define __SOCKET_H_
#include "mtcp_api.h"
#include "mtcp_epoll.h"
/*----------------------------------------------------------------------------*/
/* Bit flags; distinct powers of two, presumably stored in socket_map.opts. */
enum socket_opts
{
MTCP_NONBLOCK = 0x01,
MTCP_ADDR_BIND = 0x02,
};
/*----------------------------------------------------------------------------*/
/*
 * Per-socket bookkeeping entry.
 * NOTE(review): this header uses TAILQ_ENTRY without including <sys/queue.h>
 * itself, so it relies on the includer having done so.
 */
struct socket_map
{
int id;
int socktype; /* presumably one of enum socket_type -- confirm */
uint32_t opts; /* presumably a mask of enum socket_opts -- confirm */
struct sockaddr_in saddr;
/* anonymous union: which pointer is valid presumably depends on socktype */
union {
struct tcp_stream *stream;
struct tcp_listener *listener;
struct mtcp_epoll *ep;
struct pipe *pp;
};
uint32_t epoll; /* registered events */
uint32_t events; /* available events */
mtcp_epoll_data_t ep_data;
TAILQ_ENTRY (socket_map) free_smap_link;
};
/*----------------------------------------------------------------------------*/
/* Pointer alias for struct socket_map. */
typedef struct socket_map * socket_map_t;
/*----------------------------------------------------------------------------*/
/*
 * Obtain a socket entry of kind `socktype` in context mctx.
 * need_lock presumably selects whether the function takes the socket pool
 * lock itself -- confirm. The failure value is not visible here.
 */
socket_map_t
AllocateSocket(mctx_t mctx, int socktype, int need_lock);
/*----------------------------------------------------------------------------*/
/* Release the entry for `sockid`; need_lock as for AllocateSocket(). */
void
FreeSocket(mctx_t mctx, int sockid, int need_lock);
/*----------------------------------------------------------------------------*/
/* Look up the entry for `sockid` in context mctx. */
socket_map_t
GetSocket(mctx_t mctx, int sockid);
/*----------------------------------------------------------------------------*/
/*
 * State of a listening socket: its id and socket entry, the backlog value,
 * a queue of streams (acceptq) and a mutex/condition pair.
 * NOTE(review): stream_queue_t and the pthread types are not declared by this
 * header's own includes; it relies on include order.
 */
struct tcp_listener
{
int sockid;
socket_map_t socket;
int backlog;
stream_queue_t acceptq; /* presumably connections waiting for mtcp_accept() */
pthread_mutex_t accept_lock;
pthread_cond_t accept_cond;
};
/*----------------------------------------------------------------------------*/
#endif /* __SOCKET_H_ */

81
mtcp/src/include/stat.h Normal file
View File

@ -0,0 +1,81 @@
#ifndef __STAT_H_
#define __STAT_H_
/*
 * 64-bit counters for main-loop activity ("rounds").
 * NOTE(review): what each counter measures is defined by the code that
 * increments it, which is not visible in this header.
 */
struct run_stat
{
uint64_t rounds;
uint64_t rounds_rx;
uint64_t rounds_rx_try;
uint64_t rounds_tx;
uint64_t rounds_tx_try;
uint64_t rounds_select;
uint64_t rounds_select_rx;
uint64_t rounds_select_tx;
uint64_t rounds_select_intr;
uint64_t rounds_accept;
uint64_t rounds_read;
uint64_t rounds_write;
uint64_t rounds_epoll;
uint64_t rounds_wndadv;
uint64_t rounds_rtocheck;
uint64_t rounds_twcheck;
uint64_t rounds_tocheck;
};
/* Running aggregate of samples: count, sum, maximum and minimum. */
struct stat_counter
{
uint64_t cnt;
uint64_t sum;
uint64_t max;
uint64_t min;
};
/*
 * One stat_counter per measured phase.
 * NOTE(review): the phase names suggest timing measurements; the unit is not
 * visible here.
 */
struct time_stat
{
struct stat_counter round;
struct stat_counter processing;
struct stat_counter tcheck;
struct stat_counter epoll;
struct stat_counter handle;
struct stat_counter xmit;
struct stat_counter select;
};
/*
 * Traffic counters, one slot per device index (MAX_DEVICES slots).
 * NOTE(review): MAX_DEVICES and uint64_t must be provided by the includer;
 * this header includes nothing itself.
 */
struct net_stat
{
uint64_t tx_packets[MAX_DEVICES];
uint64_t tx_bytes[MAX_DEVICES];
uint64_t tx_drops[MAX_DEVICES];
uint64_t rx_packets[MAX_DEVICES];
uint64_t rx_bytes[MAX_DEVICES];
uint64_t rx_errors[MAX_DEVICES];
};
/* Counters for the activities named by the fields; semantics live in the users. */
struct bcast_stat
{
uint64_t cycles;
uint64_t write;
uint64_t read;
uint64_t epoll;
uint64_t wnd_adv;
uint64_t ack;
};
/*
 * Counters for timer processing; the *_try/plain pairs presumably count
 * checks attempted versus timers that actually fired -- confirm.
 */
struct timeout_stat
{
uint64_t cycles;
uint64_t rto_try;
uint64_t rto;
uint64_t timewait_try;
uint64_t timewait;
};
/*
 * STAT_COUNT(stat): increment the counter lvalue `stat` when NETSTAT is
 * defined; expands to nothing otherwise.
 * The argument and the whole expansion are parenthesized so that an argument
 * such as `*p` increments the pointed-to counter, not the pointer.
 */
#ifdef NETSTAT
#define STAT_COUNT(stat) ((stat)++)
#else
#define STAT_COUNT(stat)
#endif
#endif /* __STAT_H_ */

123
mtcp/src/include/tcp_in.h Normal file
View File

@ -0,0 +1,123 @@
#ifndef __TCP_IN_H_
#define __TCP_IN_H_
#include <linux/if_ether.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <netinet/ip.h>
#include "mtcp.h"
#include "fhash.h"
/*
 * Flag bits. FIN..URG sit at the bit positions of the TCP header flag byte.
 * NOTE(review): 0x40/0x80 are named SACK/WACK here and are presumably
 * internal markers, not wire flags -- confirm.
 */
#define TCP_FLAG_FIN 0x01 // 0000 0001
#define TCP_FLAG_SYN 0x02 // 0000 0010
#define TCP_FLAG_RST 0x04 // 0000 0100
#define TCP_FLAG_PSH 0x08 // 0000 1000
#define TCP_FLAG_ACK 0x10 // 0001 0000
#define TCP_FLAG_URG 0x20 // 0010 0000
#define TCP_FLAG_SACK 0x40 // 0100 0000
#define TCP_FLAG_WACK 0x80 // 1000 0000
/* Bit flags, one per TCP option type */
#define TCP_OPT_FLAG_MSS 0x02 // 0000 0010
#define TCP_OPT_FLAG_WSCALE 0x04 // 0000 0100
#define TCP_OPT_FLAG_SACK_PERMIT 0x08 // 0000 1000
#define TCP_OPT_FLAG_SACK 0x10 // 0001 0000
#define TCP_OPT_FLAG_TIMESTAMP 0x20 // 0010 0000
/* Option lengths (presumably in bytes on the wire -- confirm) */
#define TCP_OPT_MSS_LEN 4
#define TCP_OPT_WSCALE_LEN 3
#define TCP_OPT_SACK_PERMIT_LEN 2
#define TCP_OPT_SACK_LEN 10
#define TCP_OPT_TIMESTAMP_LEN 10
#define TCP_DEFAULT_MSS 1460
#define TCP_DEFAULT_WSCALE 7
#define TCP_INITIAL_WINDOW 14600 // initial window size
/*
 * Sequence number comparisons: the difference is interpreted as a signed
 * 32-bit value, so the result stays meaningful across 32-bit wrap-around.
 * TCP_SEQ_BETWEEN evaluates `a` twice; avoid arguments with side effects.
 */
#define TCP_SEQ_LT(a,b) ((int32_t)((a)-(b)) < 0)
#define TCP_SEQ_LEQ(a,b) ((int32_t)((a)-(b)) <= 0)
#define TCP_SEQ_GT(a,b) ((int32_t)((a)-(b)) > 0)
#define TCP_SEQ_GEQ(a,b) ((int32_t)((a)-(b)) >= 0)
#define TCP_SEQ_BETWEEN(a,b,c) (TCP_SEQ_GEQ(a,b) && TCP_SEQ_LEQ(a,c))
/*
 * convert timeval to timestamp
 * With HZ == 1000, TIME_TICK is 1000 us, i.e. one timestamp tick is 1 ms.
 */
#define HZ 1000
#define TIME_TICK (1000000/HZ) // in us
/* `t` is a pointer to struct timeval; the result is truncated to 32 bits */
#define TIMEVAL_TO_TS(t) (uint32_t)((t)->tv_sec * HZ + \
((t)->tv_usec / TIME_TICK))
/* unit conversions between ticks (TS), microseconds, milliseconds and seconds */
#define TS_TO_USEC(t) ((t) * TIME_TICK)
#define TS_TO_MSEC(t) (TS_TO_USEC(t) / 1000)
#define USEC_TO_TS(t) ((t) / TIME_TICK)
#define MSEC_TO_TS(t) (USEC_TO_TS((t) * 1000))
#define SEC_TO_USEC(t) ((t) * 1000000)
#define SEC_TO_MSEC(t) ((t) * 1000)
#define MSEC_TO_USEC(t) ((t) * 1000)
#define USEC_TO_SEC(t) ((t) / 1000000)
/* timer constants below are expressed in ticks */
//#define TCP_TIMEWAIT (MSEC_TO_USEC(5000) / TIME_TICK) // 5s
#define TCP_TIMEWAIT 0
#define TCP_INITIAL_RTO (MSEC_TO_USEC(500) / TIME_TICK) // 500ms
#define TCP_FIN_RTO (MSEC_TO_USEC(500) / TIME_TICK) // 500ms
#define TCP_TIMEOUT (MSEC_TO_USEC(30000) / TIME_TICK) // 30s
/* retry limits (counts) */
#define TCP_MAX_RTX 16
#define TCP_MAX_SYN_RETRY 7
#define TCP_MAX_BACKOFF 7
/* TCP connection states; values are explicit and contiguous from 0 to 10. */
enum tcp_state
{
TCP_ST_CLOSED = 0,
TCP_ST_LISTEN = 1,
TCP_ST_SYN_SENT = 2,
TCP_ST_SYN_RCVD = 3,
TCP_ST_ESTABLISHED = 4,
TCP_ST_FIN_WAIT_1 = 5,
TCP_ST_FIN_WAIT_2 = 6,
TCP_ST_CLOSE_WAIT = 7,
TCP_ST_CLOSING = 8,
TCP_ST_LAST_ACK = 9,
TCP_ST_TIME_WAIT = 10
};
/* TCP option kind codes as they appear on the wire (e.g. MSS = 2, timestamp = 8). */
enum tcp_option
{
TCP_OPT_END = 0,
TCP_OPT_NOP = 1,
TCP_OPT_MSS = 2,
TCP_OPT_WSCALE = 3,
TCP_OPT_SACK_PERMIT = 4,
TCP_OPT_SACK = 5,
TCP_OPT_TIMESTAMP = 8
};
/* Reason codes for a closed connection; 0 (TCP_NOT_CLOSED) means not closed. */
enum tcp_close_reason
{
TCP_NOT_CLOSED = 0,
TCP_ACTIVE_CLOSE = 1,
TCP_PASSIVE_CLOSE = 2,
TCP_CONN_FAIL = 3,
TCP_CONN_LOST = 4,
TCP_RESET = 5,
TCP_NO_MEM = 6,
TCP_NOT_ACCEPTED = 7,
TCP_TIMEDOUT = 8
};
/*
 * Parse the `len` bytes of TCP options at tcpopt for cur_stream.
 * NOTE(review): the same prototype also appears in tcp_util.h.
 */
void
ParseTCPOptions(tcp_stream *cur_stream,
uint32_t cur_ts, uint8_t *tcpopt, int len);
/*
 * Process one TCP segment for an existing stream, given its header, sequence
 * and ack numbers, payload and advertised window.
 * NOTE(review): declared `inline` with no body in this header; whether that
 * links depends on the inline model of the compiler mode in use -- confirm.
 */
inline int
ProcessTCPUplink(mtcp_manager_t mtcp, uint32_t cur_ts, tcp_stream *cur_stream,
const struct tcphdr *tcph, uint32_t seq, uint32_t ack_seq,
uint8_t *payload, int payloadlen, uint32_t window);
/* Entry point for a received IPv4 datagram carrying TCP; ip_len is the IP total length. */
int
ProcessTCPPacket(struct mtcp_manager *mtcp, uint32_t cur_ts,
const struct iphdr* iph, int ip_len);
/*
 * Compute the TCP checksum over `len` bytes at buf for the given addresses
 * (presumably used for the pseudo-header -- confirm).
 * NOTE(review): the same prototype also appears in tcp_util.h.
 */
uint16_t
TCPCalcChecksum(uint16_t *buf, uint16_t len, uint32_t saddr, uint32_t daddr);
#endif /* __TCP_IN_H_ */

View File

@ -0,0 +1,59 @@
#ifndef __TCP_OUT_H_
#define __TCP_OUT_H_
#include "mtcp.h"
#include "tcp_stream.h"
/*
 * Values for the `opt` argument of EnqueueACK().
 * NOTE(review): judging by the names: send immediately, aggregate with
 * pending ACKs, or window-advertisement ACK -- confirm in the implementation.
 */
enum ack_opt
{
ACK_OPT_NOW,
ACK_OPT_AGGREGATE,
ACK_OPT_WACK
};
/*
 * Build and send a TCP packet described entirely by the arguments, without a
 * tcp_stream object.
 * NOTE(review): byte order of the address/port arguments is not stated here.
 */
int
SendTCPPacketStandalone(struct mtcp_manager *mtcp,
uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport,
uint32_t seq, uint32_t ack_seq, uint16_t window, uint8_t flags,
uint8_t *payload, uint16_t payloadlen,
uint32_t cur_ts, uint32_t echo_ts);
/* Build and send a TCP packet for cur_stream with the given flags and payload. */
int
SendTCPPacket(struct mtcp_manager *mtcp, tcp_stream *cur_stream,
uint32_t cur_ts, uint8_t flags, uint8_t *payload, uint16_t payloadlen);
/*
 * List writers: process the control, data and ACK lists of `sender`.
 * `thresh` presumably bounds the work done per call -- confirm.
 * NOTE(review): these and the helpers below are declared `inline` with no
 * body in this header; whether that links depends on the compiler's inline
 * model -- confirm.
 */
inline int
WriteTCPControlList(mtcp_manager_t mtcp,
struct mtcp_sender *sender, uint32_t cur_ts, int thresh);
inline int
WriteTCPDataList(mtcp_manager_t mtcp,
struct mtcp_sender *sender, uint32_t cur_ts, int thresh);
inline int
WriteTCPACKList(mtcp_manager_t mtcp,
struct mtcp_sender *sender, uint32_t cur_ts, int thresh);
/* List membership helpers for a stream */
inline void
AddtoControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts);
inline void
AddtoSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
RemoveFromControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
RemoveFromSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
RemoveFromACKList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
/* Queue an ACK for cur_stream; opt is one of enum ack_opt. */
inline void
EnqueueACK(mtcp_manager_t mtcp,
tcp_stream *cur_stream, uint32_t cur_ts, uint8_t opt);
/* Debug helper: dump the control list of `sender`. */
inline void
DumpControlList(mtcp_manager_t mtcp, struct mtcp_sender *sender);
#endif /* __TCP_OUT_H_ */

View File

@ -0,0 +1,22 @@
#ifndef __TCP_RB_FRAG_QUEUE_
#define __TCP_RB_FRAG_QUEUE_
#include "tcp_ring_buffer.h"
/*---------------------------------------------------------------------------*/
/* Opaque handle to a queue of struct fragment_ctx pointers. */
typedef struct rb_frag_queue* rb_frag_queue_t;
/*---------------------------------------------------------------------------*/
/* Create a queue sized by `capacity`; the failure value is not visible here. */
rb_frag_queue_t
CreateRBFragQueue(int capacity);
/*---------------------------------------------------------------------------*/
/* Destroy a queue created by CreateRBFragQueue(). */
void
DestroyRBFragQueue(rb_frag_queue_t rb_fragq);
/*---------------------------------------------------------------------------*/
/* Append frag to the queue; the int result presumably signals success/full. */
int
RBFragEnqueue(rb_frag_queue_t rb_fragq, struct fragment_ctx *frag);
/*---------------------------------------------------------------------------*/
/* Remove and return one fragment (presumably NULL when empty -- confirm). */
struct fragment_ctx *
RBFragDequeue(rb_frag_queue_t rb_fragq);
/*---------------------------------------------------------------------------*/
#endif /* __TCP_RB_FRAG_QUEUE_ */

View File

@ -0,0 +1,77 @@
/*
* 2010.12.10 Shinae Woo
* Ring buffer structure for managing dynamically allocated ring buffers
*
* put data to the tail
* get/pop/remove data from the head
*
* always guarantees physically contiguous, ready in-memory data from data_offset to data_offset+len
* automatically increases the total buffer size when the buffer is full
* for efficiently managing packet payload and chunking
*
*/
#ifndef __NRE_RING_BUFFER_
#define __NRE_RING_BUFFER_
#include <stdint.h>
#include <sys/types.h>
/*----------------------------------------------------------------------------*/
/* Identifies the caller side: application (AT_APP) or the mTCP stack (AT_MTCP). */
enum rb_caller
{
AT_APP,
AT_MTCP
};
/*----------------------------------------------------------------------------*/
/* Opaque handle to the ring buffer manager. */
typedef struct rb_manager* rb_manager_t;
/*----------------------------------------------------------------------------*/
/*
 * Node of a singly linked list describing one fragment: starting sequence
 * number and a 31-bit length.
 * is_calloc presumably records how the node was allocated -- confirm.
 */
struct fragment_ctx
{
uint32_t seq;
uint32_t len : 31;
uint32_t is_calloc : 1;
struct fragment_ctx *next;
};
/*----------------------------------------------------------------------------*/
/* Receive-side ring buffer and the TCP sequence bookkeeping attached to it. */
struct tcp_ring_buffer
{
u_char* data; /* buffered data */
u_char* head; /* pointer to the head */
uint32_t head_offset; /* offset for the head (head - data) */
uint32_t tail_offset; /* offset for the last byte (null byte) */
int merged_len; /* contiguously merged length */
uint64_t cum_len; /* cumulatively merged length */
int last_len; /* currently saved data length */
int size; /* total ring buffer size */
/* TCP payload features */
uint32_t head_seq;
uint32_t init_seq;
struct fragment_ctx* fctx; /* head of the fragment list */
};
/*----------------------------------------------------------------------------*/
/* Introspection/debug helpers */
uint32_t RBGetCurnum(rb_manager_t rbm);
void RBPrintInfo(struct tcp_ring_buffer* buff);
void RBPrintStr(struct tcp_ring_buffer* buff);
void RBPrintHex(struct tcp_ring_buffer* buff);
/*----------------------------------------------------------------------------*/
/* Create a manager; cnum is presumably the number of chunk_size chunks -- confirm. */
rb_manager_t RBManagerCreate(size_t chunk_size, uint32_t cnum);
/*----------------------------------------------------------------------------*/
/* Obtain a buffer from rbm whose sequence space starts at init_seq; RBFree returns it. */
struct tcp_ring_buffer* RBInit(rb_manager_t rbm, uint32_t init_seq);
void RBFree(rb_manager_t rbm, struct tcp_ring_buffer* buff);
uint32_t RBIsDanger(rb_manager_t rbm);
/*----------------------------------------------------------------------------*/
/* data manipulation functions */
/* Store len bytes of data located at sequence number seq. */
int RBPut(rb_manager_t rbm, struct tcp_ring_buffer* buff,
void* data, uint32_t len , uint32_t seq);
size_t RBGet(rb_manager_t rbm, struct tcp_ring_buffer* buff, size_t len);
/* option is presumably an enum rb_caller value (AT_APP / AT_MTCP) -- confirm. */
size_t RBRemove(rb_manager_t rbm, struct tcp_ring_buffer* buff,
size_t len, int option);
/*----------------------------------------------------------------------------*/
#endif

View File

@ -0,0 +1,22 @@
#ifndef __TCP_SB_QUEUE_
#define __TCP_SB_QUEUE_
#include "tcp_send_buffer.h"
/*---------------------------------------------------------------------------*/
/* Opaque handle to a queue of struct tcp_send_buffer pointers. */
typedef struct sb_queue* sb_queue_t;
/*---------------------------------------------------------------------------*/
/* Create a queue sized by `capacity`; the failure value is not visible here. */
sb_queue_t
CreateSBQueue(int capacity);
/*---------------------------------------------------------------------------*/
/* Destroy a queue created by CreateSBQueue(). */
void
DestroySBQueue(sb_queue_t sq);
/*---------------------------------------------------------------------------*/
/* Append buf to the queue; the int result presumably signals success/full. */
int
SBEnqueue(sb_queue_t sq, struct tcp_send_buffer *buf);
/*---------------------------------------------------------------------------*/
/* Remove and return one buffer (presumably NULL when empty -- confirm). */
struct tcp_send_buffer *
SBDequeue(sb_queue_t sq);
/*---------------------------------------------------------------------------*/
#endif /* __TCP_SB_QUEUE_ */

View File

@ -0,0 +1,44 @@
#ifndef __TCP_SEND_BUFFER_H_
#define __TCP_SEND_BUFFER_H_
#include <stdlib.h>
#include <stdint.h>
/*----------------------------------------------------------------------------*/
/* Opaque handle to the send buffer manager. */
typedef struct sb_manager* sb_manager_t;
/*----------------------------------------------------------------------------*/
/*
 * Send-side buffer and its sequence bookkeeping.
 * NOTE(review): the field notes are inferred from names and from the
 * parallel struct tcp_ring_buffer -- confirm against the implementation.
 */
struct tcp_send_buffer
{
unsigned char *data; /* start of the storage */
unsigned char *head; /* current head within data */
uint32_t head_off; /* presumably head - data */
uint32_t tail_off; /* presumably offset of the end of stored data */
uint32_t len; /* presumably bytes currently stored */
uint64_t cum_len; /* presumably cumulative bytes stored */
uint32_t size; /* presumably total capacity */
uint32_t head_seq;
uint32_t init_seq;
};
/*----------------------------------------------------------------------------*/
/* Return a count from the manager (presumably buffers currently in use -- confirm). */
uint32_t
SBGetCurnum(sb_manager_t sbm);
/*----------------------------------------------------------------------------*/
/* Create a manager; cnum is presumably the number of chunk_size chunks -- confirm. */
sb_manager_t
SBManagerCreate(size_t chunk_size, uint32_t cnum);
/*----------------------------------------------------------------------------*/
/* Obtain a send buffer whose sequence space starts at init_seq. */
struct tcp_send_buffer *
SBInit(sb_manager_t sbm, uint32_t init_seq);
/*----------------------------------------------------------------------------*/
/* Return buf to the manager. */
void
SBFree(sb_manager_t sbm, struct tcp_send_buffer *buf);
/*----------------------------------------------------------------------------*/
/* Append up to len bytes from data; the size_t result is presumably bytes stored. */
size_t
SBPut(sb_manager_t sbm, struct tcp_send_buffer *buf, void *data, size_t len);
/*----------------------------------------------------------------------------*/
/* Drop up to len bytes from the head; the size_t result is presumably bytes removed. */
size_t
SBRemove(sb_manager_t sbm, struct tcp_send_buffer *buf, size_t len);
/*----------------------------------------------------------------------------*/
#endif /* __TCP_SEND_BUFFER_H_ */

View File

@ -0,0 +1,230 @@
#ifndef __TCP_STREAM_H_
#define __TCP_STREAM_H_
#include <netinet/ip.h>
#include <linux/tcp.h>
#include <sys/queue.h>
#include "mtcp.h"
/*
 * Retransmission statistics: count/bytes pairs per cause.
 * NOTE(review): "tdp" presumably stands for triple duplicate -- confirm.
 */
struct rtm_stat
{
uint32_t tdp_ack_cnt;
uint32_t tdp_ack_bytes;
uint32_t ack_upd_cnt;
uint32_t ack_upd_bytes;
#if TCP_OPT_SACK_ENABLED
uint32_t sack_cnt;
uint32_t sack_bytes;
uint32_t tdp_sack_cnt;
uint32_t tdp_sack_bytes;
#endif /* TCP_OPT_SACK_ENABLED */
uint32_t rto_cnt;
uint32_t rto_bytes;
};
#if TCP_OPT_SACK_ENABLED
/* One SACK block: sequence range edges plus an expiry value. */
struct sack_entry
{
uint32_t left_edge;
uint32_t right_edge;
uint32_t expire;
};
#endif /* TCP_OPT_SACK_ENABLED */
/*
 * Receive-side state of a TCP stream (referenced from tcp_stream.rcvvar).
 * The lock type and the blocking-mode members depend on USE_SPIN_LOCK and
 * BLOCKING_SUPPORT.
 */
struct tcp_recv_vars
{
/* receiver variables */
uint32_t rcv_wnd; /* receive window (unscaled) */
//uint32_t rcv_up; /* receive urgent pointer */
uint32_t irs; /* initial receiving sequence */
uint32_t snd_wl1; /* segment seq number for last window update */
uint32_t snd_wl2; /* segment ack number for last window update */
/* variables for fast retransmission */
uint32_t last_ack_seq; /* highest ackd seq */
uint8_t dup_acks; /* number of duplicated acks */
/* timestamps */
uint32_t ts_recent; /* recent peer timestamp */
uint32_t ts_lastack_rcvd; /* last ack rcvd time */
uint32_t ts_last_ts_upd; /* last peer ts update time */
uint32_t ts_tw_expire; // timestamp for timewait expire
/* RTT estimation variables */
uint32_t srtt; /* smoothed round trip time << 3 (scaled) */
uint32_t mdev; /* medium deviation */
uint32_t mdev_max; /* maximal mdev for the last rtt period */
uint32_t rttvar; /* smoothed mdev_max */
uint32_t rtt_seq; /* sequence number to update rttvar */
#if TCP_OPT_SACK_ENABLED /* currently not used */
#define MAX_SACK_ENTRY 8
struct sack_entry sack_table[MAX_SACK_ENTRY];
/* NOTE(review): a 3-bit field holds 0..7, so it cannot represent MAX_SACK_ENTRY (8) */
uint8_t sacks:3;
#endif /* TCP_OPT_SACK_ENABLED */
struct tcp_ring_buffer *rcvbuf;
#if USE_SPIN_LOCK
pthread_spinlock_t read_lock;
#else
pthread_mutex_t read_lock;
#endif
TAILQ_ENTRY(tcp_stream) he_link; /* hash table entry link */
#if BLOCKING_SUPPORT
TAILQ_ENTRY(tcp_stream) rcv_br_link;
pthread_cond_t read_cond;
#endif
};
/*
 * Send-side state of a TCP stream (referenced from tcp_stream.sndvar).
 * The lock type, the retransmission statistics and the blocking-mode members
 * depend on USE_SPIN_LOCK, RTM_STAT and BLOCKING_SUPPORT.
 */
struct tcp_send_vars
{
/* IP-level information */
uint16_t ip_id;
uint16_t mss; /* maximum segment size */
uint16_t eff_mss; /* effective segment size (excluding tcp option) */
uint8_t wscale; /* window scale */
int8_t nif_out; /* cached output network interface */
unsigned char *d_haddr; /* cached destination MAC address */
/* send sequence variables */
uint32_t snd_una; /* send unacknowledged */
uint32_t snd_wnd; /* send window (unscaled) */
uint32_t peer_wnd; /* client window size */
//uint32_t snd_up; /* send urgent pointer (not used) */
uint32_t iss; /* initial sending sequence */
uint32_t fss; /* final sending sequence */
/* retransmission timeout variables */
uint8_t nrtx; /* number of retransmission */
uint8_t max_nrtx; /* max number of retransmission */
uint32_t rto; /* retransmission timeout */
uint32_t ts_rto; /* timestamp for retransmission timeout */
/* congestion control variables */
uint32_t cwnd; /* congestion window */
uint32_t ssthresh; /* slow start threshold */
/* timestamp */
uint32_t ts_lastack_sent; /* last ack sent time */
uint8_t is_wack:1, /* is ack for window advertisement? */
ack_cnt:6; /* number of acks to send. 6-bit field: max 63 */
/* list/queue membership markers */
uint8_t on_control_list;
uint8_t on_send_list;
uint8_t on_ack_list;
uint8_t on_sendq;
uint8_t on_ackq;
uint8_t on_closeq;
uint8_t on_resetq;
uint8_t on_closeq_int:1,
on_resetq_int:1,
is_fin_sent:1,
is_fin_ackd:1;
TAILQ_ENTRY(tcp_stream) control_link;
TAILQ_ENTRY(tcp_stream) send_link;
TAILQ_ENTRY(tcp_stream) ack_link;
TAILQ_ENTRY(tcp_stream) timer_link; /* timer link (rto list, tw list) */
TAILQ_ENTRY(tcp_stream) timeout_link; /* connection timeout link */
struct tcp_send_buffer *sndbuf;
#if USE_SPIN_LOCK
pthread_spinlock_t write_lock;
#else
pthread_mutex_t write_lock;
#endif
#if RTM_STAT
struct rtm_stat rstat; /* retransmission statistics */
#endif
#if BLOCKING_SUPPORT
TAILQ_ENTRY(tcp_stream) snd_br_link;
pthread_cond_t write_cond;
#endif
};
/*
 * One TCP connection: identity (addresses and ports in network order), state
 * and flags, next send/receive sequence numbers, and pointers to the separate
 * receive-side and send-side variable blocks.
 */
typedef struct tcp_stream
{
socket_map_t socket;
uint32_t id:24,
stream_type:8;
uint32_t saddr; /* in network order */
uint32_t daddr; /* in network order */
uint16_t sport; /* in network order */
uint16_t dport; /* in network order */
uint8_t state; /* tcp state */
uint8_t close_reason; /* close reason */
uint8_t on_hash_table;
uint8_t on_timewait_list;
uint8_t ht_idx;
uint8_t closed;
uint8_t is_bound_addr;
uint8_t need_wnd_adv;
/* NOTE(review): presumably the RTO list slot index, negative when not listed -- confirm */
int16_t on_rto_idx;
uint16_t on_timeout_list:1,
on_rcv_br_list:1,
on_snd_br_list:1,
saw_timestamp:1, /* whether peer sends timestamp */
sack_permit:1, /* whether peer permits SACK */
control_list_waiting:1,
have_reset:1;
uint32_t snd_nxt; /* send next */
uint32_t rcv_nxt; /* receive next */
struct tcp_recv_vars *rcvvar;
struct tcp_send_vars *sndvar;
uint32_t last_active_ts; /* ts_last_ack_sent or ts_last_ts_upd */
} tcp_stream;
/*
 * Return a textual name for the state of cur_stream.
 * NOTE(review): this and the other `inline` prototypes below have no body in
 * this header; whether that links depends on the compiler's inline model.
 */
inline char *
TCPStateToString(const tcp_stream *cur_stream);
/* Hash and equality callbacks over streams (presumably for the flow hash table). */
unsigned int
HashFlow(const tcp_stream *flow);
int
EqualFlow(const tcp_stream *flow1, const tcp_stream *flow2);
/* Add `event` for `socket` to queue `queue_type` of epoll object ep. */
inline int
AddEpollEvent(struct mtcp_epoll *ep,
int queue_type, socket_map_t socket, uint32_t event);
/* Event notification helpers for a stream */
inline void
RaiseReadEvent(mtcp_manager_t mtcp, tcp_stream *stream);
inline void
RaiseWriteEvent(mtcp_manager_t mtcp, tcp_stream *stream);
inline void
RaiseCloseEvent(mtcp_manager_t mtcp, tcp_stream *stream);
inline void
RaiseErrorEvent(mtcp_manager_t mtcp, tcp_stream *stream);
/*
 * Create a stream for the given 4-tuple, attached to `socket`.
 * NOTE(review): the failure value and the byte order expected for the
 * addresses/ports are not stated here (the struct fields are network order).
 */
tcp_stream *
CreateTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type,
uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport);
/* Destroy a stream created by CreateTCPStream(). */
void
DestroyTCPStream(mtcp_manager_t mtcp, tcp_stream *stream);
/* Debug helper: dump the state of a stream. */
void
DumpStream(mtcp_manager_t mtcp, tcp_stream *stream);
#endif /* __TCP_STREAM_H_ */

View File

@ -0,0 +1,78 @@
#ifndef __TCP_STREAM_QUEUE_
#define __TCP_STREAM_QUEUE_
#include <stdint.h>
/*
 * Lock definitions for stream queue.
 *
 * SQ_LOCK_INIT(lock, errmsg, action): initialize `lock`; on failure report
 * via perror() (errmsg must be a string literal, it is concatenated onto the
 * function name) and execute `action` (e.g. `return NULL` or `exit(-1)`).
 * SQ_LOCK_DESTROY / SQ_LOCK / SQ_UNLOCK: destroy, acquire, release.
 *
 * With LOCK_STREAM_QUEUE disabled every macro expands to a no-op expression.
 *
 * SQ_LOCK_INIT is wrapped in do { } while (0) so that it behaves as a single
 * statement: the previous definition carried a stray `;` after the parameter
 * list, making the expansion start with an empty statement, so inside an
 * unbraced if/else the initialization ran unconditionally. All variants now
 * need the caller's trailing semicolon, as the no-op variant always did.
 * Because of the do/while wrapper, `action` must not be `break` or `continue`.
 */
#if LOCK_STREAM_QUEUE
#if USE_SPIN_LOCK
#define SQ_LOCK_INIT(lock, errmsg, action) \
	do { \
		if (pthread_spin_init(lock, PTHREAD_PROCESS_PRIVATE)) { \
			perror("pthread_spin_init" errmsg); \
			action; \
		} \
	} while (0)
#define SQ_LOCK_DESTROY(lock) pthread_spin_destroy(lock)
#define SQ_LOCK(lock) pthread_spin_lock(lock)
#define SQ_UNLOCK(lock) pthread_spin_unlock(lock)
#else
#define SQ_LOCK_INIT(lock, errmsg, action) \
	do { \
		if (pthread_mutex_init(lock, NULL)) { \
			perror("pthread_mutex_init" errmsg); \
			action; \
		} \
	} while (0)
#define SQ_LOCK_DESTROY(lock) pthread_mutex_destroy(lock)
#define SQ_LOCK(lock) pthread_mutex_lock(lock)
#define SQ_UNLOCK(lock) pthread_mutex_unlock(lock)
#endif /* USE_SPIN_LOCK */
#else /* LOCK_STREAM_QUEUE */
#define SQ_LOCK_INIT(lock, errmsg, action) (void) 0
#define SQ_LOCK_DESTROY(lock) (void) 0
#define SQ_LOCK(lock) (void) 0
#define SQ_UNLOCK(lock) (void) 0
#endif /* LOCK_STREAM_QUEUE */
/*---------------------------------------------------------------------------*/
/* Opaque handle to a stream queue (see CreateStreamQueue). */
typedef struct stream_queue* stream_queue_t;
/*---------------------------------------------------------------------------*/
/*
 * "Internal" stream queue: an array of stream pointers with size,
 * first/last indices and an element count.
 * NOTE(review): the field set suggests a circular array queue -- confirm.
 */
typedef struct stream_queue_int
{
struct tcp_stream **array;
int size;
int first;
int last;
int count;
} stream_queue_int;
/*---------------------------------------------------------------------------*/
/*
 * Internal queue API (stream_queue_int).
 * NOTE(review): failure values and the meaning of the int results are not
 * visible in this header.
 */
stream_queue_int *
CreateInternalStreamQueue(int size);
/*---------------------------------------------------------------------------*/
void
DestroyInternalStreamQueue(stream_queue_int *sq);
/*---------------------------------------------------------------------------*/
int
StreamInternalEnqueue(stream_queue_int *sq, struct tcp_stream *stream);
/*---------------------------------------------------------------------------*/
struct tcp_stream *
StreamInternalDequeue(stream_queue_int *sq);
/*---------------------------------------------------------------------------*/
/* Opaque queue API (stream_queue_t), parallel to the internal one above */
stream_queue_t
CreateStreamQueue(int size);
/*---------------------------------------------------------------------------*/
void
DestroyStreamQueue(stream_queue_t sq);
/*---------------------------------------------------------------------------*/
int
StreamEnqueue(stream_queue_t sq, struct tcp_stream *stream);
/*---------------------------------------------------------------------------*/
struct tcp_stream *
StreamDequeue(stream_queue_t sq);
/*---------------------------------------------------------------------------*/
/* Presumably returns non-zero when the queue holds no stream -- confirm. */
int
StreamQueueIsEmpty(stream_queue_t sq);
/*---------------------------------------------------------------------------*/
#endif /* __TCP_STREAM_QUEUE_ */

View File

@ -0,0 +1,32 @@
#ifndef __TCP_UTIL_H_
#define __TCP_UTIL_H_
#include "mtcp.h"
#include "tcp_stream.h"
/*
 * Parsed TCP timestamp option: two 32-bit values.
 * NOTE(review): presumably the TSval and TSecr fields of the option -- confirm.
 */
struct tcp_timestamp
{
uint32_t ts_val;
uint32_t ts_ref;
};
/*
 * Parse the `len` bytes of TCP options at tcpopt for cur_stream.
 * NOTE(review): the same prototype also appears in tcp_in.h.
 */
void ParseTCPOptions(tcp_stream *cur_stream,
uint32_t cur_ts, uint8_t *tcpopt, int len);
/*
 * Extract the timestamp option into *ts.
 * NOTE(review): the int result presumably tells whether one was found.
 * Declared `inline` with no body in this header.
 */
inline int
ParseTCPTimestamp(tcp_stream *cur_stream,
struct tcp_timestamp *ts, uint8_t *tcpopt, int len);
#if TCP_OPT_SACK_ENABLED
/* Parse SACK blocks from the options; only built with TCP_OPT_SACK_ENABLED. */
void
ParseSACKOption(tcp_stream *cur_stream,
uint32_t ack_seq, uint8_t *tcpopt, int len);
#endif
/* Same prototype as in tcp_in.h. */
uint16_t
TCPCalcChecksum(uint16_t *buf, uint16_t len, uint32_t saddr, uint32_t daddr);
/* Debug helper: print the `len` bytes of TCP options at tcpopt. */
void
PrintTCPOptions(uint8_t *tcpopt, int len);
#endif /* __TCP_UTIL_H_ */

54
mtcp/src/include/timer.h Normal file
View File

@ -0,0 +1,54 @@
#ifndef __TIMER_H_
#define __TIMER_H_
#include "mtcp.h"
#include "tcp_stream.h"
/* Number of regular RTO list slots; rto_list below has one extra slot. */
#define RTO_HASH 3000
/*
 * Store for retransmission timers: RTO_HASH+1 stream lists plus a current
 * index and timestamp.
 * NOTE(review): the purpose of the extra slot at index RTO_HASH is not
 * visible in this header.
 */
struct rto_hashstore
{
uint32_t rto_now_idx; // pointing the hs_table_s index
uint32_t rto_now_ts; //
TAILQ_HEAD(rto_head , tcp_stream) rto_list[RTO_HASH+1];
};
/*
 * Create an RTO hash store and return it (failure value not visible here).
 * Declared with an explicit (void): in C an empty parameter list leaves the
 * arguments unspecified, so a mistaken call with arguments would not be
 * diagnosed. An old-style `InitRTOHashstore() { ... }` definition remains
 * compatible with this prototype.
 */
struct rto_hashstore*
InitRTOHashstore(void);
/*
 * Timer list membership helpers for a stream (RTO, time-wait and connection
 * timeout lists).
 * NOTE(review): declared `inline` with no body in this header; whether that
 * links depends on the compiler's inline model.
 */
inline void
AddtoRTOList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
RemoveFromRTOList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
AddtoTimewaitList(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts);
inline void
RemoveFromTimewaitList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
AddtoTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
RemoveFromTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
UpdateTimeoutList(mtcp_manager_t mtcp, tcp_stream *cur_stream);
inline void
UpdateRetransmissionTimer(mtcp_manager_t mtcp,
tcp_stream *cur_stream, uint32_t cur_ts);
/*
 * Periodic checks driven by the current timestamp.
 * `thresh` presumably bounds the number of streams handled per call -- confirm.
 */
void
CheckRtmTimeout(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh);
void
CheckTimewaitExpire(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh);
void
CheckConnectionTimeout(mtcp_manager_t mtcp, uint32_t cur_ts, int thresh);
#endif /* __TIMER_H_ */

56
mtcp/src/ip_in.c Normal file
View File

@ -0,0 +1,56 @@
#include <string.h>
#include <netinet/ip.h>
#include "ip_in.h"
#include "tcp_in.h"
#include "mtcp_api.h"
#include "ps.h"
#include "debug.h"
/*
 * Fragment markers.
 * NOTE(review): neither macro is referenced by ProcessIPv4Packet() below;
 * their intended use is not visible in this file.
 */
#define ETH_P_IP_FRAG 0xF800
#define ETH_P_IPV6_FRAG 0xF6DD
/*----------------------------------------------------------------------------*/
/*
 * Validate one received Ethernet frame that carries IPv4 and dispatch it.
 *
 * mtcp     - manager of the receiving core (provides the slow-path handle)
 * cur_ts   - current timestamp, forwarded to the TCP layer
 * ifidx    - index of the receiving interface in CONFIG.eths[]
 * pkt_data - start of the frame (Ethernet header first)
 * len      - number of bytes available at pkt_data
 *
 * Returns ERROR for a truncated or malformed packet or a bad header
 * checksum; TRUE when, in a non-promiscuous build, the packet is not
 * addressed to this interface; FALSE for a non-IPv4 version (after handing
 * the frame to the slow path) and for non-TCP protocols; otherwise the
 * result of ProcessTCPPacket().
 */
inline int
ProcessIPv4Packet(mtcp_manager_t mtcp, uint32_t cur_ts,
		const int ifidx, unsigned char* pkt_data, int len)
{
	const int eth_hlen = (int)sizeof(struct ethhdr);
	const int min_ip_hlen = (int)sizeof(struct iphdr);
	struct iphdr *iph;
	int ip_len;
	int ip_hlen;

	/* the frame must hold an Ethernet header and a minimal IPv4 header
	 * before any IP header field may be read */
	if (len < eth_hlen + min_ip_hlen)
		return ERROR;

	/* check and process IPv4 packets */
	iph = (struct iphdr *)(pkt_data + sizeof(struct ethhdr));
	ip_len = ntohs(iph->tot_len);
	ip_hlen = iph->ihl << 2;

	/* drop the packet shorter than ip header */
	if (ip_len < min_ip_hlen)
		return ERROR;

	/* the header length field must cover at least the fixed header and
	 * must not exceed the datagram it belongs to */
	if (ip_hlen < min_ip_hlen || ip_hlen > ip_len)
		return ERROR;

	/* the datagram must fit in the bytes actually received; otherwise
	 * the checksum and the TCP layer would read past the buffer
	 * (trailing link-layer padding is fine, hence only '>') */
	if (ip_len > len - eth_hlen)
		return ERROR;

	if (ip_fast_csum(iph, iph->ihl))
		return ERROR;

#if !PROMISCUOUS_MODE
	/* if not promiscuous mode, drop if the destination is not myself */
	if (iph->daddr != CONFIG.eths[ifidx].ip_addr) {
		//DumpIPPacketToFile(stderr, iph, ip_len);
		return TRUE;
	}
#endif

	// see if the version is correct
	if (iph->version != 0x4) {
		struct ps_packet packet;

		packet.ifindex = ifidx;
		packet.len = len;
		packet.buf = (char *)pkt_data;
		ps_slowpath_packet(mtcp->ctx->handle, &packet);
		return FALSE;
	}

	switch (iph->protocol) {
	case IPPROTO_TCP:
		return ProcessTCPPacket(mtcp, cur_ts, iph, ip_len);
	default:
		/* currently drop other protocols */
		return FALSE;
	}
	return FALSE;
}
/*----------------------------------------------------------------------------*/

129
mtcp/src/ip_out.c Normal file
View File

@ -0,0 +1,129 @@
#include "ip_out.h"
#include "ip_in.h"
#include "eth_out.h"
#include "arp.h"
#include "debug.h"
/*----------------------------------------------------------------------------*/
/*
 * Select the output interface for destination `daddr` by longest prefix
 * match over CONFIG.rtable[0 .. CONFIG.routes-1].
 *
 * daddr is compared after masking with each entry's mask, so it must be in
 * the same byte order as the table's mask/masked fields.
 *
 * Returns the interface index of the best matching route, or -1 when no
 * route matches (an error is traced, and assert(0) fires in builds with
 * assertions enabled).
 */
inline int
GetOutputInterface(uint32_t daddr)
{
	int nif = -1;
	int i;
	int prefix = 0;

	/* Longest prefix matching */
	for (i = 0; i < CONFIG.routes; i++) {
		if ((daddr & CONFIG.rtable[i].mask) != CONFIG.rtable[i].masked)
			continue;

		/*
		 * Accept the first match unconditionally, then only longer
		 * prefixes. Comparing against an initial prefix of 0 alone
		 * would never select a matching /0 (default) route.
		 */
		if (nif < 0 || CONFIG.rtable[i].prefix > prefix) {
			nif = CONFIG.rtable[i].nif;
			prefix = CONFIG.rtable[i].prefix;
		}
	}

	if (nif < 0) {
		uint8_t *da = (uint8_t *)&daddr;
		TRACE_ERROR("[WARNING] No route to %u.%u.%u.%u\n",
				da[0], da[1], da[2], da[3]);
		assert(0);
	}

	return nif;
}
/*----------------------------------------------------------------------------*/
/*
 * Reserve an outgoing frame for a TCP segment that is not tied to a
 * tcp_stream and fill in its IPv4 header.
 *
 * ip_id is converted with htons(); saddr and daddr are stored into the
 * header unchanged; tcplen is the number of bytes that will follow the
 * IP header.
 *
 * Returns a pointer to the first byte after the IP header, or NULL when no
 * route exists, when the destination MAC address is unknown (an ARP request
 * is issued in that case), or when no frame could be obtained.
 */
uint8_t *
IPOutputStandalone(struct mtcp_manager *mtcp,
		uint16_t ip_id, uint32_t saddr, uint32_t daddr, uint16_t tcplen)
{
	struct iphdr *hdr;
	unsigned char *dst_mac;
	const int out_if = GetOutputInterface(daddr);

	if (out_if < 0)
		return NULL;

	dst_mac = GetDestinationHWaddr(daddr);
	if (dst_mac == NULL) {
#if 0
		uint8_t *da = (uint8_t *)&daddr;
		TRACE_INFO("[WARNING] The destination IP %u.%u.%u.%u "
				"is not in ARP table!\n",
				da[0], da[1], da[2], da[3]);
#endif
		/* unresolved destination: ask for it and give up for now */
		RequestARP(mtcp, daddr, out_if, mtcp->cur_ts);
		return NULL;
	}

	hdr = (struct iphdr *)EthernetOutput(mtcp,
			ETH_P_IP, out_if, dst_mac, tcplen + IP_HEADER_LEN);
	if (hdr == NULL)
		return NULL;

	/* addressing and payload protocol */
	hdr->saddr = saddr;
	hdr->daddr = daddr;
	hdr->protocol = IPPROTO_TCP;

	/* fixed header fields */
	hdr->version = 4;
	hdr->ihl = IP_HEADER_LEN >> 2;
	hdr->tos = 0;
	hdr->ttl = 64;
	hdr->frag_off = htons(0x4000);	// no fragmentation

	/* per-packet fields */
	hdr->id = htons(ip_id);
	hdr->tot_len = htons(IP_HEADER_LEN + tcplen);

	/* the checksum is computed last, over a header whose check field is zero */
	hdr->check = 0;
	hdr->check = ip_fast_csum(hdr, hdr->ihl);

	return (uint8_t *)&hdr[1];
}
/*----------------------------------------------------------------------------*/
/*
 * Reserve an outgoing frame for a TCP segment of `stream` and fill in its
 * IPv4 header.
 *
 * The output interface is taken from stream->sndvar->nif_out when it is
 * non-negative; otherwise it is looked up and cached there. The IP id is
 * taken from stream->sndvar->ip_id, which is post-incremented.
 *
 * Returns a pointer to the first byte after the IP header, or NULL when no
 * route exists, when the destination MAC address is unknown (an ARP request
 * is issued and TCP is expected to retry later), or when no frame could be
 * obtained.
 */
uint8_t *
IPOutput(struct mtcp_manager *mtcp, tcp_stream *stream, uint16_t tcplen)
{
	struct iphdr *iph;
	int nif;
	unsigned char *haddr;

	if (stream->sndvar->nif_out >= 0) {
		nif = stream->sndvar->nif_out;
	} else {
		nif = GetOutputInterface(stream->daddr);
		/*
		 * No route: bail out as IPOutputStandalone() does instead of
		 * handing a negative interface index to the ARP and Ethernet
		 * layers. nif_out stays negative, so the lookup is retried on
		 * the next call.
		 */
		if (nif < 0)
			return NULL;
		stream->sndvar->nif_out = nif;
	}

	haddr = GetDestinationHWaddr(stream->daddr);
	if (!haddr) {
#if 0
		uint8_t *da = (uint8_t *)&stream->daddr;
		TRACE_INFO("[WARNING] The destination IP %u.%u.%u.%u "
				"is not in ARP table!\n",
				da[0], da[1], da[2], da[3]);
#endif
		/* if not found in the arp table, send arp request and return NULL */
		/* tcp will retry sending the packet later */
		RequestARP(mtcp, stream->daddr, stream->sndvar->nif_out, mtcp->cur_ts);
		return NULL;
	}

	iph = (struct iphdr *)EthernetOutput(mtcp, ETH_P_IP,
			stream->sndvar->nif_out, haddr, tcplen + IP_HEADER_LEN);
	if (!iph) {
		return NULL;
	}

	iph->ihl = IP_HEADER_LEN >> 2;
	iph->version = 4;
	iph->tos = 0;
	iph->tot_len = htons(IP_HEADER_LEN + tcplen);
	iph->id = htons(stream->sndvar->ip_id++);
	iph->frag_off = htons(0x4000);	// no fragmentation
	iph->ttl = 64;
	iph->protocol = IPPROTO_TCP;
	iph->saddr = stream->saddr;
	iph->daddr = stream->daddr;
	/* the checksum is computed over a header whose check field is zero */
	iph->check = 0;
	iph->check = ip_fast_csum(iph, iph->ihl);

	return (uint8_t *)(iph + 1);
}

170
mtcp/src/logger.c Normal file
View File

@ -0,0 +1,170 @@
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <pthread.h>
#include "cpu.h"
#include "debug.h"
#include "logger.h"
/*----------------------------------------------------------------------------*/
/*
 * Return a log buffer to the tail of the context's free queue.
 * The free queue and its counter are updated while holding free_mutex.
 */
static void
EnqueueFreeBuffer(log_thread_context *ctx, log_buff *free_bp)
{
	pthread_mutex_lock(&ctx->free_mutex);

	TAILQ_INSERT_TAIL(&ctx->free_queue, free_bp, buff_link);
	ctx->free_buff_cnt += 1;

	/* neither the free count nor free + queued may exceed the pool size */
	assert(ctx->free_buff_cnt <= NUM_LOG_BUFF);
	assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF);

	pthread_mutex_unlock(&ctx->free_mutex);
}
/*----------------------------------------------------------------------------*/
/*
 * Take the first buffer off the context's free queue.
 * Returns NULL when the free queue is empty. The queue and its counter
 * are accessed while holding free_mutex.
 */
log_buff*
DequeueFreeBuffer(log_thread_context *ctx)
{
	log_buff *head;

	pthread_mutex_lock(&ctx->free_mutex);

	head = TAILQ_FIRST(&ctx->free_queue);
	if (head != NULL) {
		TAILQ_REMOVE(&ctx->free_queue, head, buff_link);
		ctx->free_buff_cnt -= 1;
	}

	assert(ctx->free_buff_cnt >= 0);
	assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF);

	pthread_mutex_unlock(&ctx->free_mutex);

	return head;
}
/*----------------------------------------------------------------------------*/
/*
 * Append a filled buffer to the working (job) queue and mark the log
 * thread context as active.
 *
 * This function takes no lock itself.
 * NOTE(review): presumably the caller holds ctx->mutex, the lock
 * DequeueJobBuffer() uses for the same queue - confirm against callers.
 */
void
EnqueueJobBuffer(log_thread_context *ctx, log_buff *working_bp)
{
	TAILQ_INSERT_TAIL(&ctx->working_queue, working_bp, buff_link);
	ctx->job_buff_cnt += 1;
	ctx->state = ACTIVE_LOGT;

	assert(ctx->job_buff_cnt <= NUM_LOG_BUFF);

	/* report the counters before the assertion below fires */
	if (NUM_LOG_BUFF < ctx->free_buff_cnt + ctx->job_buff_cnt) {
		TRACE_ERROR("free_buff_cnt(%d) + job_buff_cnt(%d) > NUM_LOG_BUFF(%d)\n",
				ctx->free_buff_cnt, ctx->job_buff_cnt, NUM_LOG_BUFF);
	}
	assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF);
}
/*----------------------------------------------------------------------------*/
/*
 * Take the first buffer off the working (job) queue while holding
 * ctx->mutex. When the queue is empty the context state is set to
 * IDLE_LOGT and NULL is returned.
 */
static log_buff*
DequeueJobBuffer(log_thread_context *ctx)
{
	log_buff *job;

	pthread_mutex_lock(&ctx->mutex);

	job = TAILQ_FIRST(&ctx->working_queue);
	if (job == NULL) {
		/* nothing queued: let the log thread go idle */
		ctx->state = IDLE_LOGT;
	} else {
		TAILQ_REMOVE(&ctx->working_queue, job, buff_link);
		ctx->job_buff_cnt -= 1;
	}

	assert(ctx->job_buff_cnt >= 0);
	assert(ctx->free_buff_cnt + ctx->job_buff_cnt <= NUM_LOG_BUFF);

	pthread_mutex_unlock(&ctx->mutex);

	return job;
}
/*----------------------------------------------------------------------------*/
/*
 * Initialize a log thread context for the given CPU.
 *
 * Zeroes the context, creates the pipe used to wake the log thread
 * (sp_fd is the read end, pair_sp_fd the write end), initializes both
 * mutexes and both queues, and places NUM_LOG_BUFF freshly allocated
 * buffers on the free queue.
 *
 * The process exits with status 1 if the pipe cannot be created or the
 * buffers cannot be allocated.
 */
void
InitLogThreadContext(struct log_thread_context *ctx, int cpu)
{
	int i;
	int sv[2];
	log_buff *w_buff;

	/* initialize log_thread_context */
	memset(ctx, 0, sizeof(struct log_thread_context));
	ctx->cpu = cpu;
	ctx->state = IDLE_LOGT;
	ctx->done = 0;

	if (pipe(sv)) {
		fprintf(stderr, "pipe() failed, errno=%d, errstr=%s\n",
				errno, strerror(errno));
		exit(1);
	}
	ctx->sp_fd = sv[0];
	ctx->pair_sp_fd = sv[1];

	pthread_mutex_init(&ctx->mutex, NULL);
	pthread_mutex_init(&ctx->free_mutex, NULL);

	TAILQ_INIT(&ctx->working_queue);
	TAILQ_INIT(&ctx->free_queue);

	/* initialize free log_buff */
	w_buff = malloc(sizeof(*w_buff) * NUM_LOG_BUFF);
	if (w_buff == NULL) {
		/*
		 * An assert() alone disappears under NDEBUG and would let
		 * the loop below dereference NULL; fail the same way the
		 * pipe() error above does.
		 */
		fprintf(stderr, "malloc() failed, errno=%d, errstr=%s\n",
				errno, strerror(errno));
		exit(1);
	}
	for (i = 0; i < NUM_LOG_BUFF; i++) {
		EnqueueFreeBuffer(ctx, &w_buff[i]);
	}
}
/*----------------------------------------------------------------------------*/
/*
 * Entry point of the log thread.
 *
 * arg is the log_thread_context of this thread. The thread pins itself
 * to ctx->cpu, then until ctx->done is set it repeatedly drains the job
 * queue (writing each buffer to its file with fwrite() and returning it
 * to the free queue) and, while the context is idle, blocks in read() on
 * ctx->sp_fd. After ctx->done is observed the job queue is drained one
 * last time.
 *
 * Each drain pass is capped at NUM_LOG_BUFF + 1 buffers. The buffer that
 * trips the cap is still written and recycled before the pass ends, so
 * it is not dropped from both queues.
 *
 * Always returns NULL.
 */
void *
ThreadLogMain(void* arg)
{
	size_t len;
	log_thread_context* ctx = (log_thread_context *) arg;
	log_buff* w_buff;
	int cnt;
	int over_limit;

	mtcp_core_affinitize(ctx->cpu);
	TRACE_LOG("Log thread %d is starting.\n", ctx->cpu);

	while (!ctx->done) {
		/* handle every jobs in job buffer*/
		cnt = 0;
		while ((w_buff = DequeueJobBuffer(ctx))){
			over_limit = (++cnt > NUM_LOG_BUFF);
			if (over_limit) {
				TRACE_ERROR("CPU %d: Exceed NUM_LOG_BUFF %d.\n",
						ctx->cpu, cnt);
			}
			len = fwrite(w_buff->buff, 1, w_buff->buff_len, w_buff->fid);
			if (len != w_buff->buff_len) {
				TRACE_ERROR("CPU %d: Tried to write %d, but only write %ld\n",
						ctx->cpu, w_buff->buff_len, (long)len);
			}
			EnqueueFreeBuffer(ctx, w_buff);
			if (over_limit) {
				/* end this pass; the outer loop starts a new one */
				break;
			}
		}

		/* wait for a wake-up byte while there is nothing to do */
		while (ctx->state == IDLE_LOGT && !ctx->done) {
			char temp[1];
			int ret = read(ctx->sp_fd, temp, 1);
			if (ret)
				break;
		}
	}

	TRACE_LOG("Log thread %d out of first loop.\n", ctx->cpu);
	/* handle every jobs in job buffer*/
	cnt = 0;
	while ((w_buff = DequeueJobBuffer(ctx))){
		over_limit = (++cnt > NUM_LOG_BUFF);
		if (over_limit) {
			TRACE_ERROR("CPU %d: "
					"Exceed NUM_LOG_BUFF %d in final loop.\n", ctx->cpu, cnt);
		}
		len = fwrite(w_buff->buff, 1, w_buff->buff_len, w_buff->fid);
		assert(len == w_buff->buff_len);
		EnqueueFreeBuffer(ctx, w_buff);
		if (over_limit) {
			break;
		}
	}

	TRACE_LOG("Log thread %d finished.\n", ctx->cpu);
	pthread_exit(NULL);

	return NULL;
}
/*----------------------------------------------------------------------------*/

187
mtcp/src/memory_mgt.c Normal file
View File

@ -0,0 +1,187 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <sys/mman.h>
#include <unistd.h>
#ifdef HUGETABLE
#include <hugetlbfs.h>
#endif
#include "debug.h"
#include "memory_mgt.h"
/*----------------------------------------------------------------------------*/
/*
 * Free-list header stored at the start of a free chunk.
 * One header describes a run of free chunks laid out back to back.
 */
typedef struct tag_mem_chunk
{
	int mc_free_chunks;		/* free chunks in the run starting here */
	struct tag_mem_chunk *mc_next;	/* header of the next free run, or NULL */
} mem_chunk;
/*----------------------------------------------------------------------------*/
typedef mem_chunk *mem_chunk_t;

#ifdef HUGETABLE
/* backing memory type of a pool, stored in mem_pool.mp_type */
enum { MEM_NORMAL, MEM_HUGEPAGE };
#endif
/*----------------------------------------------------------------------------*/
/* Fixed-size chunk allocator state, created by MPCreate(). */
typedef struct mem_pool
{
u_char *mp_startptr; /* start of the memory region backing the pool */
mem_chunk_t mp_freeptr; /* head of the free-chunk list (NULL when empty) */
int mp_free_chunks; /* number of chunks currently free */
int mp_total_chunks; /* total number of chunks in the pool */
int mp_chunk_size; /* chunk size in bytes */
int mp_type; /* is_hugepage value given to MPCreate() */
} mem_pool;
/*----------------------------------------------------------------------------*/
/*
 * Create a pool of fixed-size chunks.
 *
 * chunk_size:  size of one chunk in bytes; must be positive, at least
 *              sizeof(mem_chunk) and a multiple of 4
 * total_size:  requested pool size in bytes; rounded up to a whole number
 *              of chunks
 * is_hugepage: stored in mp_type; when built with HUGETABLE and equal to
 *              MEM_HUGEPAGE the region comes from get_huge_pages(),
 *              otherwise from posix_memalign() with page alignment
 *
 * Returns the new pool, or NULL if chunk_size is invalid or the region
 * cannot be allocated. Exits the process if the pool descriptor itself
 * cannot be allocated. When running as superuser the region is mlock()ed;
 * a failure of mlock() is only reported.
 */
mem_pool *
MPCreate(int chunk_size, size_t total_size, int is_hugepage)
{
	int res;
	mem_pool_t mp;

	/*
	 * Test the sign first: compared directly against sizeof(), a negative
	 * chunk_size is converted to a huge unsigned value and slips through.
	 */
	if (chunk_size <= 0 || (size_t)chunk_size < sizeof(mem_chunk)) {
		TRACE_ERROR("The chunk size should be larger than %lu. current: %d\n",
				(unsigned long)sizeof(mem_chunk), chunk_size);
		return NULL;
	}
	if (chunk_size % 4 != 0) {
		TRACE_ERROR("The chunk size should be multiply of 4!\n");
		return NULL;
	}

	if ((mp = calloc(1, sizeof(mem_pool))) == NULL) {
		perror("calloc failed");
		exit(EXIT_FAILURE);
	}
	mp->mp_type = is_hugepage;
	mp->mp_chunk_size = chunk_size;
	/* round the request up to a whole number of chunks */
	mp->mp_free_chunks = ((total_size + (chunk_size - 1)) / chunk_size);
	mp->mp_total_chunks = mp->mp_free_chunks;
	total_size = chunk_size * ((size_t)mp->mp_free_chunks);

	/* allocate the big memory chunk */
#ifdef HUGETABLE
	if (is_hugepage == MEM_HUGEPAGE) {
		mp->mp_startptr = get_huge_pages(total_size, NULL);
		if (!mp->mp_startptr) {
			TRACE_ERROR("get_huge_pages failed, size=%lu\n",
					(unsigned long)total_size);
			assert(0);
			free(mp);
			return (NULL);
		}
	} else {
#endif
		res = posix_memalign((void **)&mp->mp_startptr, getpagesize(), total_size);
		if (res != 0) {
			TRACE_ERROR("posix_memalign failed, size=%lu\n",
					(unsigned long)total_size);
			assert(0);
			free(mp);
			return (NULL);
		}
#ifdef HUGETABLE
	}
#endif

	/* try mlock only for superuser */
	if (geteuid() == 0) {
		if (mlock(mp->mp_startptr, total_size) < 0)
			TRACE_ERROR("m_lock failed, size=%lu\n",
					(unsigned long)total_size);
	}

	if (mp->mp_free_chunks > 0) {
		/* the whole region starts out as one run of free chunks */
		mp->mp_freeptr = (mem_chunk_t)mp->mp_startptr;
		mp->mp_freeptr->mc_free_chunks = mp->mp_free_chunks;
		mp->mp_freeptr->mc_next = NULL;
	} else {
		/* empty pool: there is no chunk to hold a free-list header */
		mp->mp_freeptr = NULL;
	}

	return mp;
}
/*----------------------------------------------------------------------------*/
/*
 * Take one chunk from the pool.
 *
 * Returns the chunk at the head of the free list, or NULL when the pool
 * has no free chunks. If the head run contains further chunks, its
 * header is moved forward by one chunk; otherwise the list head advances
 * to the next run.
 */
void *
MPAllocateChunk(mem_pool_t mp)
{
	mem_chunk_t p = mp->mp_freeptr;

	if (mp->mp_free_chunks == 0)
		return (NULL);

	/* a single run can never hold more chunks than the pool has free */
	assert(p->mc_free_chunks > 0 && p->mc_free_chunks <= mp->mp_free_chunks);

	p->mc_free_chunks--;
	mp->mp_free_chunks--;
	if (p->mc_free_chunks) {
		/* move right by one chunk */
		mp->mp_freeptr = (mem_chunk_t)((u_char *)p + mp->mp_chunk_size);
		mp->mp_freeptr->mc_free_chunks = p->mc_free_chunks;
		mp->mp_freeptr->mc_next = p->mc_next;
	}
	else {
		mp->mp_freeptr = p->mc_next;
	}

	return p;
}
/*----------------------------------------------------------------------------*/
/*
 * Give a chunk back to the pool.
 *
 * p must lie on a chunk boundary relative to the pool's start address
 * (asserted). The chunk becomes a one-chunk run at the head of the free
 * list.
 */
void
MPFreeChunk(mem_pool_t mp, void *p)
{
	mem_chunk_t released = (mem_chunk_t)p;

	assert(((u_char *)p - mp->mp_startptr) % mp->mp_chunk_size == 0);

	/* push the chunk onto the front of the free list */
	released->mc_next = mp->mp_freeptr;
	released->mc_free_chunks = 1;
	mp->mp_freeptr = released;
	mp->mp_free_chunks += 1;
}
/*----------------------------------------------------------------------------*/
/*
 * Release a pool: first its backing region, then the descriptor.
 * With HUGETABLE, a pool whose mp_type is MEM_HUGEPAGE releases the
 * region through free_huge_pages(); every other pool uses free().
 */
void
MPDestroy(mem_pool_t mp)
{
#ifdef HUGETABLE
	if (mp->mp_type == MEM_HUGEPAGE)
		free_huge_pages(mp->mp_startptr);
	else
#endif
		free(mp->mp_startptr);

	free(mp);
}
/*----------------------------------------------------------------------------*/
/* Report how many chunks of the pool are currently free. */
int
MPGetFreeChunks(mem_pool_t mp)
{
	const int free_now = mp->mp_free_chunks;

	return free_now;
}
/*----------------------------------------------------------------------------*/
/*
 * Check whether pool usage has passed the danger threshold.
 *
 * Returns 0 while the number of chunks in use does not exceed 95% of the
 * pool (truncated to an integer). Above that it returns how many chunks
 * are in use beyond the 90% mark.
 */
uint32_t
MPIsDanger(mem_pool_t mp)
{
#define DANGER_THREASHOLD 0.95
#define SAFE_THREASHOLD 0.90
	const uint32_t danger_limit = mp->mp_total_chunks * DANGER_THREASHOLD;
	const uint32_t safe_limit = mp->mp_total_chunks * SAFE_THREASHOLD;
	const uint32_t in_use = (uint32_t)(mp->mp_total_chunks - mp->mp_free_chunks);

	if (danger_limit >= in_use)
		return 0;

	return in_use - safe_limit;
}
/*----------------------------------------------------------------------------*/
/*
 * Returns 1 when the number of chunks in use exceeds 90% of the pool
 * (truncated to an integer), 0 otherwise.
 */
uint32_t
MPIsOverSafeline(mem_pool_t mp)
{
#define SAFELINE 0.90
	const uint32_t safe_limit = mp->mp_total_chunks * SAFELINE;
	const uint32_t in_use = (uint32_t)(mp->mp_total_chunks - mp->mp_free_chunks);

	return (safe_limit < in_use) ? 1 : 0;
}
/*----------------------------------------------------------------------------*/

418
mtcp/src/pipe.c Normal file
View File

@ -0,0 +1,418 @@
#include <pthread.h>
#include <errno.h>
#include "pipe.h"
#include "eventpoll.h"
#include "tcp_stream.h"
#include "mtcp.h"
#include "debug.h"
/* size in bytes of the buffer mtcp_pipe() allocates for each pipe */
#define PIPE_BUF_SIZE 10240
/* NOTE: both macros evaluate an argument twice; pass side-effect-free expressions */
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
/*---------------------------------------------------------------------------*/
/* Life-cycle states of a pipe, stored in struct pipe.state. */
enum pipe_state
{
	PIPE_CLOSED = 0,	/* value of a zero-initialized pipe */
	PIPE_ACTIVE = 1,	/* set by mtcp_pipe() once the pipe is ready */
	PIPE_CLOSE_WAIT = 2,	/* set by the first PipeClose() on an active pipe */
};
/*---------------------------------------------------------------------------*/
/* State shared by the two endpoint sockets of one pipe. */
struct pipe
{
int state; /* one of enum pipe_state */
socket_map_t socket[2]; /* the two endpoint sockets set up by mtcp_pipe() */
char *buf; /* data buffer of buf_size bytes, used as a ring */
int buf_off; /* offset in buf of the next byte PipeRead() copies out */
int buf_tail; /* offset in buf where PipeWrite() stores the next byte */
int buf_len; /* number of bytes currently buffered */
int buf_size; /* capacity of buf in bytes */
pthread_mutex_t pipe_lock; /* held by PipeRead()/PipeWrite() while the buffer is modified */
pthread_cond_t pipe_cond; /* waited on by blocking reads/writes; signalled by RaiseEventToPair() */
};
/*---------------------------------------------------------------------------*/
/*
 * Create a pipe made of two MTCP_SOCK_PIPE sockets sharing one buffer.
 *
 * mctx:   mTCP context the sockets are allocated from
 * pipeid: receives the ids of the two endpoint sockets
 *
 * Returns 0 on success. On failure returns -1 with errno set (ENFILE when
 * no socket is available, ENOMEM when memory allocation fails, or the
 * error number returned by pthread_mutex_init()/pthread_cond_init());
 * everything acquired so far is released.
 */
int
mtcp_pipe(mctx_t mctx, int pipeid[2])
{
	socket_map_t socket[2];
	struct pipe *pp;
	int ret;
	int err;

	socket[0] = AllocateSocket(mctx, MTCP_SOCK_PIPE, FALSE);
	if (!socket[0]) {
		errno = ENFILE;
		return -1;
	}
	socket[1] = AllocateSocket(mctx, MTCP_SOCK_PIPE, FALSE);
	if (!socket[1]) {
		err = ENFILE;
		goto err_free_sock0;
	}

	pp = calloc(1, sizeof(*pp));
	if (!pp) {
		err = ENOMEM;
		goto err_free_sock1;
	}

	pp->buf_size = PIPE_BUF_SIZE;
	pp->buf = malloc(pp->buf_size);
	if (!pp->buf) {
		err = ENOMEM;
		goto err_free_pp;
	}

	/* pthread functions return the error number instead of setting errno */
	ret = pthread_mutex_init(&pp->pipe_lock, NULL);
	if (ret) {
		err = ret;
		goto err_free_buf;
	}
	ret = pthread_cond_init(&pp->pipe_cond, NULL);
	if (ret) {
		err = ret;
		goto err_destroy_lock;
	}

	pp->state = PIPE_ACTIVE;
	pp->socket[0] = socket[0];
	pp->socket[1] = socket[1];
	socket[0]->pp = pp;
	socket[1]->pp = pp;

	pipeid[0] = socket[0]->id;
	pipeid[1] = socket[1]->id;

	return 0;

	/* unwind in reverse order of acquisition */
err_destroy_lock:
	/* must happen before pp is freed: the mutex lives inside pp */
	pthread_mutex_destroy(&pp->pipe_lock);
err_free_buf:
	free(pp->buf);
err_free_pp:
	free(pp);
err_free_sock1:
	FreeSocket(mctx, socket[1]->id, FALSE);
err_free_sock0:
	FreeSocket(mctx, socket[0]->id, FALSE);
	errno = err;
	return -1;
}
/*---------------------------------------------------------------------------*/
/*
 * Notify the other endpoint of the pipe that `socket` belongs to.
 *
 * A peer without MTCP_NONBLOCK is woken through the pipe's condition
 * variable. A non-blocking peer gets `event` queued on mtcp->ep, but
 * only when it has epoll interest registered.
 */
static void
RaiseEventToPair(mtcp_manager_t mtcp, socket_map_t socket, uint32_t event)
{
	struct pipe *pp = socket->pp;
	socket_map_t peer = (pp->socket[0] == socket) ?
			pp->socket[1] : pp->socket[0];

	if (!(peer->opts & MTCP_NONBLOCK)) {
		pthread_cond_signal(&pp->pipe_cond);
		return;
	}

	if (peer->epoll)
		AddEpollEvent(mtcp->ep, USR_EVENT_QUEUE, peer, event);
}
/*---------------------------------------------------------------------------*/
/*
 * Read up to len bytes from a pipe into buf.
 *
 * On a blocking socket the call waits until data is buffered or the pipe
 * leaves PIPE_ACTIVE. If the buffer was full before the read, the peer is
 * notified with MTCP_EPOLLOUT. After the read, an MTCP_EPOLLIN event is
 * queued for this socket when data remains and the socket is registered
 * level-triggered for MTCP_EPOLLIN, or when the pipe is in CLOSE_WAIT and
 * now empty.
 *
 * Returns the number of bytes copied; 0 when the pipe is in CLOSE_WAIT
 * with nothing buffered, or when len <= 0 on a blocking socket; -1 on
 * error with errno set (EBADF for a non-pipe socket or a socket without
 * a pipe, EINVAL for a closed pipe, EAGAIN when nothing can be read on a
 * non-blocking socket, or the error number from pthread_cond_wait()).
 */
int
PipeRead(mctx_t mctx, int pipeid, char *buf, int len)
{
	mtcp_manager_t mtcp;
	socket_map_t socket;
	struct pipe *pp;
	int to_read;
	int to_notify;
	int remaining;
	int state;
	int ret;

	mtcp = GetMTCPManager(mctx);
	if (!mtcp) {
		return -1;
	}
	socket = GetSocket(mctx, pipeid);
	if (!socket) {
		return -1;
	}
	if (socket->socktype != MTCP_SOCK_PIPE) {
		errno = EBADF;
		return -1;
	}
	pp = socket->pp;
	if (!pp) {
		errno = EBADF;
		return -1;
	}
	if (pp->state == PIPE_CLOSED) {
		errno = EINVAL;
		return -1;
	}
	if (pp->state == PIPE_CLOSE_WAIT && pp->buf_len == 0) {
		return 0;
	}

	if (len <= 0) {
		if (socket->opts & MTCP_NONBLOCK) {
			errno = EAGAIN;
			return -1;
		} else {
			return 0;
		}
	}

	pthread_mutex_lock(&pp->pipe_lock);
	if (!(socket->opts & MTCP_NONBLOCK)) {
		/*
		 * Also stop waiting when the pipe is no longer active;
		 * otherwise a close on the other end, which signals the
		 * condition without adding data, would leave this reader
		 * blocked forever.
		 */
		while (pp->buf_len == 0 && pp->state == PIPE_ACTIVE) {
			ret = pthread_cond_wait(&pp->pipe_cond, &pp->pipe_lock);
			if (ret) {
				pthread_mutex_unlock(&pp->pipe_lock);
				/* pthread functions return the error number */
				errno = ret;
				return -1;
			}
		}
	}

	to_read = MIN(len, pp->buf_len);
	if (to_read <= 0) {
		state = pp->state;
		pthread_mutex_unlock(&pp->pipe_lock);
		if (state == PIPE_CLOSE_WAIT) {
			return 0;
		}
		/* every path returns here: the lock is already released */
		errno = (state == PIPE_ACTIVE) ? EAGAIN : EINVAL;
		return -1;
	}

	/* if the buffer was full, notify the write event to the pair socket */
	to_notify = (pp->buf_len == pp->buf_size);

	if (pp->buf_off + to_read < pp->buf_size) {
		memcpy(buf, pp->buf + pp->buf_off, to_read);
		pp->buf_off += to_read;
	} else {
		/* the data wraps around the end of the buffer */
		int temp_read = pp->buf_size - pp->buf_off;
		memcpy(buf, pp->buf + pp->buf_off, temp_read);
		memcpy(buf + temp_read, pp->buf, to_read - temp_read);
		pp->buf_off = to_read - temp_read;
	}
	pp->buf_len -= to_read;

	/* notify to the pair socket for new buffer space */
	if (to_notify) {
		RaiseEventToPair(mtcp, socket, MTCP_EPOLLOUT);
	}

	/* snapshot shared state while the lock is still held */
	remaining = pp->buf_len;
	state = pp->state;
	pthread_mutex_unlock(&pp->pipe_lock);

	/* if level triggered, raise event for remaining buffer */
	if (remaining > 0) {
		if ((socket->epoll & MTCP_EPOLLIN) && !(socket->epoll & MTCP_EPOLLET)) {
			AddEpollEvent(mtcp->ep,
					USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN);
		}
	} else if (state == PIPE_CLOSE_WAIT) {
		AddEpollEvent(mtcp->ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN);
	}

	return to_read;
}
/*---------------------------------------------------------------------------*/
/*
 * Write up to len bytes from buf into a pipe.
 *
 * On a blocking socket the call waits while the buffer is full and the
 * pipe is still PIPE_ACTIVE. If the buffer was empty before the write,
 * the peer is notified with MTCP_EPOLLIN. After the write, an
 * MTCP_EPOLLOUT event is queued for this socket when space remains and
 * the socket is registered level-triggered for MTCP_EPOLLOUT.
 *
 * Returns the number of bytes stored (possibly fewer than len); 0 when
 * len <= 0 on a blocking socket; -1 on error with errno set (EBADF for a
 * non-pipe socket or a socket without a pipe, EINVAL for a closed pipe,
 * EPIPE when the pipe is in CLOSE_WAIT, EAGAIN when nothing can be
 * written on a non-blocking socket, or the error number from
 * pthread_cond_wait()).
 */
int
PipeWrite(mctx_t mctx, int pipeid, char *buf, int len)
{
	mtcp_manager_t mtcp;
	socket_map_t socket;
	struct pipe *pp;
	int to_write;
	int to_notify;
	int buffered;
	int ret;

	mtcp = GetMTCPManager(mctx);
	if (!mtcp) {
		return -1;
	}
	socket = GetSocket(mctx, pipeid);
	if (!socket) {
		return -1;
	}
	if (socket->socktype != MTCP_SOCK_PIPE) {
		errno = EBADF;
		return -1;
	}
	pp = socket->pp;
	if (!pp) {
		errno = EBADF;
		return -1;
	}
	if (pp->state == PIPE_CLOSED) {
		errno = EINVAL;
		return -1;
	}
	if (pp->state == PIPE_CLOSE_WAIT) {
		errno = EPIPE;
		return -1;
	}

	if (len <= 0) {
		if (socket->opts & MTCP_NONBLOCK) {
			errno = EAGAIN;
			return -1;
		} else {
			return 0;
		}
	}

	pthread_mutex_lock(&pp->pipe_lock);
	if (!(socket->opts & MTCP_NONBLOCK)) {
		/*
		 * Also stop waiting when the pipe is no longer active;
		 * otherwise a close while the buffer is full would leave
		 * this writer blocked forever.
		 */
		while (pp->buf_len == pp->buf_size && pp->state == PIPE_ACTIVE) {
			ret = pthread_cond_wait(&pp->pipe_cond, &pp->pipe_lock);
			if (ret) {
				pthread_mutex_unlock(&pp->pipe_lock);
				/* pthread functions return the error number */
				errno = ret;
				return -1;
			}
		}
	}

	/* re-check under the lock: the pipe may have been closed meanwhile */
	if (pp->state != PIPE_ACTIVE) {
		int state = pp->state;
		pthread_mutex_unlock(&pp->pipe_lock);
		errno = (state == PIPE_CLOSE_WAIT) ? EPIPE : EINVAL;
		return -1;
	}

	to_write = MIN(len, pp->buf_size - pp->buf_len);
	if (to_write <= 0) {
		pthread_mutex_unlock(&pp->pipe_lock);
		errno = EAGAIN;
		return -1;
	}

	/* if the buffer was empty, notify read event to the pair socket */
	to_notify = (pp->buf_len == 0);

	if (pp->buf_tail + to_write < pp->buf_size) {
		/* if the data fit into the buffer, copy it */
		memcpy(pp->buf + pp->buf_tail, buf, to_write);
		pp->buf_tail += to_write;
	} else {
		/* if the data overflow the buffer, wrap around the buffer */
		int temp_write = pp->buf_size - pp->buf_tail;
		memcpy(pp->buf + pp->buf_tail, buf, temp_write);
		memcpy(pp->buf, buf + temp_write, to_write - temp_write);
		pp->buf_tail = to_write - temp_write;
	}
	pp->buf_len += to_write;

	/* notify to the pair socket for the new buffers */
	if (to_notify) {
		RaiseEventToPair(mtcp, socket, MTCP_EPOLLIN);
	}

	/* snapshot the fill level while the lock is still held */
	buffered = pp->buf_len;
	pthread_mutex_unlock(&pp->pipe_lock);

	/* if level triggered, raise event for remaining buffer */
	if (buffered < pp->buf_size) {
		if ((socket->epoll & MTCP_EPOLLOUT) && !(socket->epoll & MTCP_EPOLLET)) {
			AddEpollEvent(mtcp->ep,
					USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT);
		}
	}

	return to_write;
}
/*----------------------------------------------------------------------------*/
/*
 * Queue the epoll events that are already pending on a pipe socket.
 *
 * mctx:   mTCP context both ids belong to
 * epid:   id of the epoll socket whose event queue receives the events
 * pipeid: id of the pipe socket to inspect
 *
 * If the socket is registered for MTCP_EPOLLIN and the pipe holds data or
 * is in CLOSE_WAIT, an MTCP_EPOLLIN event is queued. If it is registered
 * for MTCP_EPOLLOUT and the buffer has free space, an MTCP_EPOLLOUT event
 * is queued.
 *
 * Returns 0 on success, -1 if either id is rejected by GetSocket(), the
 * socket has no pipe, or the pipe is closed.
 */
int
RaisePendingPipeEvents(mctx_t mctx, int epid, int pipeid)
{
	socket_map_t ep_socket = GetSocket(mctx, epid);
	socket_map_t socket = GetSocket(mctx, pipeid);
	struct mtcp_epoll *ep;
	struct pipe *pp;

	/* GetSocket() returns NULL for an out-of-range id; do not dereference it */
	if (!ep_socket || !socket)
		return -1;

	ep = ep_socket->ep;
	pp = socket->pp;
	if (!pp)
		return -1;
	if (pp->state < PIPE_ACTIVE)
		return -1;

	/* if data (or a close) arrived before epoll registration, */
	/* generate read event */
	if (socket->epoll & MTCP_EPOLLIN) {
		if (pp->buf_len > 0 || pp->state == PIPE_CLOSE_WAIT) {
			AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLIN);
		}
	}

	/* same thing to the write event */
	if (socket->epoll & MTCP_EPOLLOUT) {
		if (pp->buf_len < pp->buf_size) {
			AddEpollEvent(ep, USR_SHADOW_EVENT_QUEUE, socket, MTCP_EPOLLOUT);
		}
	}

	return 0;
}
/*---------------------------------------------------------------------------*/
/*
 * Close one end of a pipe.
 *
 * The first close of an active pipe moves it to PIPE_CLOSE_WAIT and
 * notifies the peer with MTCP_EPOLLIN. A close of a pipe that is no
 * longer active detaches the pipe from both sockets and releases its
 * lock, condition variable and buffer.
 *
 * Returns 0 on success (also when the socket has no pipe or the pipe is
 * already closed), -1 when the manager or socket lookup fails, or when
 * the socket is not a pipe (errno = EINVAL).
 */
int
PipeClose(mctx_t mctx, int pipeid)
{
	mtcp_manager_t mtcp = GetMTCPManager(mctx);
	socket_map_t socket;
	struct pipe *pp;
	int end;

	if (mtcp == NULL)
		return -1;

	socket = GetSocket(mctx, pipeid);
	if (socket == NULL)
		return -1;

	if (socket->socktype != MTCP_SOCK_PIPE) {
		errno = EINVAL;
		return -1;
	}

	pp = socket->pp;
	if (pp == NULL || pp->state == PIPE_CLOSED)
		return 0;

	pthread_mutex_lock(&pp->pipe_lock);

	if (pp->state != PIPE_ACTIVE) {
		/* pipe already in CLOSE_WAIT: tear it down */
		for (end = 0; end < 2; end++) {
			if (pp->socket[end])
				pp->socket[end]->pp = NULL;
		}
		pthread_mutex_unlock(&pp->pipe_lock);

		pthread_mutex_destroy(&pp->pipe_lock);
		pthread_cond_destroy(&pp->pipe_cond);
		free(pp->buf);
		free(pp);
	} else {
		/* first close: tell the peer and keep the pipe for draining */
		pp->state = PIPE_CLOSE_WAIT;
		RaiseEventToPair(mtcp, socket, MTCP_EPOLLIN);
		pthread_mutex_unlock(&pp->pipe_lock);
	}

	return 0;
}
/*---------------------------------------------------------------------------*/

102
mtcp/src/rss.c Normal file
View File

@ -0,0 +1,102 @@
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdlib.h>
#include <unistd.h>
#include "rss.h"
/*-------------------------------------------------------------*/
/*
 * Fill cache[0..cache_len-1] with successive 32-bit windows of the key.
 *
 * cache[0] is the first four key bytes in big-endian order; every
 * following entry is the previous window shifted left by one bit with
 * the next key bit (most significant bit first) shifted in.
 */
static void
BuildKeyCache(uint32_t *cache, int cache_len)
{
#define NBBY 8 /* number of bits per byte */

	/* Keys for system testing */
	static const uint8_t key[] = {
		 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05
	};
	uint32_t window;
	uint32_t bit_pos;
	int slot;

	window = ((uint32_t)key[0] << 24) | ((uint32_t)key[1] << 16) |
		 ((uint32_t)key[2] << 8) | (uint32_t)key[3];

	/* key bits 0..31 are already in the window; continue from bit 32 */
	for (slot = 0, bit_pos = 32; slot < cache_len; slot++, bit_pos++) {
		const uint8_t key_byte = key[bit_pos / NBBY];
		const uint8_t bit_in_byte = (bit_pos % NBBY);

		cache[slot] = window;

		window <<= 1;
		if ((key_byte >> (7 - bit_in_byte)) & 0x1)
			window |= 1;
	}
}
/*-------------------------------------------------------------*/
/*
 * Compute the RSS hash of a 4-tuple.
 *
 * The inputs are consumed most significant bit first in the order sip,
 * dip, sp, dp. For every set input bit the key window at that bit
 * position is XORed into the result. The key windows are built once, on
 * the first call, and kept in a static table.
 */
static uint32_t
GetRSSHash(in_addr_t sip, in_addr_t dip, in_port_t sp, in_port_t dp)
{
#define MSB32 0x80000000
#define MSB16 0x8000
#define KEY_CACHE_LEN 96

	static uint32_t key_cache[KEY_CACHE_LEN] = {0};
	static int first = 1;
	uint32_t hash = 0;
	int bit;

	if (first) {
		BuildKeyCache(key_cache, KEY_CACHE_LEN);
		first = 0;
	}

	/* table layout: [0,32) sip, [32,64) dip, [64,80) sp, [80,96) dp */
	for (bit = 0; bit < 32; bit++) {
		if ((sip >> (31 - bit)) & 0x1)
			hash ^= key_cache[bit];
	}
	for (bit = 0; bit < 32; bit++) {
		if ((dip >> (31 - bit)) & 0x1)
			hash ^= key_cache[32 + bit];
	}
	for (bit = 0; bit < 16; bit++) {
		if ((sp >> (15 - bit)) & 0x1)
			hash ^= key_cache[64 + bit];
	}
	for (bit = 0; bit < 16; bit++) {
		if ((dp >> (15 - bit)) & 0x1)
			hash ^= key_cache[80 + bit];
	}

	return hash;
}
/*-------------------------------------------------------------------*/
/* RSS redirection table is in the little endian byte order (intel) */
/* */
/* idx: 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 | 16 17 18 19 ...*/
/* val: 3 2 1 0 | 7 6 5 4 | 11 10 9 8 | 15 14 13 12 | 19 18 17 16 ...*/
/* qid = val % num_queues */
/*-------------------------------------------------------------------*/
/*
 * Map a 4-tuple to a queue number.
 *
 * The low 7 bits of the RSS hash select a table index; the index is
 * mirrored within its group of four (0<->3, 1<->2) and the result is
 * reduced modulo num_queues.
 */
int
GetRSSCPUCore(in_addr_t sip, in_addr_t dip,
	      in_port_t sp, in_port_t dp, int num_queues)
{
#define RSS_BIT_MASK 0x0000007F

	const uint32_t index = GetRSSHash(sip, dip, sp, dp) & RSS_BIT_MASK;
	/* flipping the two low bits swaps 0<->3 and 1<->2 inside each group */
	const uint32_t entry = index ^ 0x3;

	return (entry % num_queues);
}
/*-------------------------------------------------------------------*/

85
mtcp/src/socket.c Normal file
View File

@ -0,0 +1,85 @@
#include "mtcp.h"
#include "socket.h"
#include "debug.h"
/*---------------------------------------------------------------------------*/
/*
 * Take a socket from the per-core free list and reset it for use.
 *
 * mctx:      context whose cpu selects the mTCP manager
 * socktype:  type stored into the returned socket
 * need_lock: when non-zero, smap_lock is held while the free list is
 *            accessed
 *
 * A free socket that still has events set is moved to the tail of the
 * free list and the next one is tried.
 * NOTE(review): if every free socket has events set, this loop does not
 * terminate by itself - confirm that events are cleared elsewhere.
 *
 * Returns the socket, or NULL when the free list is empty.
 */
socket_map_t
AllocateSocket(mctx_t mctx, int socktype, int need_lock)
{
	mtcp_manager_t mtcp = g_mtcp[mctx->cpu];
	socket_map_t candidate;

	if (need_lock)
		pthread_mutex_lock(&mtcp->ctx->smap_lock);

	for (;;) {
		candidate = TAILQ_FIRST(&mtcp->free_smap);
		if (candidate == NULL) {
			if (need_lock)
				pthread_mutex_unlock(&mtcp->ctx->smap_lock);

			TRACE_ERROR("The concurrent sockets are at maximum.\n");
			return NULL;
		}

		TAILQ_REMOVE(&mtcp->free_smap, candidate, free_smap_link);

		if (!candidate->events)
			break;

		/* events not invalidated yet: requeue at the tail and try the next */
		TRACE_INFO("There are still not invalidate events remaining.\n");
		TRACE_DBG("There are still not invalidate events remaining.\n");
		TAILQ_INSERT_TAIL(&mtcp->free_smap, candidate, free_smap_link);
	}

	if (need_lock)
		pthread_mutex_unlock(&mtcp->ctx->smap_lock);

	candidate->socktype = socktype;
	candidate->opts = 0;
	candidate->stream = NULL;
	candidate->epoll = 0;
	candidate->events = 0;
	memset(&candidate->ep_data, 0, sizeof(mtcp_epoll_data_t));

	return candidate;
}
/*---------------------------------------------------------------------------*/
/*
 * Return a socket to the per-core free list.
 *
 * A socket whose type is already MTCP_SOCK_UNUSED is left untouched.
 * Otherwise the socket is marked unused, its epoll interest and stream
 * pointer are cleared, and it is appended to the free list. When
 * need_lock is non-zero, smap_lock is held around the list update.
 */
void
FreeSocket(mctx_t mctx, int sockid, int need_lock)
{
	mtcp_manager_t mtcp = g_mtcp[mctx->cpu];
	socket_map_t released = &mtcp->smap[sockid];

	if (released->socktype == MTCP_SOCK_UNUSED)
		return;

	released->socktype = MTCP_SOCK_UNUSED;
	released->epoll = MTCP_EPOLLNONE;

	if (need_lock)
		pthread_mutex_lock(&mtcp->ctx->smap_lock);

	/* insert into free stream map */
	released->stream = NULL;
	TAILQ_INSERT_TAIL(&mtcp->free_smap, released, free_smap_link);

	if (need_lock)
		pthread_mutex_unlock(&mtcp->ctx->smap_lock);
}
/*---------------------------------------------------------------------------*/
/*
 * Look up a socket by id in the socket map of the context's core.
 * Returns NULL with errno = EBADF when sockid is negative or not below
 * CONFIG.max_concurrency. The socket's type is not checked.
 */
socket_map_t
GetSocket(mctx_t mctx, int sockid)
{
	if (sockid >= 0 && sockid < CONFIG.max_concurrency)
		return &g_mtcp[mctx->cpu]->smap[sockid];

	errno = EBADF;
	return NULL;
}

1270
mtcp/src/tcp_in.c Normal file

File diff suppressed because it is too large Load Diff

917
mtcp/src/tcp_out.c Normal file
View File

@ -0,0 +1,917 @@
#include <unistd.h>
#include "tcp_out.h"
#include "mtcp.h"
#include "ip_out.h"
#include "tcp_in.h"
#include "tcp_stream.h"
#include "eventpoll.h"
#include "timer.h"
#include "debug.h"
/* when TRUE, outgoing segments get their TCP checksum computed in software */
#define TCP_CALCULATE_CHECKSUM TRUE
/* NOTE(review): the next two switches are not referenced in the part of this file reviewed here - confirm their use */
#define ACK_PIGGYBACK TRUE
#define TRY_SEND_BEFORE_QUEUE FALSE
/* upper bound applied to the value placed in the TCP window field */
#define TCP_MAX_WINDOW 65535
/* NOTE: both macros evaluate an argument twice; pass side-effect-free expressions */
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
/*----------------------------------------------------------------------------*/
/*
 * Compute the number of TCP option bytes a segment with the given flags
 * carries, depending on which options are compiled in.
 *
 * SYN segments: MSS, window scale (plus one padding byte), and - when
 * enabled - SACK-permitted and timestamp; two padding bytes are added
 * when exactly one of SACK and timestamp is enabled.
 * Other segments: timestamp plus two padding bytes when enabled, and
 * SACK plus two padding bytes when enabled and TCP_FLAG_SACK is set.
 *
 * The result is asserted to be a multiple of 4.
 */
static inline uint16_t
CalculateOptionLength(uint8_t flags)
{
	uint16_t total = 0;

	if (!(flags & TCP_FLAG_SYN)) {
#if TCP_OPT_TIMESTAMP_ENABLED
		total += TCP_OPT_TIMESTAMP_LEN + 2;
#endif
#if TCP_OPT_SACK_ENABLED
		if (flags & TCP_FLAG_SACK) {
			total += TCP_OPT_SACK_LEN + 2;
		}
#endif
	} else {
		total += TCP_OPT_MSS_LEN;
#if TCP_OPT_SACK_ENABLED
		total += TCP_OPT_SACK_PERMIT_LEN;
#endif
#if TCP_OPT_TIMESTAMP_ENABLED
		total += TCP_OPT_TIMESTAMP_LEN;
#endif
#if (TCP_OPT_SACK_ENABLED && !TCP_OPT_TIMESTAMP_ENABLED) || \
	(TCP_OPT_TIMESTAMP_ENABLED && !TCP_OPT_SACK_ENABLED)
		total += 2;	/* NOP padding when only one of the two is present */
#endif
		total += TCP_OPT_WSCALE_LEN + 1;
	}

	assert(total % 4 == 0);

	return total;
}
/*----------------------------------------------------------------------------*/
/*
 * Write a TCP timestamp option at tcpopt.
 *
 * Layout: kind, length, then cur_ts and the stream's ts_recent, both in
 * network byte order. TCP_OPT_TIMESTAMP_LEN bytes are expected to be
 * available at tcpopt (2 + 4 + 4 bytes are written).
 *
 * The two 32-bit values are stored with memcpy() because tcpopt is a
 * byte pointer with no alignment guarantee.
 */
static inline void
GenerateTCPTimestamp(tcp_stream *cur_stream, uint8_t *tcpopt, uint32_t cur_ts)
{
	const uint32_t ts_val = htonl(cur_ts);
	const uint32_t ts_ecr = htonl(cur_stream->rcvvar->ts_recent);

	tcpopt[0] = TCP_OPT_TIMESTAMP;
	tcpopt[1] = TCP_OPT_TIMESTAMP_LEN;
	memcpy(tcpopt + 2, &ts_val, sizeof(ts_val));
	memcpy(tcpopt + 2 + sizeof(ts_val), &ts_ecr, sizeof(ts_ecr));
}
/*----------------------------------------------------------------------------*/
/*
 * Write the TCP options for a segment of cur_stream into tcpopt.
 *
 * cur_ts: timestamp value placed in the timestamp option
 * flags:  TCP_FLAG_* bits of the segment; selects the SYN or non-SYN layout
 * tcpopt: start of the option area
 * optlen: expected option length; the number of bytes written is
 *         asserted to equal it
 *
 * SYN segments carry MSS, optional SACK-permitted and timestamp (padded
 * with NOPs), and window scale. Other segments carry the timestamp
 * option preceded by two NOPs when timestamps are enabled. SACK blocks
 * are not generated yet.
 */
static inline void
GenerateTCPOptions(tcp_stream *cur_stream, uint32_t cur_ts,
		uint8_t flags, uint8_t *tcpopt, uint16_t optlen)
{
	int i = 0;

	if (flags & TCP_FLAG_SYN) {
		uint16_t mss;

		/* MSS option */
		mss = cur_stream->sndvar->mss;
		tcpopt[i++] = TCP_OPT_MSS;
		tcpopt[i++] = TCP_OPT_MSS_LEN;
		tcpopt[i++] = mss >> 8;
		tcpopt[i++] = mss % 256;

		/* SACK permit */
#if TCP_OPT_SACK_ENABLED
#if !TCP_OPT_TIMESTAMP_ENABLED
		tcpopt[i++] = TCP_OPT_NOP;
		tcpopt[i++] = TCP_OPT_NOP;
#endif /* TCP_OPT_TIMESTAMP_ENABLED */
		tcpopt[i++] = TCP_OPT_SACK_PERMIT;
		tcpopt[i++] = TCP_OPT_SACK_PERMIT_LEN;
		TRACE_SACK("Local SACK permited.\n");
#endif /* TCP_OPT_SACK_ENABLED */

		/* Timestamp */
#if TCP_OPT_TIMESTAMP_ENABLED
#if !TCP_OPT_SACK_ENABLED
		tcpopt[i++] = TCP_OPT_NOP;
		tcpopt[i++] = TCP_OPT_NOP;
#endif /* TCP_OPT_SACK_ENABLED */
		GenerateTCPTimestamp(cur_stream, tcpopt + i, cur_ts);
		i += TCP_OPT_TIMESTAMP_LEN;
#endif /* TCP_OPT_TIMESTAMP_ENABLED */

		/* Window scale */
		tcpopt[i++] = TCP_OPT_NOP;
		tcpopt[i++] = TCP_OPT_WSCALE;
		tcpopt[i++] = TCP_OPT_WSCALE_LEN;
		tcpopt[i++] = cur_stream->sndvar->wscale;

	} else {

#if TCP_OPT_TIMESTAMP_ENABLED
		tcpopt[i++] = TCP_OPT_NOP;
		tcpopt[i++] = TCP_OPT_NOP;
		GenerateTCPTimestamp(cur_stream, tcpopt + i, cur_ts);
		i += TCP_OPT_TIMESTAMP_LEN;
#endif

#if TCP_OPT_SACK_ENABLED
		/*
		 * Test the segment flag (as CalculateOptionLength() does),
		 * not the TCP_OPT_SACK option kind.
		 */
		if (flags & TCP_FLAG_SACK) {
			// TODO: implement SACK support
		}
#endif
	}

	assert (i == optlen);
}
/*----------------------------------------------------------------------------*/
/*
 * Build and emit a TCP segment that does not belong to any tcp_stream.
 *
 * saddr/daddr, sport/dport: stored into the headers as given
 * seq, ack_seq:             host byte order; ack_seq is used only with
 *                           TCP_FLAG_ACK
 * window:                   value for the window field (host byte order)
 * flags:                    TCP_FLAG_* bits selecting SYN/FIN/RST/PSH/ACK
 * payload, payloadlen:      optional payload copied after the options
 * cur_ts, echo_ts:          values for the timestamp option
 *
 * A "NOP NOP timestamp" option is written at the start of the option
 * area whenever the option length computed for the flags has room for
 * it; any remaining option bytes stay zero.
 *
 * Returns payloadlen (plus one when SYN or FIN is set), or ERROR when the
 * payload is too large or no packet buffer could be obtained.
 */
int
SendTCPPacketStandalone(struct mtcp_manager *mtcp,
		uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport,
		uint32_t seq, uint32_t ack_seq, uint16_t window, uint8_t flags,
		uint8_t *payload, uint16_t payloadlen,
		uint32_t cur_ts, uint32_t echo_ts)
{
	struct tcphdr *tcph;
	uint8_t *tcpopt;
	uint16_t optlen;

	optlen = CalculateOptionLength(flags);
	if (payloadlen > TCP_DEFAULT_MSS + optlen) {
		TRACE_ERROR("Payload size exceeds MSS.\n");
		assert(0);
		return ERROR;
	}

	tcph = (struct tcphdr *)IPOutputStandalone(mtcp, 0,
			saddr, daddr, TCP_HEADER_LEN + optlen + payloadlen);
	if (tcph == NULL) {
		return ERROR;
	}
	memset(tcph, 0, TCP_HEADER_LEN + optlen);

	tcph->source = sport;
	tcph->dest = dport;

	if (flags & TCP_FLAG_SYN)
		tcph->syn = TRUE;
	if (flags & TCP_FLAG_FIN)
		tcph->fin = TRUE;
	if (flags & TCP_FLAG_RST)
		tcph->rst = TRUE;
	if (flags & TCP_FLAG_PSH)
		tcph->psh = TRUE;

	tcph->seq = htonl(seq);
	if (flags & TCP_FLAG_ACK) {
		tcph->ack = TRUE;
		tcph->ack_seq = htonl(ack_seq);
	}

	tcph->window = htons(MIN(window, TCP_MAX_WINDOW));

	tcpopt = (uint8_t *)tcph + TCP_HEADER_LEN;
	/*
	 * Only emit the timestamp when the option area is large enough.
	 * Writing the 12 bytes unconditionally would run past the space
	 * requested from IPOutputStandalone() when optlen is smaller.
	 */
	if (optlen >= TCP_OPT_TIMESTAMP_LEN + 2) {
		const uint32_t ts_val = htonl(cur_ts);
		const uint32_t ts_ecr = htonl(echo_ts);

		tcpopt[0] = TCP_OPT_NOP;
		tcpopt[1] = TCP_OPT_NOP;
		tcpopt[2] = TCP_OPT_TIMESTAMP;
		tcpopt[3] = TCP_OPT_TIMESTAMP_LEN;
		/* memcpy: the option area has no alignment guarantee */
		memcpy(tcpopt + 4, &ts_val, sizeof(ts_val));
		memcpy(tcpopt + 8, &ts_ecr, sizeof(ts_ecr));
	}

	tcph->doff = (TCP_HEADER_LEN + optlen) >> 2;
	// copy payload if exist
	if (payloadlen > 0) {
		memcpy((uint8_t *)tcph + TCP_HEADER_LEN + optlen, payload, payloadlen);
	}

#if TCP_CALCULATE_CHECKSUM
	tcph->check = TCPCalcChecksum((uint16_t *)tcph,
			TCP_HEADER_LEN + optlen + payloadlen, saddr, daddr);
#endif

	/* SYN and FIN each occupy one sequence number */
	if (tcph->syn || tcph->fin) {
		payloadlen++;
	}

	return payloadlen;
}
/*----------------------------------------------------------------------------*/
/*
 * Build and emit one TCP segment for cur_stream.
 *
 * cur_ts:              current timestamp, used for the timestamp option,
 *                      ACK bookkeeping and the retransmission timer
 * flags:               TCP_FLAG_* bits of the segment
 * payload, payloadlen: optional payload copied after the options
 *
 * The sequence number is snd_nxt - 1 for TCP_FLAG_WACK, the stream's fss
 * for TCP_FLAG_FIN, and snd_nxt otherwise. The advertised window is
 * rcv_wnd shifted by the stream's window scale (unshifted on SYN) and
 * clamped to TCP_MAX_WINDOW; a zero window sets need_wnd_adv.
 * snd_nxt advances by the payload length plus one for SYN/FIN, and the
 * retransmission timer is (re)armed whenever that amount is non-zero.
 *
 * Returns the number of sequence numbers consumed, ERROR when the payload
 * exceeds the MSS check, or -2 when IPOutput() could not provide a packet
 * buffer.
 */
int
SendTCPPacket(struct mtcp_manager *mtcp, tcp_stream *cur_stream,
		uint32_t cur_ts, uint8_t flags, uint8_t *payload, uint16_t payloadlen)
{
	struct tcphdr *tcph;
	uint16_t optlen;
	uint8_t wscale = 0;
	uint32_t window32 = 0;

	optlen = CalculateOptionLength(flags);
	if (payloadlen > cur_stream->sndvar->mss + optlen) {
		TRACE_ERROR("Payload size exceeds MSS\n");
		return ERROR;
	}

	tcph = (struct tcphdr *)IPOutput(mtcp, cur_stream,
			TCP_HEADER_LEN + optlen + payloadlen);
	if (tcph == NULL) {
		return -2;
	}
	memset(tcph, 0, TCP_HEADER_LEN + optlen);

	tcph->source = cur_stream->sport;
	tcph->dest = cur_stream->dport;

	if (flags & TCP_FLAG_SYN) {
		tcph->syn = TRUE;
		if (cur_stream->snd_nxt != cur_stream->sndvar->iss) {
			TRACE_DBG("Stream %d: weird SYN sequence. "
					"snd_nxt: %u, iss: %u\n", cur_stream->id,
					cur_stream->snd_nxt, cur_stream->sndvar->iss);
		}
#if 0
		TRACE_FIN("Stream %d: Sending SYN. seq: %u, ack_seq: %u\n",
				cur_stream->id, cur_stream->snd_nxt, cur_stream->rcv_nxt);
#endif
	}
	if (flags & TCP_FLAG_RST) {
		TRACE_FIN("Stream %d: Sending RST.\n", cur_stream->id);
		tcph->rst = TRUE;
	}
	if (flags & TCP_FLAG_PSH)
		tcph->psh = TRUE;

	if (flags & TCP_FLAG_WACK) {
		tcph->seq = htonl(cur_stream->snd_nxt - 1);
		TRACE_CLWND("%u Sending ACK to get new window advertisement. "
				"seq: %u, peer_wnd: %u, snd_nxt - snd_una: %u\n",
				cur_stream->id,
				cur_stream->snd_nxt - 1, cur_stream->sndvar->peer_wnd,
				cur_stream->snd_nxt - cur_stream->sndvar->snd_una);
	} else if (flags & TCP_FLAG_FIN) {
		tcph->fin = TRUE;

		if (cur_stream->sndvar->fss == 0) {
			TRACE_ERROR("Stream %u: not fss set. closed: %u\n",
					cur_stream->id, cur_stream->closed);
		}
		tcph->seq = htonl(cur_stream->sndvar->fss);
		cur_stream->sndvar->is_fin_sent = TRUE;
		TRACE_FIN("Stream %d: Sending FIN. seq: %u, ack_seq: %u\n",
				cur_stream->id, cur_stream->snd_nxt, cur_stream->rcv_nxt);
	} else {
		tcph->seq = htonl(cur_stream->snd_nxt);
	}

	if (flags & TCP_FLAG_ACK) {
		tcph->ack = TRUE;
		tcph->ack_seq = htonl(cur_stream->rcv_nxt);
		cur_stream->sndvar->ts_lastack_sent = cur_ts;
		cur_stream->last_active_ts = cur_ts;
		UpdateTimeoutList(mtcp, cur_stream);
	}

	/* the window of a SYN segment is not scaled */
	if (flags & TCP_FLAG_SYN) {
		wscale = 0;
	} else {
		wscale = cur_stream->sndvar->wscale;
	}

	window32 = cur_stream->rcvvar->rcv_wnd >> wscale;
	/*
	 * Clamp in 32 bits first, then narrow. Narrowing before the clamp
	 * would wrap any value above 65535 (e.g. 65536 would become 0).
	 */
	tcph->window = htons((uint16_t)MIN(window32, TCP_MAX_WINDOW));
	/* if the advertised window is 0, we need to advertise again later */
	if (window32 == 0) {
		cur_stream->need_wnd_adv = TRUE;
	}

	GenerateTCPOptions(cur_stream, cur_ts, flags,
			(uint8_t *)tcph + TCP_HEADER_LEN, optlen);

	tcph->doff = (TCP_HEADER_LEN + optlen) >> 2;
	// copy payload if exist
	if (payloadlen > 0) {
		memcpy((uint8_t *)tcph + TCP_HEADER_LEN + optlen, payload, payloadlen);
	}

#if TCP_CALCULATE_CHECKSUM
	tcph->check = TCPCalcChecksum((uint16_t *)tcph,
			TCP_HEADER_LEN + optlen + payloadlen,
			cur_stream->saddr, cur_stream->daddr);
#endif

	cur_stream->snd_nxt += payloadlen;

	/* SYN and FIN each occupy one sequence number */
	if (tcph->syn || tcph->fin) {
		cur_stream->snd_nxt++;
		payloadlen++;
	}

	if (payloadlen > 0) {
		if (cur_stream->state > TCP_ST_ESTABLISHED) {
			TRACE_FIN("Payload after ESTABLISHED: length: %d, snd_nxt: %u\n",
					payloadlen, cur_stream->snd_nxt);
		}

		/* update retransmission timer if have payload */
		cur_stream->sndvar->ts_rto = cur_ts + cur_stream->sndvar->rto;
		TRACE_RTO("Updating retransmission timer. "
				"cur_ts: %u, rto: %u, ts_rto: %u\n",
				cur_ts, cur_stream->sndvar->rto, cur_stream->sndvar->ts_rto);
		AddtoRTOList(mtcp, cur_stream);
	}

	return payloadlen;
}
/*----------------------------------------------------------------------------*/
/*
 * FlushTCPSendingBuffer - transmit unsent bytes held in the stream's send
 * buffer.
 *
 * Starting at cur_stream->snd_nxt, the loop hands SendTCPPacket() one segment
 * at a time, each carrying at most (mss - option length of a plain ACK)
 * bytes, until nothing is left to send or the next segment would not fit
 * into MIN(cwnd, peer_wnd).
 *
 * Returns the number of packets sent; 0 when there is no send buffer or it
 * is empty; -3 when the next segment exceeds the window; or the negative
 * value returned by SendTCPPacket().
 */
static int
FlushTCPSendingBuffer(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts)
{
struct tcp_send_vars *sndvar = cur_stream->sndvar;
/* largest payload of one segment: MSS minus the option bytes of an ACK */
const uint32_t maxlen = sndvar->mss - CalculateOptionLength(TCP_FLAG_ACK);
uint8_t *data;
uint32_t buffered_len;
uint32_t seq;
uint16_t len;
int16_t sndlen;
uint32_t window;
int packets = 0;
if (!sndvar->sndbuf) {
TRACE_ERROR("Stream %d: No send buffer available.\n", cur_stream->id);
assert(0);
return 0;
}
if (sndvar->sndbuf->len == 0) {
return 0;
}
/* the window is sampled once, before the loop */
window = MIN(sndvar->cwnd, sndvar->peer_wnd);
while (1) {
seq = cur_stream->snd_nxt;
/* snd_nxt must not point before the first byte kept in the buffer */
if (TCP_SEQ_LT(seq, sndvar->sndbuf->head_seq)) {
TRACE_ERROR("Stream %d: Invalid sequence to send. "
"state: %s, seq: %u, head_seq: %u.\n",
cur_stream->id, TCPStateToString(cur_stream),
seq, sndvar->sndbuf->head_seq);
assert(0);
break;
}
/* bytes in the buffer located at or after snd_nxt */
buffered_len = sndvar->sndbuf->head_seq + sndvar->sndbuf->len - seq;
if (cur_stream->state > TCP_ST_ESTABLISHED) {
TRACE_FIN("head_seq: %u, len: %u, seq: %u, "
"buffered_len: %u\n", sndvar->sndbuf->head_seq,
sndvar->sndbuf->len, seq, buffered_len);
}
if (buffered_len == 0)
break;
/* address of the byte that corresponds to seq */
data = sndvar->sndbuf->head +
(seq - sndvar->sndbuf->head_seq);
if (buffered_len > maxlen) {
len = maxlen;
} else {
len = buffered_len;
}
/* len is unsigned, so this only catches len == 0 */
if (len <= 0)
break;
if (cur_stream->state > TCP_ST_ESTABLISHED) {
TRACE_FIN("Flushing after ESTABLISHED: seq: %u, len: %u, "
"buffered_len: %u\n", seq, len, buffered_len);
}
/* bytes in flight plus this segment must fit into the window */
if (seq - sndvar->snd_una + len > window) {
/* Ask for new window advertisement to peer */
if (seq - sndvar->snd_una + len > sndvar->peer_wnd) {
#if 0
TRACE_CLWND("Full peer window. "
"peer_wnd: %u, (snd_nxt-snd_una): %u\n",
sndvar->peer_wnd, seq - sndvar->snd_una);
#endif
/* only probe when the last ACK was sent more than 500 ms ago */
if (TS_TO_MSEC(cur_ts - sndvar->ts_lastack_sent) > 500) {
EnqueueACK(mtcp, cur_stream, cur_ts, ACK_OPT_WACK);
}
}
/* returned even if earlier iterations already sent packets */
return -3;
}
sndlen = SendTCPPacket(mtcp, cur_stream, cur_ts,
TCP_FLAG_ACK, data, len);
if (sndlen < 0) {
return sndlen;
}
packets++;
}
return packets;
}
/*----------------------------------------------------------------------------*/
/*
 * SendControlPacket - emit the control segment that fits the stream state.
 *
 *   SYN_SENT                           -> SYN
 *   SYN_RCVD                           -> SYN/ACK (snd_nxt is reset to iss)
 *   ESTABLISHED, CLOSE_WAIT,
 *   FIN_WAIT_2, TIME_WAIT              -> ACK
 *   LAST_ACK, FIN_WAIT_1               -> FIN/ACK, unless data or acks are
 *                                         still queued
 *   CLOSING                            -> FIN/ACK, or a bare ACK once the FIN
 *                                         was sent and snd_nxt is past fss
 *   CLOSED                             -> RST, then the stream is destroyed
 *
 * Returns the value of SendTCPPacket(), -1 when the packet has to wait for
 * the send/ack lists to drain, and 0 for any other state.
 */
static inline int
SendControlPacket(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts)
{
	struct tcp_send_vars *sndvar = cur_stream->sndvar;
	int ret;

	switch (cur_stream->state) {
	case TCP_ST_SYN_SENT:
		return SendTCPPacket(mtcp, cur_stream, cur_ts,
				TCP_FLAG_SYN, NULL, 0);

	case TCP_ST_SYN_RCVD:
		/* the SYN/ACK always carries the initial sequence number */
		cur_stream->snd_nxt = sndvar->iss;
		return SendTCPPacket(mtcp, cur_stream, cur_ts,
				TCP_FLAG_SYN | TCP_FLAG_ACK, NULL, 0);

	case TCP_ST_ESTABLISHED:
	case TCP_ST_CLOSE_WAIT:
	case TCP_ST_FIN_WAIT_2:
	case TCP_ST_TIME_WAIT:
		return SendTCPPacket(mtcp, cur_stream, cur_ts,
				TCP_FLAG_ACK, NULL, 0);

	case TCP_ST_LAST_ACK:
	case TCP_ST_FIN_WAIT_1:
		/* queued data and acks go out before the FIN */
		if (sndvar->on_send_list || sndvar->on_ack_list)
			return -1;
		return SendTCPPacket(mtcp, cur_stream, cur_ts,
				TCP_FLAG_FIN | TCP_FLAG_ACK, NULL, 0);

	case TCP_ST_CLOSING:
		/* FIN already sent and snd_nxt is not the FIN sequence: ACK only */
		if (sndvar->is_fin_sent && cur_stream->snd_nxt != sndvar->fss)
			return SendTCPPacket(mtcp, cur_stream, cur_ts,
					TCP_FLAG_ACK, NULL, 0);
		return SendTCPPacket(mtcp, cur_stream, cur_ts,
				TCP_FLAG_FIN | TCP_FLAG_ACK, NULL, 0);

	case TCP_ST_CLOSED:
		TRACE_DBG("Stream %d: Try sending RST (TCP_ST_CLOSED)\n",
				cur_stream->id);
		/* first flush the data and ack */
		if (sndvar->on_send_list || sndvar->on_ack_list)
			return -1;
		ret = SendTCPPacket(mtcp, cur_stream, cur_ts,
				TCP_FLAG_RST, NULL, 0);
		if (ret >= 0)
			DestroyTCPStream(mtcp, cur_stream);
		return ret;

	default:
		/* no control packet is defined for the remaining states */
		return 0;
	}
}
/*----------------------------------------------------------------------------*/
/*
 * WriteTCPControlList - send control packets for streams queued on
 * sender->control_list.
 *
 * At most MIN(thresh, control_list_cnt) streams are handled. Every visited
 * stream is unlinked from the list; if SendControlPacket() fails, the stream
 * is put back at the head of the list and the walk stops.
 *
 * Returns the loop counter. It is incremented before the threshold test, so
 * it can be one larger than the number of streams actually processed.
 */
inline int
WriteTCPControlList(mtcp_manager_t mtcp,
struct mtcp_sender *sender, uint32_t cur_ts, int thresh)
{
tcp_stream *cur_stream;
tcp_stream *next, *last;
int cnt = 0;
int ret;
thresh = MIN(thresh, sender->control_list_cnt);
/* Send TCP control messages */
cnt = 0;
cur_stream = TAILQ_FIRST(&sender->control_list);
/* remember the tail at entry; the walk ends after that element */
last = TAILQ_LAST(&sender->control_list, control_head);
while (cur_stream) {
if (++cnt > thresh)
break;
TRACE_LOOP("Inside control loop. cnt: %u, stream: %d\n",
cnt, cur_stream->id);
/* fetch the successor before the current element is unlinked */
next = TAILQ_NEXT(cur_stream, sndvar->control_link);
TAILQ_REMOVE(&sender->control_list, cur_stream, sndvar->control_link);
sender->control_list_cnt--;
if (cur_stream->sndvar->on_control_list) {
cur_stream->sndvar->on_control_list = FALSE;
//TRACE_DBG("Stream %u: Sending control packet\n", cur_stream->id);
ret = SendControlPacket(mtcp, cur_stream, cur_ts);
if (ret < 0) {
/* undo the removal so the stream is retried first next time */
TAILQ_INSERT_HEAD(&sender->control_list,
cur_stream, sndvar->control_link);
cur_stream->sndvar->on_control_list = TRUE;
sender->control_list_cnt++;
/* since there is no available write buffer, break */
break;
}
} else {
TRACE_ERROR("Stream %d: not on control list.\n", cur_stream->id);
}
/* only the pointer value is compared here; the stream is not dereferenced */
if (cur_stream == last)
break;
cur_stream = next;
}
return cnt;
}
/*----------------------------------------------------------------------------*/
/*
 * WriteTCPDataList - flush the send buffers of streams queued on
 * sender->send_list.
 *
 * Each visited stream is unlinked from the list. Data is flushed in states
 * ESTABLISHED (unless a control packet is still pending), CLOSE_WAIT,
 * FIN_WAIT_1 and LAST_ACK. When the flush reports a negative value the
 * stream is appended to the tail of the list again and the walk stops;
 * otherwise the stream leaves the list and send_list_cnt is decremented.
 *
 * Returns the loop counter. It is incremented before the threshold test, so
 * it can be one larger than the number of streams actually processed.
 */
inline int
WriteTCPDataList(mtcp_manager_t mtcp,
struct mtcp_sender *sender, uint32_t cur_ts, int thresh)
{
tcp_stream *cur_stream;
tcp_stream *next, *last;
int cnt = 0;
int ret;
/* Send data */
cnt = 0;
cur_stream = TAILQ_FIRST(&sender->send_list);
/* remember the tail at entry; the walk ends after that element */
last = TAILQ_LAST(&sender->send_list, send_head);
while (cur_stream) {
if (++cnt > thresh)
break;
TRACE_LOOP("Inside send loop. cnt: %u, stream: %d\n",
cnt, cur_stream->id);
/* fetch the successor before the current element is unlinked */
next = TAILQ_NEXT(cur_stream, sndvar->send_link);
TAILQ_REMOVE(&sender->send_list, cur_stream, sndvar->send_link);
if (cur_stream->sndvar->on_send_list) {
ret = 0;
/* Send data here */
/* Only can send data when ESTABLISHED or CLOSE_WAIT */
if (cur_stream->state == TCP_ST_ESTABLISHED) {
if (cur_stream->sndvar->on_control_list) {
/* delay sending data after until on_control_list becomes off */
//TRACE_DBG("Stream %u: delay sending data.\n", cur_stream->id);
ret = -1;
} else {
ret = FlushTCPSendingBuffer(mtcp, cur_stream, cur_ts);
}
} else if (cur_stream->state == TCP_ST_CLOSE_WAIT ||
cur_stream->state == TCP_ST_FIN_WAIT_1 ||
cur_stream->state == TCP_ST_LAST_ACK) {
ret = FlushTCPSendingBuffer(mtcp, cur_stream, cur_ts);
} else {
/* ret stays 0 here, so the stream is dropped from the list below */
TRACE_DBG("Stream %d: on_send_list at state %s\n",
cur_stream->id, TCPStateToString(cur_stream));
/* NOTE(review): this site tests DUMP_STREAM with #if, the one below with #ifdef */
#if DUMP_STREAM
DumpStream(mtcp, cur_stream);
#endif
}
if (ret < 0) {
/* on_send_list and send_list_cnt are untouched: the stream stays queued */
TAILQ_INSERT_TAIL(&sender->send_list, cur_stream, sndvar->send_link);
/* since there is no available write buffer, break */
break;
} else {
cur_stream->sndvar->on_send_list = FALSE;
sender->send_list_cnt--;
/* the ret value is the number of packets sent. */
/* decrease ack_cnt for the piggybacked acks */
#if ACK_PIGGYBACK
if (cur_stream->sndvar->ack_cnt > 0) {
if (cur_stream->sndvar->ack_cnt > ret) {
cur_stream->sndvar->ack_cnt -= ret;
} else {
cur_stream->sndvar->ack_cnt = 0;
}
}
#endif
#if 1
/* a control packet was waiting for this flush; queue it once no ack is pending */
if (cur_stream->control_list_waiting) {
if (!cur_stream->sndvar->on_ack_list) {
cur_stream->control_list_waiting = FALSE;
AddtoControlList(mtcp, cur_stream, cur_ts);
}
}
#endif
}
} else {
/* the stream has already been unlinked above; send_list_cnt is not adjusted */
TRACE_ERROR("Stream %d: not on send list.\n", cur_stream->id);
#ifdef DUMP_STREAM
DumpStream(mtcp, cur_stream);
#endif
}
if (cur_stream == last)
break;
cur_stream = next;
}
return cnt;
}
/*----------------------------------------------------------------------------*/
/*
 * WriteTCPACKList - send the acknowledgements queued on sender->ack_list.
 *
 * For each visited stream the queued ack_cnt ACKs are sent, followed by a
 * window-probe ACK when is_wack is set. The stream is unlinked once both
 * ack_cnt and is_wack are clear. Streams that are in a state, or have a
 * receive buffer position, for which no ACK is due are unlinked with their
 * ack bookkeeping reset.
 *
 * Returns the loop counter. It is incremented before the threshold test, so
 * it can be one larger than the number of streams actually processed.
 */
inline int
WriteTCPACKList(mtcp_manager_t mtcp,
struct mtcp_sender *sender, uint32_t cur_ts, int thresh)
{
tcp_stream *cur_stream;
tcp_stream *next, *last;
int to_ack;
int cnt = 0;
int ret;
/* Send aggregated acks */
cnt = 0;
cur_stream = TAILQ_FIRST(&sender->ack_list);
/* remember the tail at entry; the walk ends after that element */
last = TAILQ_LAST(&sender->ack_list, ack_head);
while (cur_stream) {
if (++cnt > thresh)
break;
TRACE_LOOP("Inside ack loop. cnt: %u\n", cnt);
/* fetch the successor first; the current element may be unlinked below */
next = TAILQ_NEXT(cur_stream, sndvar->ack_link);
if (cur_stream->sndvar->on_ack_list) {
/* this list is only to ack the data packets */
/* if the ack is not data ack, then it will not process here */
to_ack = FALSE;
if (cur_stream->state == TCP_ST_ESTABLISHED ||
cur_stream->state == TCP_ST_CLOSE_WAIT ||
cur_stream->state == TCP_ST_FIN_WAIT_1 ||
cur_stream->state == TCP_ST_FIN_WAIT_2 ||
cur_stream->state == TCP_ST_TIME_WAIT) {
/* TIMEWAIT is possible since the ack is queued
at FIN_WAIT_2 */
/* ack only when rcv_nxt does not exceed head_seq + merged_len */
if (cur_stream->rcvvar->rcvbuf) {
if (TCP_SEQ_LEQ(cur_stream->rcv_nxt,
cur_stream->rcvvar->rcvbuf->head_seq +
cur_stream->rcvvar->rcvbuf->merged_len)) {
to_ack = TRUE;
}
}
} else {
TRACE_DBG("Stream %u (%s): "
"Try sending ack at not proper state. "
"seq: %u, ack_seq: %u, on_control_list: %u\n",
cur_stream->id, TCPStateToString(cur_stream),
cur_stream->snd_nxt, cur_stream->rcv_nxt,
cur_stream->sndvar->on_control_list);
#ifdef DUMP_STREAM
DumpStream(mtcp, cur_stream);
#endif
}
if (to_ack) {
/* send the queued ack packets */
while (cur_stream->sndvar->ack_cnt > 0) {
ret = SendTCPPacket(mtcp, cur_stream,
cur_ts, TCP_FLAG_ACK, NULL, 0);
if (ret < 0) {
/* since there is no available write buffer, break */
break;
}
cur_stream->sndvar->ack_cnt--;
}
/* if is_wack is set, send packet to get window advertisement */
if (cur_stream->sndvar->is_wack) {
cur_stream->sndvar->is_wack = FALSE;
ret = SendTCPPacket(mtcp, cur_stream,
cur_ts, TCP_FLAG_ACK | TCP_FLAG_WACK, NULL, 0);
if (ret < 0) {
/* since there is no available write buffer, break */
/* restore the flag so the probe is retried on a later pass */
cur_stream->sndvar->is_wack = TRUE;
}
}
/* leave the list only when nothing is pending any more */
if (!(cur_stream->sndvar->ack_cnt || cur_stream->sndvar->is_wack)) {
cur_stream->sndvar->on_ack_list = FALSE;
TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link);
sender->ack_list_cnt--;
}
} else {
/* nothing to acknowledge: discard the pending ack state */
cur_stream->sndvar->on_ack_list = FALSE;
cur_stream->sndvar->ack_cnt = 0;
cur_stream->sndvar->is_wack = 0;
TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link);
sender->ack_list_cnt--;
}
/* a control packet was waiting for the acks; queue it once no data is pending */
if (cur_stream->control_list_waiting) {
if (!cur_stream->sndvar->on_send_list) {
cur_stream->control_list_waiting = FALSE;
AddtoControlList(mtcp, cur_stream, cur_ts);
}
}
} else {
/* flag and list disagree: unlink the stream and fix the counter */
TRACE_ERROR("Stream %d: not on ack list.\n", cur_stream->id);
TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link);
sender->ack_list_cnt--;
#ifdef DUMP_STREAM
thread_printf(mtcp, mtcp->log_fp,
"Stream %u: not on ack list.\n", cur_stream->id);
DumpStream(mtcp, cur_stream);
#endif
}
if (cur_stream == last)
break;
cur_stream = next;
}
return cnt;
}
/*----------------------------------------------------------------------------*/
/*
 * GetSender - pick the sender that serves the stream's output interface.
 *
 * A negative nif_out selects mtcp->g_sender; an index below
 * CONFIG.eths_num selects the matching entry of mtcp->n_sender[].
 * Any other index is reported as an error and yields NULL.
 */
inline struct mtcp_sender *
GetSender(mtcp_manager_t mtcp, tcp_stream *cur_stream)
{
	if (cur_stream->sndvar->nif_out < 0)
		return mtcp->g_sender;

	if (cur_stream->sndvar->nif_out < CONFIG.eths_num)
		return mtcp->n_sender[cur_stream->sndvar->nif_out];

	TRACE_ERROR("(NEVER HAPPEN) Failed to find appropriate sender.\n");
	return NULL;
}
/*----------------------------------------------------------------------------*/
/*
 * AddtoControlList - queue the stream on its sender's control list.
 *
 * Without TRY_SEND_BEFORE_QUEUE the stream is appended to the control list
 * unless it is already there. With TRY_SEND_BEFORE_QUEUE the control packet
 * is sent right away; the stream is queued only when that fails and is
 * removed from the list when it succeeds.
 */
inline void
AddtoControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream, uint32_t cur_ts)
{
#if TRY_SEND_BEFORE_QUEUE
int ret;
struct mtcp_sender *sender = GetSender(mtcp, cur_stream);
assert(sender != NULL);
ret = SendControlPacket(mtcp, cur_stream, cur_ts);
if (ret < 0) {
#endif
/* on_control_list guards against inserting the stream twice */
if (!cur_stream->sndvar->on_control_list) {
struct mtcp_sender *sender = GetSender(mtcp, cur_stream);
assert(sender != NULL);
cur_stream->sndvar->on_control_list = TRUE;
TAILQ_INSERT_TAIL(&sender->control_list, cur_stream, sndvar->control_link);
sender->control_list_cnt++;
//TRACE_DBG("Stream %u: added to control list (cnt: %d)\n",
// cur_stream->id, sender->control_list_cnt);
}
#if TRY_SEND_BEFORE_QUEUE
} else {
/* NOTE(review): SendControlPacket() destroys the stream after a successful
 * RST in TCP_ST_CLOSED; cur_stream is still dereferenced here - confirm. */
if (cur_stream->sndvar->on_control_list) {
cur_stream->sndvar->on_control_list = FALSE;
TAILQ_REMOVE(&sender->control_list, cur_stream, sndvar->control_link);
sender->control_list_cnt--;
}
}
#endif
}
/*----------------------------------------------------------------------------*/
/*
 * AddtoSendList - append the stream to its sender's send list.
 *
 * A stream without a send buffer is reported and ignored. A stream whose
 * on_send_list flag is already set is left where it is.
 */
inline void
AddtoSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream)
{
	struct mtcp_sender *sender = GetSender(mtcp, cur_stream);

	assert(sender != NULL);

	if (cur_stream->sndvar->sndbuf == NULL) {
		TRACE_ERROR("[%d] Stream %d: No send buffer available.\n",
				mtcp->ctx->cpu,
				cur_stream->id);
		assert(0);
		return;
	}

	/* already queued: nothing to do */
	if (cur_stream->sndvar->on_send_list)
		return;

	cur_stream->sndvar->on_send_list = TRUE;
	TAILQ_INSERT_TAIL(&sender->send_list, cur_stream, sndvar->send_link);
	sender->send_list_cnt++;
}
/*----------------------------------------------------------------------------*/
/*
 * AddtoACKList - append the stream to its sender's ack list unless the
 * on_ack_list flag says it is already queued.
 */
inline void
AddtoACKList(mtcp_manager_t mtcp, tcp_stream *cur_stream)
{
	struct mtcp_sender *sender = GetSender(mtcp, cur_stream);

	assert(sender != NULL);

	/* already queued: nothing to do */
	if (cur_stream->sndvar->on_ack_list)
		return;

	cur_stream->sndvar->on_ack_list = TRUE;
	TAILQ_INSERT_TAIL(&sender->ack_list, cur_stream, sndvar->ack_link);
	sender->ack_list_cnt++;
}
/*----------------------------------------------------------------------------*/
/*
 * RemoveFromControlList - unlink the stream from its sender's control list
 * when the on_control_list flag is set; otherwise do nothing.
 */
inline void
RemoveFromControlList(mtcp_manager_t mtcp, tcp_stream *cur_stream)
{
	struct mtcp_sender *sender = GetSender(mtcp, cur_stream);

	assert(sender != NULL);

	/* not queued: nothing to do */
	if (!cur_stream->sndvar->on_control_list)
		return;

	cur_stream->sndvar->on_control_list = FALSE;
	TAILQ_REMOVE(&sender->control_list, cur_stream, sndvar->control_link);
	sender->control_list_cnt--;
}
/*----------------------------------------------------------------------------*/
/*
 * RemoveFromSendList - unlink the stream from its sender's send list when
 * the on_send_list flag is set; otherwise do nothing.
 */
inline void
RemoveFromSendList(mtcp_manager_t mtcp, tcp_stream *cur_stream)
{
	struct mtcp_sender *sender = GetSender(mtcp, cur_stream);

	assert(sender != NULL);

	/* not queued: nothing to do */
	if (!cur_stream->sndvar->on_send_list)
		return;

	cur_stream->sndvar->on_send_list = FALSE;
	TAILQ_REMOVE(&sender->send_list, cur_stream, sndvar->send_link);
	sender->send_list_cnt--;
}
/*----------------------------------------------------------------------------*/
/*
 * RemoveFromACKList - unlink the stream from its sender's ack list when the
 * on_ack_list flag is set; otherwise do nothing.
 */
inline void
RemoveFromACKList(mtcp_manager_t mtcp, tcp_stream *cur_stream)
{
	struct mtcp_sender *sender = GetSender(mtcp, cur_stream);

	assert(sender != NULL);

	/* not queued: nothing to do */
	if (!cur_stream->sndvar->on_ack_list)
		return;

	cur_stream->sndvar->on_ack_list = FALSE;
	TAILQ_REMOVE(&sender->ack_list, cur_stream, sndvar->ack_link);
	sender->ack_list_cnt--;
}
/*----------------------------------------------------------------------------*/
/*
 * EnqueueACK - record that an ACK is owed and put the stream on the ack list.
 *
 *   ACK_OPT_NOW        one more ACK is added to ack_cnt
 *   ACK_OPT_AGGREGATE  ack_cnt is raised to 1 only if it was 0
 *   ACK_OPT_WACK       a window-probe ACK is requested via is_wack
 *
 * Any other opt value changes no counters; the stream is queued in every
 * case. A debug trace is emitted when the stream is not in ESTABLISHED,
 * CLOSE_WAIT, FIN_WAIT_1 or FIN_WAIT_2.
 */
inline void
EnqueueACK(mtcp_manager_t mtcp,
		tcp_stream *cur_stream, uint32_t cur_ts, uint8_t opt)
{
	struct tcp_send_vars *sndvar = cur_stream->sndvar;

	if (cur_stream->state != TCP_ST_ESTABLISHED &&
			cur_stream->state != TCP_ST_CLOSE_WAIT &&
			cur_stream->state != TCP_ST_FIN_WAIT_1 &&
			cur_stream->state != TCP_ST_FIN_WAIT_2) {
		TRACE_DBG("Stream %u: Enqueueing ack at state %s\n",
				cur_stream->id, TCPStateToString(cur_stream));
	}

	switch (opt) {
	case ACK_OPT_NOW:
		/*
		 * Intended as a wrap-around guard for the counter.
		 * NOTE(review): if ack_cnt is narrower than int, integer promotion
		 * makes this comparison always true - confirm the field type.
		 */
		if (sndvar->ack_cnt < sndvar->ack_cnt + 1)
			sndvar->ack_cnt++;
		break;
	case ACK_OPT_AGGREGATE:
		if (sndvar->ack_cnt == 0)
			sndvar->ack_cnt = 1;
		break;
	case ACK_OPT_WACK:
		sndvar->is_wack = TRUE;
		break;
	default:
		break;
	}

	AddtoACKList(mtcp, cur_stream);
}
/*----------------------------------------------------------------------------*/
/*
 * DumpControlList - trace the control list counter followed by the id of
 * every stream currently linked on sender->control_list.
 */
inline void
DumpControlList(mtcp_manager_t mtcp, struct mtcp_sender *sender)
{
	tcp_stream *stream;

	TRACE_DBG("Dumping control list (count: %d):\n", sender->control_list_cnt);

	for (stream = TAILQ_FIRST(&sender->control_list);
			stream != NULL;
			stream = TAILQ_NEXT(stream, sndvar->control_link)) {
		TRACE_DBG("Stream id: %u in control list\n", stream->id);
	}
}

View File

@ -0,0 +1,131 @@
/*
* TCP free fragment queue for ring buffer - tcp_rb_frag_queue.c/h
*
* EunYoung Jeong
*
* Part of this code borrows Click's simple queue implementation
*
* ============================== Click License =============================
*
* Copyright (c) 1999-2000 Massachusetts Institute of Technology
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, subject to the conditions
* listed in the Click LICENSE file. These conditions include: you must
* preserve this copyright notice, and you cannot mention the copyright
* holders in advertising related to the Software without their permission.
* The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
* notice is a summary of the Click LICENSE file; the license in that file is
* legally binding.
*/
#include "tcp_rb_frag_queue.h"
#include "debug.h"
/*----------------------------------------------------------------------------*/
/* Index types used for queue positions; the guard makes sure the typedefs
 * are emitted only once per translation unit. */
#ifndef _INDEX_TYPE_
#define _INDEX_TYPE_
typedef uint32_t index_type; /* slot index into the queue array */
typedef int32_t signed_index_type; /* signed counterpart of index_type */
#endif
/*---------------------------------------------------------------------------*/
/*
 * Ring queue of fragment_ctx pointers.
 * CreateRBFragQueue() allocates _capacity + 1 slots; the queue is empty when
 * _head == _tail and full when the slot after _tail is _head.
 */
struct rb_frag_queue
{
index_type _capacity; /* number of storable entries; also the highest slot index */
volatile index_type _head; /* slot the next dequeue reads from */
volatile index_type _tail; /* slot the next enqueue writes to */
struct fragment_ctx * volatile * _q; /* slot array */
};
/*----------------------------------------------------------------------------*/
/* Slot that follows i in the ring; the slot after _capacity is slot 0. */
static inline index_type
NextIndex(rb_frag_queue_t rb_fragq, index_type i)
{
	if (i == rb_fragq->_capacity)
		return 0;
	return i + 1;
}
/*---------------------------------------------------------------------------*/
/* Slot that precedes i in the ring; the slot before slot 0 is _capacity. */
static inline index_type
PrevIndex(rb_frag_queue_t rb_fragq, index_type i)
{
	if (i == 0)
		return rb_fragq->_capacity;
	return i - 1;
}
/*---------------------------------------------------------------------------*/
/*
 * Empty inline-asm statement that names both arguments as memory inputs.
 * The template is empty, so no instruction is emitted. Both parameters are
 * passed by value, so the operands are the local copies.
 * NOTE(review): presumably meant to keep the compiler from reordering the
 * slot access and the index update around it; there is no "memory" clobber
 * and no hardware fence - confirm this is sufficient for the target.
 */
static inline void
RBFragMemoryBarrier(struct fragment_ctx * volatile frag, volatile index_type index)
{
__asm__ volatile("" : : "m" (frag), "m" (index));
}
/*---------------------------------------------------------------------------*/
/*
 * CreateRBFragQueue - allocate an empty fragment queue that can hold
 * `capacity` entries.
 *
 * The slot array has capacity + 1 elements because one slot always stays
 * unused to tell a full queue from an empty one.
 *
 * Returns the new queue, or NULL when capacity is negative or an allocation
 * fails.
 */
rb_frag_queue_t
CreateRBFragQueue(int capacity)
{
	rb_frag_queue_t rb_fragq;

	/*
	 * A negative capacity would be stored as a huge unsigned _capacity
	 * while the slot array is tiny (or the request overflows), so reject it.
	 */
	if (capacity < 0)
		return NULL;

	rb_fragq = (rb_frag_queue_t)calloc(1, sizeof(struct rb_frag_queue));
	if (!rb_fragq)
		return NULL;

	/* compute the slot count in size_t so capacity + 1 cannot overflow int */
	rb_fragq->_q = (struct fragment_ctx **)
			calloc((size_t)capacity + 1, sizeof(struct fragment_ctx *));
	if (!rb_fragq->_q) {
		free(rb_fragq);
		return NULL;
	}

	rb_fragq->_capacity = capacity;
	rb_fragq->_head = rb_fragq->_tail = 0;
	return rb_fragq;
}
/*---------------------------------------------------------------------------*/
/*
 * DestroyRBFragQueue - release the slot array and the queue object.
 * The queued fragment_ctx objects themselves are not freed here.
 * A NULL queue is ignored.
 */
void
DestroyRBFragQueue(rb_frag_queue_t rb_fragq)
{
	if (rb_fragq == NULL)
		return;

	/* free(NULL) is a no-op, so a missing slot array needs no special case */
	free((void *)rb_fragq->_q);
	rb_fragq->_q = NULL;

	free(rb_fragq);
}
/*---------------------------------------------------------------------------*/
/*
 * RBFragEnqueue - store frag in the slot at _tail and advance _tail.
 *
 * Returns 0 on success, -1 (after logging an error) when the queue is full.
 */
int
RBFragEnqueue(rb_frag_queue_t rb_fragq, struct fragment_ctx *frag)
{
	const index_type head = rb_fragq->_head;
	const index_type tail = rb_fragq->_tail;
	const index_type next_tail = NextIndex(rb_fragq, tail);

	/* advancing the tail onto the head would make a full queue look empty */
	if (next_tail == head) {
		TRACE_ERROR("Exceed capacity of frag queue!\n");
		return -1;
	}

	/* write the slot first, publish the new tail afterwards */
	rb_fragq->_q[tail] = frag;
	RBFragMemoryBarrier(rb_fragq->_q[tail], rb_fragq->_tail);
	rb_fragq->_tail = next_tail;
	return 0;
}
/*---------------------------------------------------------------------------*/
/*
 * RBFragDequeue - take the entry at _head out of the queue.
 *
 * Returns the dequeued fragment, or NULL when the queue is empty.
 */
struct fragment_ctx *
RBFragDequeue(rb_frag_queue_t rb_fragq)
{
	const index_type head = rb_fragq->_head;
	const index_type tail = rb_fragq->_tail;
	struct fragment_ctx *frag;

	/* head == tail means there is nothing queued */
	if (head == tail)
		return NULL;

	/* read the slot first, release it by advancing the head afterwards */
	frag = rb_fragq->_q[head];
	RBFragMemoryBarrier(rb_fragq->_q[head], rb_fragq->_head);
	rb_fragq->_head = NextIndex(rb_fragq, head);
	assert(frag);
	return frag;
}
/*---------------------------------------------------------------------------*/

401
mtcp/src/tcp_ring_buffer.c Normal file
View File

@ -0,0 +1,401 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <sys/types.h>
#include "tcp_ring_buffer.h"
#include "tcp_rb_frag_queue.h"
#include "memory_mgt.h"
#include "debug.h"
/* 16 MiB size constant for ring buffers */
#define MAX_RB_SIZE (16*1024*1024)
/* Larger / smaller of two values. Function-like macros: each argument may be
 * evaluated twice, so do not pass expressions with side effects. */
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
/*----------------------------------------------------------------------------*/
/*
 * Bookkeeping shared by all ring buffers created through one manager.
 * The trailing declarator also defines a file-scope object named rb_manager,
 * which RBManagerCreate() uses as its sizeof operand.
 */
struct rb_manager
{
size_t chunk_size; /* size in bytes of one ring buffer data chunk */
uint32_t cur_num; /* ring buffers currently allocated (RBInit ++, RBFree --) */
uint32_t cnum; /* chunk count the pools and free queues are sized for */
mem_pool_t mp; /* pool providing ring buffer data chunks */
mem_pool_t frag_mp; /* pool providing fragment_ctx objects */
rb_frag_queue_t free_fragq; /* free fragment queue (for app thread) */
rb_frag_queue_t free_fragq_int; /* free fragment queue (only for mtcp) */
} rb_manager;
/*----------------------------------------------------------------------------*/
/* RBGetCurnum - report the manager's cur_num counter of live ring buffers. */
uint32_t
RBGetCurnum(rb_manager_t rbm)
{
	const uint32_t in_use = rbm->cur_num;

	return in_use;
}
/*-----------------------------------------------------------------------------*/
/*
 * RBPrintInfo - print the ring buffer's bookkeeping fields to stdout on one
 * line: data pointer, size, merged length, cumulative length, head pointer
 * with its offset, and the tail offset.
 * NOTE(review): the conversion specifiers (%d, %lu) must match the field
 * types declared in tcp_ring_buffer.h - confirm against the header.
 */
void
RBPrintInfo(struct tcp_ring_buffer* buff)
{
printf("buff_data %p, buff_size %d, buff_mlen %d, "
"buff_clen %lu, buff_head %p (%d), buff_tail (%d)\n",
buff->data, buff->size, buff->merged_len, buff->cum_len,
buff->head, buff->head_offset, buff->tail_offset);
}
/*----------------------------------------------------------------------------*/
/*
 * RBPrintStr - print the buffer's bookkeeping line, then the bytes starting
 * at buff->head as a C string followed by a newline.
 * Output stops at the first zero byte, so embedded zero bytes truncate it.
 */
void
RBPrintStr(struct tcp_ring_buffer* buff)
{
RBPrintInfo(buff);
printf("%s\n", buff->head);
}
/*----------------------------------------------------------------------------*/
/*
 * RBPrintHex - print the buffer's bookkeeping line, then the first
 * merged_len bytes at buff->head in hexadecimal, 16 bytes per output line.
 */
void
RBPrintHex(struct tcp_ring_buffer* buff)
{
	int i = 0;

	RBPrintInfo(buff);

	while (i < buff->merged_len) {
		/* start a new output line before every 16th byte except the first */
		if (i > 0 && (i % 16) == 0)
			printf("\n");
		printf("%0x ", ((unsigned char *)buff->head)[i]);
		i++;
	}
	printf("\n");
}
/*----------------------------------------------------------------------------*/
/*
 * RBManagerCreate - build a ring buffer manager for `cnum` buffers of
 * `chunk_size` bytes each.
 *
 * Creates the data chunk pool, the fragment_ctx pool and the two free
 * fragment queues. When any step fails, everything created so far is torn
 * down and NULL is returned.
 */
rb_manager_t
RBManagerCreate(size_t chunk_size, uint32_t cnum)
{
	rb_manager_t rbm = (rb_manager_t) calloc(1, sizeof(*rbm));

	if (rbm == NULL) {
		perror("rbm_create calloc");
		return NULL;
	}
	rbm->chunk_size = chunk_size;
	rbm->cnum = cnum;

	rbm->mp = (mem_pool_t)MPCreate(chunk_size, (uint64_t)chunk_size * cnum, 0);
	if (rbm->mp == NULL) {
		TRACE_ERROR("Failed to allocate mp pool.\n");
		goto fail;
	}

	rbm->frag_mp = (mem_pool_t)MPCreate(sizeof(struct fragment_ctx),
			sizeof(struct fragment_ctx) * cnum, 0);
	if (rbm->frag_mp == NULL) {
		TRACE_ERROR("Failed to allocate frag_mp pool.\n");
		goto fail;
	}

	rbm->free_fragq = CreateRBFragQueue(cnum);
	if (rbm->free_fragq == NULL) {
		TRACE_ERROR("Failed to create free fragment queue.\n");
		goto fail;
	}

	rbm->free_fragq_int = CreateRBFragQueue(cnum);
	if (rbm->free_fragq_int == NULL) {
		TRACE_ERROR("Failed to create internal free fragment queue.\n");
		goto fail;
	}

	return rbm;

fail:
	/* rbm came from calloc, so members not created yet are still NULL */
	if (rbm->mp != NULL)
		MPDestroy(rbm->mp);
	if (rbm->frag_mp != NULL)
		MPDestroy(rbm->frag_mp);
	if (rbm->free_fragq != NULL)
		DestroyRBFragQueue(rbm->free_fragq);
	free(rbm);
	return NULL;
}
/*----------------------------------------------------------------------------*/
/*
 * Release one fragment descriptor through the allocator it came from:
 * the fragment pool by default, free() when is_calloc is set.
 */
static inline void
FreeFragmentContextSingle(rb_manager_t rbm, struct fragment_ctx* frag)
{
	if (!frag->is_calloc) {
		MPFreeChunk(rbm->frag_mp, frag);
		return;
	}
	free(frag);
}
/*----------------------------------------------------------------------------*/
/*
 * FreeFragmentContext - release every descriptor of the chain that starts
 * at fctx. A NULL chain trips the assertion in debug builds and is ignored
 * otherwise.
 */
void
FreeFragmentContext(rb_manager_t rbm, struct fragment_ctx* fctx)
{
	struct fragment_ctx *cur;
	struct fragment_ctx *following;

	assert(fctx);

	for (cur = fctx; cur != NULL; cur = following) {
		/* read the link before the node is released */
		following = cur->next;
		FreeFragmentContextSingle(rbm, cur);
	}
}
/*----------------------------------------------------------------------------*/
/*
 * AllocateFragmentContext - obtain a zeroed fragment descriptor.
 *
 * Sources are tried in this order: the application-side free queue, the
 * internal free queue, the fragment pool, and finally calloc(). The process
 * exits when even calloc() fails, so the function never returns NULL.
 *
 * The is_calloc tag records which allocator owns the descriptor and is what
 * FreeFragmentContextSingle() dispatches on. It is therefore carried across
 * the memset below; clearing it would make a calloc'ed descriptor be handed
 * back to the fragment pool.
 */
static struct fragment_ctx *
AllocateFragmentContext(rb_manager_t rbm)
{
	/* this function should be called only in mtcp thread */
	struct fragment_ctx *frag;
	int from_calloc;

	/* first try to dequeue a fragment from the free fragment queues */
	frag = RBFragDequeue(rbm->free_fragq);
	if (!frag)
		frag = RBFragDequeue(rbm->free_fragq_int);

	if (frag) {
		/* recycled descriptor: keep the tag it was created with */
		from_calloc = frag->is_calloc ? 1 : 0;
	} else {
		/* next fall back to fetching from mempool */
		frag = MPAllocateChunk(rbm->frag_mp);
		from_calloc = 0;
		if (!frag) {
			TRACE_ERROR("fragments depleted, fall back to calloc\n");
			frag = calloc(1, sizeof(struct fragment_ctx));
			if (frag == NULL) {
				TRACE_ERROR("calloc failed\n");
				exit(-1);
			}
			from_calloc = 1; /* mark it as allocated by calloc */
		}
	}

	memset(frag, 0, sizeof(*frag));
	/* restore the allocator tag that the memset just wiped */
	frag->is_calloc = from_calloc;
	return frag;
}
/*----------------------------------------------------------------------------*/
/*
 * RBInit - create a ring buffer whose first byte has sequence number
 * init_seq.
 *
 * The descriptor comes from calloc(), the data area is one chunk of the
 * manager's pool. On success the manager's cur_num counter is incremented.
 *
 * Returns the new buffer, or NULL when either allocation fails; nothing
 * stays allocated in that case.
 */
struct tcp_ring_buffer*
RBInit(rb_manager_t rbm, uint32_t init_seq)
{
	struct tcp_ring_buffer* buff =
			(struct tcp_ring_buffer*)calloc(1, sizeof(struct tcp_ring_buffer));

	if (buff == NULL) {
		perror("rb_init buff");
		return NULL;
	}

	buff->data = MPAllocateChunk(rbm->mp);
	if (!buff->data) {
		perror("rb_init MPAllocateChunk");
		/* the descriptor would otherwise leak on this path */
		free(buff);
		return NULL;
	}

	//memset(buff->data, 0, rbm->chunk_size);
	buff->size = rbm->chunk_size;
	buff->head = buff->data;
	buff->head_seq = init_seq;
	buff->init_seq = init_seq;
	rbm->cur_num++;
	return buff;
}
/*----------------------------------------------------------------------------*/
/*
 * RBFree - destroy a ring buffer: release its fragment chain, return the
 * data chunk to the manager's pool, free the descriptor and decrement the
 * manager's cur_num counter.
 */
void
RBFree(rb_manager_t rbm, struct tcp_ring_buffer* buff)
{
	assert(buff);

	if (buff->fctx != NULL) {
		struct fragment_ctx *chain = buff->fctx;

		/* detach the chain before it is released */
		buff->fctx = NULL;
		FreeFragmentContext(rbm, chain);
	}

	if (buff->data != NULL)
		MPFreeChunk(rbm->mp, buff->data);

	free(buff);
	rbm->cur_num--;
}
/*----------------------------------------------------------------------------*/
/* largest 32-bit sequence number */
#define MAXSEQ ((uint32_t)(0xFFFFFFFF))
/*----------------------------------------------------------------------------*/
/*
 * GetMinSeq - the earlier of two sequence numbers, taking wrap-around into
 * account: when the numeric distance between them is more than half the
 * sequence space, the numerically larger value is treated as the earlier one.
 */
static inline uint32_t
GetMinSeq(uint32_t a, uint32_t b)
{
	const uint32_t lo = (a < b) ? a : b;
	const uint32_t hi = (a < b) ? b : a;

	return ((hi - lo) <= MAXSEQ / 2) ? lo : hi;
}
/*----------------------------------------------------------------------------*/
/*
 * GetMaxSeq - the later of two sequence numbers, taking wrap-around into
 * account: when the numeric distance between them is more than half the
 * sequence space, the numerically smaller value is treated as the later one.
 */
static inline uint32_t
GetMaxSeq(uint32_t a, uint32_t b)
{
	const uint32_t lo = (a < b) ? a : b;
	const uint32_t hi = (a < b) ? b : a;

	return ((hi - lo) <= MAXSEQ / 2) ? hi : lo;
}
/*----------------------------------------------------------------------------*/
/*
 * CanMerge - tell whether two fragments overlap or touch.
 *
 * Each fragment's limit is seq + len + 1, so a fragment that starts exactly
 * where the other one ends still counts as mergeable. Returns 1 when the
 * fragments can be merged and 0 when a gap separates them.
 */
static inline int
CanMerge(const struct fragment_ctx *a, const struct fragment_ctx *b)
{
	const uint32_t a_limit = a->seq + a->len + 1;
	const uint32_t b_limit = b->seq + b->len + 1;
	/* a lies entirely before b, with at least one byte in between */
	const int a_before_b = (GetMinSeq(a_limit, b->seq) == a_limit);
	/* b lies entirely before a, with at least one byte in between */
	const int b_before_a = (GetMinSeq(b_limit, a->seq) == b_limit);

	return !(a_before_b || b_before_a);
}
/*----------------------------------------------------------------------------*/
/*
 * MergeFragments - widen fragment b so that it covers the union of a and b.
 * Fragment a is left unchanged.
 */
static inline void
MergeFragments(struct fragment_ctx *a, struct fragment_ctx *b)
{
	/* both bounds are computed before b is modified */
	const uint32_t merged_start = GetMinSeq(a->seq, b->seq);
	const uint32_t merged_end = GetMaxSeq(a->seq + a->len, b->seq + b->len);

	b->seq = merged_start;
	b->len = merged_end - merged_start;
}
/*----------------------------------------------------------------------------*/
/*
 * RBPut - copy `len` bytes that start at sequence number cur_seq into the
 * ring buffer and record them in the buffer's fragment list.
 *
 * The new range is merged with every fragment it overlaps or touches. When
 * the first fragment starts at head_seq, merged_len and cum_len are updated
 * to that fragment's length.
 *
 * Returns len on success; 0 when len is 0, when cur_seq lies before
 * head_seq, or when no fragment descriptor could be obtained; -2 when the
 * data would end at or beyond the buffer size.
 */
int
RBPut(rb_manager_t rbm, struct tcp_ring_buffer* buff,
void* data, uint32_t len, uint32_t cur_seq)
{
int putx, end_off;
struct fragment_ctx *new_ctx;
struct fragment_ctx* iter;
struct fragment_ctx* prev, *pprev;
int merged = 0;
/* len is unsigned, so this only catches len == 0 */
if (len <= 0)
return 0;
// if data offset is smaller than head sequence, then drop
if (GetMinSeq(buff->head_seq, cur_seq) != buff->head_seq)
return 0;
/* offsets of the new data relative to the head of the buffer */
putx = cur_seq - buff->head_seq;
end_off = putx + len;
if (buff->size <= end_off) {
return -2;
}
// if buffer is at tail, move the data to the first of head
if (buff->size <= (buff->head_offset + end_off)) {
/* last_len + 1 also moves the terminating zero byte */
memmove(buff->data, buff->head, buff->last_len + 1);
buff->tail_offset -= buff->head_offset;
buff->head_offset = 0;
buff->head = buff->data;
}
//copy data to buffer
memcpy(buff->head + putx, data, len);
/* the tail only ever grows; data inside the current range leaves it alone */
if (buff->tail_offset < buff->head_offset + end_off)
buff->tail_offset = buff->head_offset + end_off;
buff->last_len = buff->tail_offset - buff->head_offset;
buff->head[buff->last_len] = 0; /* null termination */
// create fragmentation context blocks
new_ctx = AllocateFragmentContext(rbm);
/* defensive check: AllocateFragmentContext() exits instead of returning NULL */
if (!new_ctx) {
perror("allocating new_ctx failed");
return 0;
}
new_ctx->seq = cur_seq;
new_ctx->len = len;
new_ctx->next = NULL;
// traverse the fragment list, and merge the new fragment if possible
for (iter = buff->fctx, prev = NULL, pprev = NULL;
iter != NULL;
pprev = prev, prev = iter, iter = iter->next) {
if (CanMerge(new_ctx, iter)) {
/* merge the first fragment into the second fragment */
MergeFragments(new_ctx, iter);
/* remove the first fragment */
/* after an earlier merge new_ctx is a list node (the predecessor of
iter), so it has to be unlinked before it is released */
if (prev == new_ctx) {
if (pprev)
pprev->next = iter;
else
buff->fctx = iter;
prev = pprev;
}
FreeFragmentContextSingle(rbm, new_ctx);
/* continue with the merged node as the fragment to extend */
new_ctx = iter;
merged = 1;
}
else if (merged ||
GetMaxSeq(cur_seq + len, iter->seq) == iter->seq) {
/* merged at some point, but no more mergeable
then stop it now */
/* also taken when iter starts at or after the end of the new data */
break;
}
}
if (!merged) {
/* no overlap at all: link new_ctx into the list as a separate fragment */
if (buff->fctx == NULL) {
buff->fctx = new_ctx;
} else if (GetMinSeq(cur_seq, buff->fctx->seq) == cur_seq) {
/* if the new packet's seqnum is before the existing fragments */
new_ctx->next = buff->fctx;
buff->fctx = new_ctx;
} else {
/* if the seqnum is in-between the fragments or
at the last */
assert(GetMinSeq(cur_seq, prev->seq + prev->len) ==
prev->seq + prev->len);
/* prev is the last fragment before the new data, iter the first after it */
prev->next = new_ctx;
new_ctx->next = iter;
}
}
/* the first fragment is contiguous with the head: account for the new in-order bytes */
if (buff->head_seq == buff->fctx->seq) {
buff->cum_len += buff->fctx->len - buff->merged_len;
buff->merged_len = buff->fctx->len;
}
return len;
}
/*----------------------------------------------------------------------------*/
/*
 * RBRemove - consume up to `len` in-order bytes from the head of the buffer.
 *
 * The request is clamped to merged_len. Head position, head sequence and
 * the length counters move forward by the consumed amount, and the first
 * fragment is shortened or, when fully consumed, unlinked and handed to the
 * free queue selected by `option` (AT_APP or AT_MTCP).
 *
 * Returns the number of bytes removed.
 */
size_t
RBRemove(rb_manager_t rbm, struct tcp_ring_buffer* buff, size_t len, int option)
{
	/* this function should be called only in application thread */
	struct fragment_ctx *front;

	if (buff->merged_len < len)
		len = buff->merged_len;
	if (len == 0)
		return 0;

	buff->head_offset += len;
	buff->head = buff->data + buff->head_offset;
	buff->head_seq += len;
	buff->merged_len -= len;
	buff->last_len -= len;

	// modify fragmentation chunks
	front = buff->fctx;
	if (len > front->len) {
		/* more bytes consumed than the first fragment holds */
		assert(0);
	} else if (len < front->len) {
		/* partially consumed: trim the front of the fragment */
		front->seq += len;
		front->len -= len;
	} else {
		/* fully consumed: unlink it and recycle the descriptor */
		buff->fctx = front->next;
		if (option == AT_APP)
			RBFragEnqueue(rbm->free_fragq, front);
		else if (option == AT_MTCP)
			RBFragEnqueue(rbm->free_fragq_int, front);
	}
	return len;
}
/*----------------------------------------------------------------------------*/

131
mtcp/src/tcp_sb_queue.c Normal file
View File

@ -0,0 +1,131 @@
/*
* TCP free send buffer queue - tcp_sb_queue.c/h
*
* EunYoung Jeong
*
* Part of this code borrows Click's simple queue implementation
*
* ============================== Click License =============================
*
* Copyright (c) 1999-2000 Massachusetts Institute of Technology
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, subject to the conditions
* listed in the Click LICENSE file. These conditions include: you must
* preserve this copyright notice, and you cannot mention the copyright
* holders in advertising related to the Software without their permission.
* The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
* notice is a summary of the Click LICENSE file; the license in that file is
* legally binding.
*/
#include "tcp_sb_queue.h"
#include "debug.h"
/*----------------------------------------------------------------------------*/
/* Index types used for queue positions; the guard makes sure the typedefs
 * are emitted only once per translation unit. */
#ifndef _INDEX_TYPE_
#define _INDEX_TYPE_
typedef uint32_t index_type; /* slot index into the queue array */
typedef int32_t signed_index_type; /* signed counterpart of index_type */
#endif
/*---------------------------------------------------------------------------*/
/*
 * Ring queue of tcp_send_buffer pointers.
 * CreateSBQueue() allocates _capacity + 1 slots; the queue is empty when
 * _head == _tail and full when the slot after _tail is _head.
 */
struct sb_queue
{
index_type _capacity; /* number of storable entries; also the highest slot index */
volatile index_type _head; /* slot the next dequeue reads from */
volatile index_type _tail; /* slot the next enqueue writes to */
struct tcp_send_buffer * volatile * _q; /* slot array */
};
/*----------------------------------------------------------------------------*/
/* Return the slot index following i, wrapping from _capacity back to 0. */
static inline index_type
NextIndex(sb_queue_t sq, index_type i)
{
	if (i == sq->_capacity) {
		return 0;
	}

	return i + 1;
}
/*---------------------------------------------------------------------------*/
/* Return the slot index preceding i, wrapping from 0 back to _capacity. */
static inline index_type
PrevIndex(sb_queue_t sq, index_type i)
{
	if (i == 0) {
		return sq->_capacity;
	}

	return i - 1;
}
/*---------------------------------------------------------------------------*/
/* Ordering hint placed between the slot access and the index update in
 * SBEnqueue()/SBDequeue(). The asm template is empty, and both arguments are
 * by-value copies listed as "m" inputs.
 * NOTE(review): there is no "memory" clobber and no hardware fence here --
 * confirm this is sufficient for the intended cross-thread use. */
static inline void
SBMemoryBarrier(struct tcp_send_buffer * volatile buf, volatile index_type index)
{
__asm__ volatile("" : : "m" (buf), "m" (index));
}
/*---------------------------------------------------------------------------*/
sb_queue_t
CreateSBQueue(int capacity)
{
sb_queue_t sq;
sq = (sb_queue_t)calloc(1, sizeof(struct sb_queue));
if (!sq)
return NULL;
sq->_q = (struct tcp_send_buffer **)
calloc(capacity + 1, sizeof(struct tcp_send_buffer *));
if (!sq->_q) {
free(sq);
return NULL;
}
sq->_capacity = capacity;
sq->_head = sq->_tail = 0;
return sq;
}
/*---------------------------------------------------------------------------*/
/*
 * Release a queue created by CreateSBQueue(). A NULL queue is ignored.
 * Only the slot array and the queue header are freed; the buffers the slots
 * point to are not touched.
 */
void
DestroySBQueue(sb_queue_t sq)
{
	if (sq != NULL) {
		if (sq->_q != NULL) {
			free((void *)sq->_q);
			sq->_q = NULL;
		}

		free(sq);
	}
}
/*---------------------------------------------------------------------------*/
/* Append buf at the tail of the queue.
 * Returns 0 on success. Returns -1, after logging through TRACE_ERROR, when
 * advancing the tail would make it equal to the head (queue full).
 * NOTE(review): looks designed for one enqueuing and one dequeuing thread --
 * confirm against the callers. */
int
SBEnqueue(sb_queue_t sq, struct tcp_send_buffer *buf)
{
index_type h = sq->_head;
index_type t = sq->_tail;
index_type nt = NextIndex(sq, t);
if (nt != h) {
/* fill the slot first, then publish it by moving _tail */
sq->_q[t] = buf;
SBMemoryBarrier(sq->_q[t], sq->_tail);
sq->_tail = nt;
return 0;
}
TRACE_ERROR("Exceed capacity of buf queue!\n");
return -1;
}
/*---------------------------------------------------------------------------*/
/* Remove and return the buffer at the head of the queue.
 * Returns NULL when the queue is empty (_head == _tail). A dequeued entry is
 * asserted to be non-NULL. */
struct tcp_send_buffer *
SBDequeue(sb_queue_t sq)
{
index_type h = sq->_head;
index_type t = sq->_tail;
if (h != t) {
/* read the slot first, then release it by moving _head */
struct tcp_send_buffer *buf = sq->_q[h];
SBMemoryBarrier(sq->_q[h], sq->_head);
sq->_head = NextIndex(sq, h);
assert(buf);
return buf;
}
return NULL;
}
/*---------------------------------------------------------------------------*/

171
mtcp/src/tcp_send_buffer.c Normal file
View File

@ -0,0 +1,171 @@
#include <string.h>
#include "memory_mgt.h"
#include "debug.h"
#include "tcp_send_buffer.h"
#include "tcp_sb_queue.h"
/* Function-like max/min helpers. The selected argument is expanded twice, so
 * do not pass expressions with side effects. */
#define MAX(a, b) ((a)>(b)?(a):(b))
#define MIN(a, b) ((a)<(b)?(a):(b))
/*----------------------------------------------------------------------------*/
/* Bookkeeping for the pool of TCP send buffers.
 * There is no typedef keyword here: the trailing `sb_manager` declares a
 * file-scope object of this type, which SBManagerCreate() uses as the
 * operand of sizeof. */
struct sb_manager
{
size_t chunk_size; /* bytes per data chunk; copied into buf->size by SBInit() */
uint32_t cur_num; /* incremented by SBInit() each time it allocates a new buffer */
uint32_t cnum; /* chunk count passed to SBManagerCreate() */
mem_pool_t mp; /* pool that data chunks are taken from */
sb_queue_t freeq; /* buffers handed back through SBFree(), reused by SBInit() */
} sb_manager;
/*----------------------------------------------------------------------------*/
/* Report the manager's cur_num counter, which SBInit() increments for every
 * buffer it allocates. */
uint32_t
SBGetCurnum(sb_manager_t sbm)
{
	const uint32_t allocated = sbm->cur_num;

	return allocated;
}
/*----------------------------------------------------------------------------*/
/*
 * Create a send-buffer manager.
 *
 * chunk_size: size in bytes of each buffer's data chunk.
 * cnum:       number of chunks; sizes both the memory pool
 *             (chunk_size * cnum bytes) and the free-buffer queue.
 *
 * Returns the manager, or NULL on failure. Every failure path releases what
 * was allocated before it, so nothing is leaked when NULL is returned.
 */
sb_manager_t
SBManagerCreate(size_t chunk_size, uint32_t cnum)
{
	sb_manager_t sbm = (sb_manager_t)calloc(1, sizeof(*sbm));
	if (!sbm) {
		TRACE_ERROR("SBManagerCreate() failed. %s\n", strerror(errno));
		return NULL;
	}

	sbm->chunk_size = chunk_size;
	sbm->cnum = cnum;

	/* widen before multiplying so the total size cannot wrap in 32 bits */
	sbm->mp = (mem_pool_t)MPCreate(chunk_size, (uint64_t)chunk_size * cnum, 0);
	if (!sbm->mp) {
		TRACE_ERROR("Failed to create mem pool for sb.\n");
		free(sbm);
		return NULL;
	}

	sbm->freeq = CreateSBQueue(cnum);
	if (!sbm->freeq) {
		TRACE_ERROR("Failed to create free buffer queue.\n");
		/* undo the pool and the manager; both were leaked here before */
		MPDestroy(sbm->mp);
		free(sbm);
		return NULL;
	}

	return sbm;
}
/*----------------------------------------------------------------------------*/
/*
 * Hand out a send buffer whose first byte carries sequence number init_seq.
 *
 * A buffer is taken from the manager's free queue when one is available;
 * otherwise a new descriptor is malloc'ed and a data chunk is fetched from
 * the memory pool. In both cases offsets, lengths and sequence numbers are
 * reset before the buffer is returned.
 *
 * Returns the buffer, or NULL when the descriptor or its data chunk cannot
 * be allocated.
 */
struct tcp_send_buffer *
SBInit(sb_manager_t sbm, uint32_t init_seq)
{
	struct tcp_send_buffer *buf;

	/* first try dequeue from free buffer queue */
	buf = SBDequeue(sbm->freeq);
	if (!buf) {
		buf = (struct tcp_send_buffer *)malloc(sizeof(struct tcp_send_buffer));
		if (!buf) {
			perror("malloc() for buf");
			return NULL;
		}
		buf->data = MPAllocateChunk(sbm->mp);
		if (!buf->data) {
			TRACE_ERROR("Failed to fetch memory chunk for data.\n");
			/* do not leak the descriptor allocated just above */
			free(buf);
			return NULL;
		}
		sbm->cur_num++;
	}

	buf->head = buf->data;
	buf->head_off = buf->tail_off = 0;
	buf->len = buf->cum_len = 0;
	buf->size = sbm->chunk_size;

	buf->init_seq = buf->head_seq = init_seq;

	return buf;
}
/*----------------------------------------------------------------------------*/
/* Compiled out by the surrounding "#if 0". When enabled, this would return
 * the data chunk to the memory pool, decrement cur_num and free the
 * descriptor. */
#if 0
static void
SBFreeInternal(sb_manager_t sbm, struct tcp_send_buffer *buf)
{
if (!buf)
return;
if (buf->data) {
MPFreeChunk(sbm->mp, buf->data);
buf->data = NULL;
}
sbm->cur_num--;
free(buf);
}
#endif
/*----------------------------------------------------------------------------*/
/*
 * Give a send buffer back to the manager. A NULL buffer is ignored.
 * The buffer is not deallocated: it is parked on the free queue so that
 * SBInit() can hand it out again. The result of SBEnqueue() is not checked.
 */
void
SBFree(sb_manager_t sbm, struct tcp_send_buffer *buf)
{
	if (buf != NULL) {
		SBEnqueue(sbm->freeq, buf);
	}
}
/*----------------------------------------------------------------------------*/
/*
 * Append up to len bytes from data to the send buffer.
 *
 * The request is clamped to the free space (size - len). When the bytes do
 * not fit strictly before the end of the chunk, the pending payload is first
 * moved to the start of the chunk and the new bytes are appended after it.
 *
 * Returns the number of bytes stored, 0 when len is 0, or (size_t)-2 when
 * the buffer has no free space.
 */
size_t
SBPut(sb_manager_t sbm, struct tcp_send_buffer *buf, void *data, size_t len)
{
	size_t room;
	size_t to_put;

	if (len == 0) {
		return 0;
	}

	/* clamp the request to the space that is still free */
	room = buf->size - buf->len;
	to_put = (len < room) ? len : room;
	if (to_put == 0) {
		/* no space: reported as -2 converted to size_t */
		return (size_t)-2;
	}

	if (buf->tail_off + to_put >= buf->size) {
		/* would reach the end of the chunk: compact, then append */
		memmove(buf->data, buf->head, buf->len);
		buf->head = buf->data;
		buf->head_off = 0;
		memcpy(buf->head + buf->len, data, to_put);
		buf->tail_off = buf->len + to_put;
	} else {
		/* fits behind the current tail */
		memcpy(buf->data + buf->tail_off, data, to_put);
		buf->tail_off += to_put;
	}

	buf->len += to_put;
	buf->cum_len += to_put;

	return to_put;
}
/*----------------------------------------------------------------------------*/
/*
 * Drop up to len bytes from the front of the send buffer.
 *
 * The head offset, head pointer and head sequence number advance by the
 * number of bytes removed. When the buffer becomes empty, head and tail
 * offsets are rewound to the start of the chunk.
 *
 * Returns the number of bytes removed, 0 when len is 0, or (size_t)-2 when
 * the buffer holds no data.
 */
size_t
SBRemove(sb_manager_t sbm, struct tcp_send_buffer *buf, size_t len)
{
	size_t stored;
	size_t to_remove;

	if (len == 0) {
		return 0;
	}

	/* never remove more than what is currently buffered */
	stored = buf->len;
	to_remove = (len < stored) ? len : stored;
	if (to_remove == 0) {
		return (size_t)-2;
	}

	buf->head_off += to_remove;
	buf->head = buf->data + buf->head_off;
	buf->head_seq += to_remove;
	buf->len -= to_remove;

	/* if buffer is empty, move the head to 0 */
	if (buf->len == 0 && buf->head_off > 0) {
		buf->head = buf->data;
		buf->tail_off = 0;
		buf->head_off = 0;
	}

	return to_remove;
}
/*---------------------------------------------------------------------------*/

609
mtcp/src/tcp_stream.c Normal file
View File

@ -0,0 +1,609 @@
#include "tcp_stream.h"
#include "fhash.h"
#include "tcp_in.h"
#include "tcp_out.h"
#include "tcp_ring_buffer.h"
#include "tcp_send_buffer.h"
#include "eventpoll.h"
#include "ip_out.h"
#include "timer.h"
#include "debug.h"
/* 2^32 - 1; CreateTCPStream() uses it as the modulus when picking the
 * initial send sequence number. */
#define TCP_MAX_SEQ 4294967295
/*---------------------------------------------------------------------------*/
/* Printable names of the TCP states, indexed by stream->state (see
 * TCPStateToString()). The order must match the state enumeration. */
char *state_str[] = {"TCP_ST_CLOSED",
	"TCP_ST_LISTEN",
	"TCP_ST_SYN_SENT",
	"TCP_ST_SYN_RCVD",
	"TCP_ST_ESTABLISHED",
	"TCP_ST_FIN_WAIT_1",
	"TCP_ST_FIN_WAIT_2",
	"TCP_ST_CLOSE_WAIT",
	"TCP_ST_CLOSING",
	"TCP_ST_LAST_ACK",
	"TCP_ST_TIME_WAIT"
};
/*---------------------------------------------------------------------------*/
/* Printable names of the close reasons, indexed by stream->close_reason. */
char *close_reason_str[] = {
	[0] = "NOT_CLOSED",
	[1] = "CLOSE",
	[2] = "CLOSED",
	[3] = "CONN_FAIL",
	[4] = "CONN_LOST",
	[5] = "RESET",
	[6] = "NO_MEM",
	[7] = "DENIED",
	[8] = "TIMEDOUT",
};
/*---------------------------------------------------------------------------*/
/* Look up the printable name of the stream's current state in state_str.
 * The state value is used as the index without a range check. */
inline char *
TCPStateToString(const tcp_stream *stream)
{
	char *name = state_str[stream->state];

	return name;
}
/*---------------------------------------------------------------------------*/
/* Hash a flow into a bin index in [0, NUM_BINS).
 * The live (#else) branch mixes the 12 bytes starting at &flow->saddr with a
 * shift/add/xor loop and a final avalanche step, then masks the result with
 * NUM_BINS - 1.
 * NOTE(review): presumably those 12 bytes are the address/port 4-tuple laid
 * out contiguously in tcp_stream -- confirm against the struct definition.
 * NOTE(review): the mask assumes NUM_BINS is a power of two -- confirm.
 * NOTE(review): key is plain char, so bytes >= 0x80 may be sign-extended
 * depending on the platform's char signedness. */
unsigned int
HashFlow(const tcp_stream *flow)
{
#if 0
/* disabled alternative hash, kept for reference */
unsigned long hash = 5381;
int c;
int index;
char *str = (char *)&flow->saddr;
index = 0;
while ((c = *str++) && index++ < 12) {
if (index == 8) {
str = (char *)&flow->sport;
}
hash = ((hash << 5) + hash) + c;
}
return hash & (NUM_BINS - 1);
#else
unsigned int hash, i;
char *key = (char *)&flow->saddr;
/* per-byte mixing */
for (hash = i = 0; i < 12; ++i) {
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
/* final mixing */
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash & (NUM_BINS - 1);
#endif
}
/*---------------------------------------------------------------------------*/
/* Compare two flows by their 4-tuple.
 * Returns 1 when source address, source port, destination address and
 * destination port all match, 0 otherwise. */
int
EqualFlow(const tcp_stream *flow1, const tcp_stream *flow2)
{
	if (flow1->saddr != flow2->saddr) {
		return 0;
	}
	if (flow1->sport != flow2->sport) {
		return 0;
	}
	if (flow1->daddr != flow2->daddr) {
		return 0;
	}
	if (flow1->dport != flow2->dport) {
		return 0;
	}

	return 1;
}
/*---------------------------------------------------------------------------*/
/* Raise a read notification for the stream.
 * - Socket has MTCP_EPOLLIN in its epoll mask: queue an MTCP_EPOLLIN event
 *   on mtcp->ep.
 * - Otherwise, only when BLOCKING_SUPPORT is enabled and the socket is not
 *   MTCP_NONBLOCK: put the stream on mtcp->rcv_br_list unless it is already
 *   there.
 * - No socket attached: only a trace message is emitted. */
inline void
RaiseReadEvent(mtcp_manager_t mtcp, tcp_stream *stream)
{
if (stream->socket) {
if (stream->socket->epoll & MTCP_EPOLLIN) {
AddEpollEvent(mtcp->ep,
MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLIN);
#if BLOCKING_SUPPORT
} else if (!(stream->socket->opts & MTCP_NONBLOCK)) {
/* on_rcv_br_list guards against inserting the stream twice */
if (!stream->on_rcv_br_list) {
stream->on_rcv_br_list = TRUE;
TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link);
mtcp->rcv_br_list_cnt++;
}
#endif
}
} else {
TRACE_EPOLL("Stream %d: Raising read without a socket!\n", stream->id);
}
}
/*---------------------------------------------------------------------------*/
/* Raise a write notification for the stream.
 * - Socket has MTCP_EPOLLOUT in its epoll mask: queue an MTCP_EPOLLOUT event
 *   on mtcp->ep.
 * - Otherwise, only when BLOCKING_SUPPORT is enabled and the socket is not
 *   MTCP_NONBLOCK: put the stream on mtcp->snd_br_list unless it is already
 *   there.
 * - No socket attached: only a trace message is emitted. */
inline void
RaiseWriteEvent(mtcp_manager_t mtcp, tcp_stream *stream)
{
if (stream->socket) {
if (stream->socket->epoll & MTCP_EPOLLOUT) {
AddEpollEvent(mtcp->ep,
MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLOUT);
#if BLOCKING_SUPPORT
} else if (!(stream->socket->opts & MTCP_NONBLOCK)) {
/* on_snd_br_list guards against inserting the stream twice */
if (!stream->on_snd_br_list) {
stream->on_snd_br_list = TRUE;
TAILQ_INSERT_TAIL(&mtcp->snd_br_list, stream, sndvar->snd_br_link);
mtcp->snd_br_list_cnt++;
}
#endif
}
} else {
TRACE_EPOLL("Stream %d: Raising write without a socket!\n", stream->id);
}
}
/*---------------------------------------------------------------------------*/
/* Raise a close notification for the stream.
 * - Socket has MTCP_EPOLLRDHUP in its epoll mask: queue MTCP_EPOLLRDHUP.
 * - Else socket has MTCP_EPOLLIN: queue MTCP_EPOLLIN instead.
 * - Otherwise, only when BLOCKING_SUPPORT is enabled and the socket is not
 *   MTCP_NONBLOCK: put the stream on both the rcv_br_list and the
 *   snd_br_list, skipping any list it is already on.
 * - No socket attached: only a trace message is emitted. */
inline void
RaiseCloseEvent(mtcp_manager_t mtcp, tcp_stream *stream)
{
if (stream->socket) {
if (stream->socket->epoll & MTCP_EPOLLRDHUP) {
AddEpollEvent(mtcp->ep,
MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLRDHUP);
} else if (stream->socket->epoll & MTCP_EPOLLIN) {
AddEpollEvent(mtcp->ep,
MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLIN);
#if BLOCKING_SUPPORT
} else if (!(stream->socket->opts & MTCP_NONBLOCK)) {
//pthread_cond_signal(&stream->rcvvar->read_cond);
//pthread_cond_signal(&stream->sndvar->write_cond);
if (!stream->on_rcv_br_list) {
stream->on_rcv_br_list = TRUE;
TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link);
mtcp->rcv_br_list_cnt++;
}
if (!stream->on_snd_br_list) {
stream->on_snd_br_list = TRUE;
TAILQ_INSERT_TAIL(&mtcp->snd_br_list, stream, sndvar->snd_br_link);
mtcp->snd_br_list_cnt++;
}
#endif
}
} else {
TRACE_EPOLL("Stream %d: Raising close without a socket!\n", stream->id);
}
}
/*---------------------------------------------------------------------------*/
/* Raise an error notification for the stream.
 * - Socket has MTCP_EPOLLERR in its epoll mask: queue an MTCP_EPOLLERR event
 *   on mtcp->ep.
 * - Otherwise, only when BLOCKING_SUPPORT is enabled and the socket is not
 *   MTCP_NONBLOCK: put the stream on both the rcv_br_list and the
 *   snd_br_list, skipping any list it is already on.
 * - No socket attached: only a trace message is emitted. */
inline void
RaiseErrorEvent(mtcp_manager_t mtcp, tcp_stream *stream)
{
if (stream->socket) {
if (stream->socket->epoll & MTCP_EPOLLERR) {
AddEpollEvent(mtcp->ep,
MTCP_EVENT_QUEUE, stream->socket, MTCP_EPOLLERR);
#if BLOCKING_SUPPORT
} else if (!(stream->socket->opts & MTCP_NONBLOCK)) {
if (!stream->on_rcv_br_list) {
stream->on_rcv_br_list = TRUE;
TAILQ_INSERT_TAIL(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link);
mtcp->rcv_br_list_cnt++;
}
if (!stream->on_snd_br_list) {
stream->on_snd_br_list = TRUE;
TAILQ_INSERT_TAIL(&mtcp->snd_br_list, stream, sndvar->snd_br_link);
mtcp->snd_br_list_cnt++;
}
#endif
}
} else {
TRACE_EPOLL("Stream %d: Raising error without a socket!\n", stream->id);
}
}
/*---------------------------------------------------------------------------*/
/*
 * Allocate and initialise a TCP stream for the given 4-tuple.
 *
 * mtcp:         per-thread manager owning the pools and the flow table.
 * socket:       socket to bind to the stream, or NULL for none.
 * type:         stored in stream->stream_type.
 * saddr/sport:  source address and port, stored as given.
 * daddr/dport:  destination address and port, stored as given.
 *
 * The stream and its receive/send variable blocks are taken from the
 * manager's pools and the stream is inserted into the flow hash table, all
 * under ctx->flow_pool_lock. The stream starts in TCP_ST_LISTEN with a
 * rand()-derived initial send sequence number.
 *
 * Returns the new stream, or NULL on failure. Failures up to and including
 * the hash-table insertion return every chunk already taken from the pools.
 */
tcp_stream *
CreateTCPStream(mtcp_manager_t mtcp, socket_map_t socket, int type,
		uint32_t saddr, uint16_t sport, uint32_t daddr, uint16_t dport)
{
	tcp_stream *stream = NULL;
	int ret;
	uint8_t *sa;
	uint8_t *da;

	pthread_mutex_lock(&mtcp->ctx->flow_pool_lock);

	stream = (tcp_stream *)MPAllocateChunk(mtcp->flow_pool);
	if (!stream) {
		TRACE_ERROR("Cannot allocate memory for the stream. "
				"CONFIG.max_concurrency: %d, concurrent: %u\n",
				CONFIG.max_concurrency, mtcp->flow_cnt);
		pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock);
		return NULL;
	}
	memset(stream, 0, sizeof(tcp_stream));

	stream->rcvvar = (struct tcp_recv_vars *)MPAllocateChunk(mtcp->rv_pool);
	if (!stream->rcvvar) {
		MPFreeChunk(mtcp->flow_pool, stream);
		pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock);
		return NULL;
	}
	stream->sndvar = (struct tcp_send_vars *)MPAllocateChunk(mtcp->sv_pool);
	if (!stream->sndvar) {
		MPFreeChunk(mtcp->rv_pool, stream->rcvvar);
		MPFreeChunk(mtcp->flow_pool, stream);
		pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock);
		return NULL;
	}
	memset(stream->rcvvar, 0, sizeof(struct tcp_recv_vars));
	memset(stream->sndvar, 0, sizeof(struct tcp_send_vars));

	stream->id = mtcp->g_id++;
	stream->saddr = saddr;
	stream->sport = sport;
	stream->daddr = daddr;
	stream->dport = dport;

	ret = HTInsert(mtcp->tcp_flow_table, stream);
	if (ret < 0) {
		TRACE_ERROR("Stream %d: "
				"Failed to insert the stream into hash table.\n", stream->id);
		/* release the variable blocks too, not only the stream chunk */
		MPFreeChunk(mtcp->sv_pool, stream->sndvar);
		MPFreeChunk(mtcp->rv_pool, stream->rcvvar);
		MPFreeChunk(mtcp->flow_pool, stream);
		pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock);
		return NULL;
	}
	stream->on_hash_table = TRUE;
	mtcp->flow_cnt++;

	pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock);

	if (socket) {
		stream->socket = socket;
		socket->stream = stream;
	}

	stream->stream_type = type;
	stream->state = TCP_ST_LISTEN;

	/* -1 means the stream is not on any RTO list slot */
	stream->on_rto_idx = -1;

	stream->sndvar->ip_id = 0;
	stream->sndvar->mss = TCP_DEFAULT_MSS;
	stream->sndvar->wscale = TCP_DEFAULT_WSCALE;
	stream->sndvar->nif_out = GetOutputInterface(stream->daddr);

	stream->sndvar->iss = rand() % TCP_MAX_SEQ;
	//stream->sndvar->iss = 0;
	stream->rcvvar->irs = 0;

	stream->snd_nxt = stream->sndvar->iss;
	stream->sndvar->snd_una = stream->sndvar->iss;
	stream->sndvar->snd_wnd = CONFIG.sndbuf_size;
	stream->rcv_nxt = 0;
	stream->rcvvar->rcv_wnd = TCP_INITIAL_WINDOW;

	stream->rcvvar->snd_wl1 = stream->rcvvar->irs - 1;

	stream->sndvar->rto = TCP_INITIAL_RTO;

	/*
	 * NOTE(review): every "return NULL" below leaves the stream in the flow
	 * table and its chunks checked out of the pools; callers receive NULL
	 * and cannot undo that. Left unchanged here -- consider a shared
	 * teardown path.
	 */
#if BLOCKING_SUPPORT
	if (pthread_cond_init(&stream->rcvvar->read_cond, NULL)) {
		perror("pthread_cond_init of read_cond");
		return NULL;
	}
	if (pthread_cond_init(&stream->sndvar->write_cond, NULL)) {
		perror("pthread_cond_init of write_cond");
		return NULL;
	}
#endif

#if USE_SPIN_LOCK
	if (pthread_spin_init(&stream->rcvvar->read_lock, PTHREAD_PROCESS_PRIVATE)) {
#else
	if (pthread_mutex_init(&stream->rcvvar->read_lock, NULL)) {
#endif
		perror("pthread_mutex_init of read_lock");
#if BLOCKING_SUPPORT
		pthread_cond_destroy(&stream->rcvvar->read_cond);
		pthread_cond_destroy(&stream->sndvar->write_cond);
#endif
		return NULL;
	}
#if USE_SPIN_LOCK
	if (pthread_spin_init(&stream->sndvar->write_lock, PTHREAD_PROCESS_PRIVATE)) {
		perror("pthread_spin_init of write_lock");
		pthread_spin_destroy(&stream->rcvvar->read_lock);
#else
	if (pthread_mutex_init(&stream->sndvar->write_lock, NULL)) {
		perror("pthread_mutex_init of write_lock");
		pthread_mutex_destroy(&stream->rcvvar->read_lock);
#endif
#if BLOCKING_SUPPORT
		pthread_cond_destroy(&stream->rcvvar->read_cond);
		pthread_cond_destroy(&stream->sndvar->write_cond);
#endif
		return NULL;
	}

	sa = (uint8_t *)&stream->saddr;
	da = (uint8_t *)&stream->daddr;
	TRACE_STREAM("CREATED NEW TCP STREAM %d: "
			"%u.%u.%u.%u(%d) -> %u.%u.%u.%u(%d) (ISS: %u)\n", stream->id,
			sa[0], sa[1], sa[2], sa[3], ntohs(stream->sport),
			da[0], da[1], da[2], da[3], ntohs(stream->dport),
			stream->sndvar->iss);

	return stream;
}
/*---------------------------------------------------------------------------*/
/* Tear down a stream and return its resources.
 * In order: optional dump and statistics traces, removal from the control /
 * send / ACK / RTO / timewait / timeout lists (and the blocking lists when
 * BLOCKING_SUPPORT is on), destruction of the condition variables and buffer
 * locks, release of the send and receive buffers, removal from the flow hash
 * table and return of the stream's chunks to their pools (under
 * ctx->flow_pool_lock), and finally release of the bound local address if
 * the stream had one. */
void
DestroyTCPStream(mtcp_manager_t mtcp, tcp_stream *stream)
{
struct sockaddr_in addr;
int bound_addr = FALSE;
uint8_t *sa, *da;
int ret;
#ifdef DUMP_STREAM
if (stream->close_reason != TCP_ACTIVE_CLOSE &&
stream->close_reason != TCP_PASSIVE_CLOSE) {
thread_printf(mtcp, mtcp->log_fp,
"Stream %d abnormally closed.\n", stream->id);
DumpStream(mtcp, stream);
DumpControlList(mtcp, mtcp->n_sender[0]);
}
#endif
sa = (uint8_t *)&stream->saddr;
da = (uint8_t *)&stream->daddr;
TRACE_STREAM("DESTROY TCP STREAM %d: "
"%u.%u.%u.%u(%d) -> %u.%u.%u.%u(%d) (%s)\n", stream->id,
sa[0], sa[1], sa[2], sa[3], ntohs(stream->sport),
da[0], da[1], da[2], da[3], ntohs(stream->dport),
close_reason_str[stream->close_reason]);
if (stream->sndvar->sndbuf) {
TRACE_FSTAT("Stream %d: send buffer "
"cum_len: %lu, len: %u\n", stream->id,
stream->sndvar->sndbuf->cum_len,
stream->sndvar->sndbuf->len);
}
if (stream->rcvvar->rcvbuf) {
TRACE_FSTAT("Stream %d: recv buffer "
"cum_len: %lu, merged_len: %u, last_len: %u\n", stream->id,
stream->rcvvar->rcvbuf->cum_len,
stream->rcvvar->rcvbuf->merged_len,
stream->rcvvar->rcvbuf->last_len);
}
#if RTM_STAT
/* Triple duplicated ack stats */
if (stream->sndvar->rstat.tdp_ack_cnt) {
TRACE_FSTAT("Stream %d: triple duplicated ack: %u, "
"retransmission bytes: %u, average rtm bytes/ack: %u\n",
stream->id,
stream->sndvar->rstat.tdp_ack_cnt, stream->sndvar->rstat.tdp_ack_bytes,
stream->sndvar->rstat.tdp_ack_bytes / stream->sndvar->rstat.tdp_ack_cnt);
}
/* Retransmission timeout stats */
if (stream->sndvar->rstat.rto_cnt > 0) {
TRACE_FSTAT("Stream %d: timeout count: %u, bytes: %u\n", stream->id,
stream->sndvar->rstat.rto_cnt, stream->sndvar->rstat.rto_bytes);
}
/* Recovery stats */
if (stream->sndvar->rstat.ack_upd_cnt) {
TRACE_FSTAT("Stream %d: snd_nxt update count: %u, "
"snd_nxt update bytes: %u, average update bytes/update: %u\n",
stream->id,
stream->sndvar->rstat.ack_upd_cnt, stream->sndvar->rstat.ack_upd_bytes,
stream->sndvar->rstat.ack_upd_bytes / stream->sndvar->rstat.ack_upd_cnt);
}
#if TCP_OPT_SACK_ENABLED
/* the averages are only printed when the divisor is non-zero */
if (stream->sndvar->rstat.sack_cnt) {
TRACE_FSTAT("Selective ack count: %u, bytes: %u, "
"average bytes/ack: %u\n",
stream->sndvar->rstat.sack_cnt, stream->sndvar->rstat.sack_bytes,
stream->sndvar->rstat.sack_bytes / stream->sndvar->rstat.sack_cnt);
} else {
TRACE_FSTAT("Selective ack count: %u, bytes: %u\n",
stream->sndvar->rstat.sack_cnt, stream->sndvar->rstat.sack_bytes);
}
if (stream->sndvar->rstat.tdp_sack_cnt) {
TRACE_FSTAT("Selective tdp ack count: %u, bytes: %u, "
"average bytes/ack: %u\n",
stream->sndvar->rstat.tdp_sack_cnt, stream->sndvar->rstat.tdp_sack_bytes,
stream->sndvar->rstat.tdp_sack_bytes / stream->sndvar->rstat.tdp_sack_cnt);
} else {
/* NOTE(review): this label says "Selective ack" although the tdp_sack
 * counters are printed -- looks like a copy-paste slip; confirm. */
TRACE_FSTAT("Selective ack count: %u, bytes: %u\n",
stream->sndvar->rstat.tdp_sack_cnt, stream->sndvar->rstat.tdp_sack_bytes);
}
#endif /* TCP_OPT_SACK_ENABLED */
#endif /* RTM_STAT */
/* copy the bound address now; the stream is returned to its pool before
 * FreeAddress() is called */
if (stream->is_bound_addr) {
bound_addr = TRUE;
addr.sin_addr.s_addr = stream->saddr;
addr.sin_port = stream->sport;
}
/* detach the stream from every list it may still be on */
RemoveFromControlList(mtcp, stream);
RemoveFromSendList(mtcp, stream);
RemoveFromACKList(mtcp, stream);
if (stream->on_rto_idx >= 0)
RemoveFromRTOList(mtcp, stream);
if (stream->on_timewait_list)
RemoveFromTimewaitList(mtcp, stream);
if (CONFIG.tcp_timeout > 0)
RemoveFromTimeoutList(mtcp, stream);
#if BLOCKING_SUPPORT
if (stream->on_snd_br_list) {
stream->on_snd_br_list = FALSE;
TAILQ_REMOVE(&mtcp->snd_br_list, stream, sndvar->snd_br_link);
mtcp->snd_br_list_cnt--;
}
if (stream->on_rcv_br_list) {
stream->on_rcv_br_list = FALSE;
TAILQ_REMOVE(&mtcp->rcv_br_list, stream, rcvvar->rcv_br_link);
mtcp->rcv_br_list_cnt--;
}
/* signal both condition variables before they are destroyed */
if (!stream->epoll) {
pthread_cond_signal(&stream->rcvvar->read_cond);
pthread_cond_signal(&stream->sndvar->write_cond);
}
if (pthread_cond_destroy(&stream->rcvvar->read_cond)) {
perror("pthread_cond_destroy of read_cond");
}
if (pthread_cond_destroy(&stream->sndvar->write_cond)) {
perror("pthread_cond_destroy of write_cond");
}
#endif
SBUF_LOCK_DESTROY(&stream->rcvvar->read_lock);
SBUF_LOCK_DESTROY(&stream->sndvar->write_lock);
assert(stream->on_hash_table == TRUE);
/* free ring buffers */
if (stream->sndvar->sndbuf) {
SBFree(mtcp->rbm_snd, stream->sndvar->sndbuf);
stream->sndvar->sndbuf = NULL;
}
if (stream->rcvvar->rcvbuf) {
RBFree(mtcp->rbm_rcv, stream->rcvvar->rcvbuf);
stream->rcvvar->rcvbuf = NULL;
}
pthread_mutex_lock(&mtcp->ctx->flow_pool_lock);
/* remove from flow hash table */
HTRemove(mtcp->tcp_flow_table, stream);
stream->on_hash_table = FALSE;
mtcp->flow_cnt--;
/* return the variable blocks and the stream itself to their pools */
MPFreeChunk(mtcp->rv_pool, stream->rcvvar);
MPFreeChunk(mtcp->sv_pool, stream->sndvar);
MPFreeChunk(mtcp->flow_pool, stream);
pthread_mutex_unlock(&mtcp->ctx->flow_pool_lock);
if (bound_addr) {
/* use the manager's own address pool if it has one, else the global ap */
if (mtcp->ap) {
ret = FreeAddress(mtcp->ap, &addr);
} else {
ret = FreeAddress(ap, &addr);
}
if (ret < 0) {
TRACE_ERROR("(NEVER HAPPEN) Failed to free address.\n");
}
}
#ifdef NETSTAT
#if NETSTAT_PERTHREAD
TRACE_STREAM("Destroyed. Remaining flows: %u\n", mtcp->flow_cnt);
#endif /* NETSTAT_PERTHREAD */
#endif /* NETSTAT */
}
/*---------------------------------------------------------------------------*/
/* Write a human-readable snapshot of the stream to mtcp->log_fp through
 * thread_printf(): the 4-tuple, state and close reason, the attached socket
 * (if any) with its epoll masks, list/queue membership flags, the send-side
 * variables and send buffer, and the receive-side variables and receive
 * buffer. The stream itself is not modified. */
void
DumpStream(mtcp_manager_t mtcp, tcp_stream *stream)
{
uint8_t *sa, *da;
struct tcp_send_vars *sndvar = stream->sndvar;
struct tcp_recv_vars *rcvvar = stream->rcvvar;
sa = (uint8_t *)&stream->saddr;
da = (uint8_t *)&stream->daddr;
thread_printf(mtcp, mtcp->log_fp, "========== Stream %u: "
"%u.%u.%u.%u(%u) -> %u.%u.%u.%u(%u) ==========\n", stream->id,
sa[0], sa[1], sa[2], sa[3], ntohs(stream->sport),
da[0], da[1], da[2], da[3], ntohs(stream->dport));
thread_printf(mtcp, mtcp->log_fp,
"Stream id: %u, type: %u, state: %s, close_reason: %s\n",
stream->id, stream->stream_type,
TCPStateToString(stream), close_reason_str[stream->close_reason]);
if (stream->socket) {
socket_map_t socket = stream->socket;
/* the per-flag columns print the masked bit value, not 0/1 */
thread_printf(mtcp, mtcp->log_fp, "Socket id: %d, type: %d, opts: %u\n"
"epoll: %u (IN: %u, OUT: %u, ERR: %u, RDHUP: %u, ET: %u)\n"
"events: %u (IN: %u, OUT: %u, ERR: %u, RDHUP: %u, ET: %u)\n",
socket->id, socket->socktype, socket->opts,
socket->epoll, socket->epoll & MTCP_EPOLLIN,
socket->epoll & MTCP_EPOLLOUT, socket->epoll & MTCP_EPOLLERR,
socket->epoll & MTCP_EPOLLRDHUP, socket->epoll & MTCP_EPOLLET,
socket->events, socket->events & MTCP_EPOLLIN,
socket->events & MTCP_EPOLLOUT, socket->events & MTCP_EPOLLERR,
socket->events & MTCP_EPOLLRDHUP, socket->events & MTCP_EPOLLET);
} else {
thread_printf(mtcp, mtcp->log_fp, "Socket: (null)\n");
}
/* list and queue membership flags */
thread_printf(mtcp, mtcp->log_fp,
"on_hash_table: %u, on_control_list: %u (wait: %u), on_send_list: %u, "
"on_ack_list: %u, is_wack: %u, ack_cnt: %u\n"
"on_rto_idx: %d, on_timewait_list: %u, on_timeout_list: %u, "
"on_rcv_br_list: %u, on_snd_br_list: %u\n"
"on_sendq: %u, on_ackq: %u, closed: %u, on_closeq: %u, "
"on_closeq_int: %u, on_resetq: %u, on_resetq_int: %u\n"
"have_reset: %u, is_fin_sent: %u, is_fin_ackd: %u, "
"saw_timestamp: %u, sack_permit: %u, "
"is_bound_addr: %u, need_wnd_adv: %u\n", stream->on_hash_table,
sndvar->on_control_list, stream->control_list_waiting, sndvar->on_send_list,
sndvar->on_ack_list, sndvar->is_wack, sndvar->ack_cnt,
stream->on_rto_idx, stream->on_timewait_list, stream->on_timeout_list,
stream->on_rcv_br_list, stream->on_snd_br_list,
sndvar->on_sendq, sndvar->on_ackq,
stream->closed, sndvar->on_closeq, sndvar->on_closeq_int,
sndvar->on_resetq, sndvar->on_resetq_int,
stream->have_reset, sndvar->is_fin_sent,
sndvar->is_fin_ackd, stream->saw_timestamp, stream->sack_permit,
stream->is_bound_addr, stream->need_wnd_adv);
thread_printf(mtcp, mtcp->log_fp, "========== Send variables ==========\n");
thread_printf(mtcp, mtcp->log_fp,
"ip_id: %u, mss: %u, eff_mss: %u, wscale: %u, nif_out: %d\n",
sndvar->ip_id, sndvar->mss, sndvar->eff_mss,
sndvar->wscale, sndvar->nif_out);
thread_printf(mtcp, mtcp->log_fp,
"snd_nxt: %u, snd_una: %u, iss: %u, fss: %u\nsnd_wnd: %u, "
"peer_wnd: %u, cwnd: %u, ssthresh: %u\n",
stream->snd_nxt, sndvar->snd_una, sndvar->iss, sndvar->fss,
sndvar->snd_wnd, sndvar->peer_wnd, sndvar->cwnd, sndvar->ssthresh);
if (sndvar->sndbuf) {
thread_printf(mtcp, mtcp->log_fp,
"Send buffer: init_seq: %u, head_seq: %u, "
"len: %d, cum_len: %lu, size: %d\n",
sndvar->sndbuf->init_seq, sndvar->sndbuf->head_seq,
sndvar->sndbuf->len, sndvar->sndbuf->cum_len, sndvar->sndbuf->size);
} else {
thread_printf(mtcp, mtcp->log_fp, "Send buffer: (null)\n");
}
thread_printf(mtcp, mtcp->log_fp,
"nrtx: %u, max_nrtx: %u, rto: %u, ts_rto: %u, "
"ts_lastack_sent: %u\n", sndvar->nrtx, sndvar->max_nrtx,
sndvar->rto, sndvar->ts_rto, sndvar->ts_lastack_sent);
thread_printf(mtcp, mtcp->log_fp,
"========== Receive variables ==========\n");
thread_printf(mtcp, mtcp->log_fp,
"rcv_nxt: %u, irs: %u, rcv_wnd: %u, "
"snd_wl1: %u, snd_wl2: %u\n",
stream->rcv_nxt, rcvvar->irs,
rcvvar->rcv_wnd, rcvvar->snd_wl1, rcvvar->snd_wl2);
if (rcvvar->rcvbuf) {
thread_printf(mtcp, mtcp->log_fp,
"Receive buffer: init_seq: %u, head_seq: %u, "
"merged_len: %d, cum_len: %lu, last_len: %d, size: %d\n",
rcvvar->rcvbuf->init_seq, rcvvar->rcvbuf->head_seq,
rcvvar->rcvbuf->merged_len, rcvvar->rcvbuf->cum_len,
rcvvar->rcvbuf->last_len, rcvvar->rcvbuf->size);
} else {
thread_printf(mtcp, mtcp->log_fp, "Receive buffer: (null)\n");
}
thread_printf(mtcp, mtcp->log_fp, "last_ack_seq: %u, dup_acks: %u\n",
rcvvar->last_ack_seq, rcvvar->dup_acks);
thread_printf(mtcp, mtcp->log_fp,
"ts_recent: %u, ts_lastack_rcvd: %u, ts_last_ts_upd: %u, "
"ts_tw_expire: %u\n", rcvvar->ts_recent, rcvvar->ts_lastack_rcvd,
rcvvar->ts_last_ts_upd, rcvvar->ts_tw_expire);
thread_printf(mtcp, mtcp->log_fp,
"srtt: %u, mdev: %u, mdev_max: %u, rttvar: %u, rtt_seq: %u\n",
rcvvar->srtt, rcvvar->mdev, rcvvar->mdev_max,
rcvvar->rttvar, rcvvar->rtt_seq);
}

214
mtcp/src/tcp_stream_queue.c Normal file
View File

@ -0,0 +1,214 @@
/*
* TCP stream queue - tcp_stream_queue.c/h
*
* EunYoung Jeong
*
* Part of this code borrows Click's simple queue implementation
*
* ============================== Click License =============================
*
* Copyright (c) 1999-2000 Massachusetts Institute of Technology
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, subject to the conditions
* listed in the Click LICENSE file. These conditions include: you must
* preserve this copyright notice, and you cannot mention the copyright
* holders in advertising related to the Software without their permission.
* The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
* notice is a summary of the Click LICENSE file; the license in that file is
* legally binding.
*/
#include <stdio.h>
#include <stdlib.h>
#include "tcp_stream_queue.h"
#include "debug.h"
/* Index types used by the stream queue below. The _INDEX_TYPE_ guard skips
 * the typedefs when the macro has already been defined earlier in the
 * translation unit. */
#ifndef _INDEX_TYPE_
#define _INDEX_TYPE_
typedef uint32_t index_type;
typedef int32_t signed_index_type;
#endif
/*---------------------------------------------------------------------------*/
/* Circular queue of tcp_stream pointers.
 * CreateStreamQueue() allocates _capacity + 1 slots. The queue is empty when
 * _head == _tail, and StreamEnqueue() refuses to move _tail onto _head, so
 * at most _capacity entries are held at once. */
struct stream_queue
{
index_type _capacity; /* largest valid slot index */
volatile index_type _head; /* next slot to read; advanced by StreamDequeue() */
volatile index_type _tail; /* next slot to write; advanced by StreamEnqueue() */
struct tcp_stream * volatile * _q; /* slot array */
};
/*----------------------------------------------------------------------------*/
/*
 * Allocate an empty counted stream queue with room for `size` entries.
 * Returns the queue, or NULL when either allocation fails.
 */
stream_queue_int *
CreateInternalStreamQueue(int size)
{
	stream_queue_int *queue = calloc(1, sizeof(*queue));

	if (queue == NULL) {
		return NULL;
	}

	queue->array = calloc(size, sizeof(tcp_stream *));
	if (queue->array == NULL) {
		free(queue);
		return NULL;
	}

	queue->size = size;
	queue->last = 0;
	queue->first = 0;
	queue->count = 0;

	return queue;
}
/*----------------------------------------------------------------------------*/
/*
 * Release a queue created by CreateInternalStreamQueue(). A NULL queue is
 * ignored. The streams referenced by the slots are not touched.
 */
void
DestroyInternalStreamQueue(stream_queue_int *sq)
{
	if (sq != NULL) {
		if (sq->array != NULL) {
			free(sq->array);
			sq->array = NULL;
		}

		free(sq);
	}
}
/*----------------------------------------------------------------------------*/
/*
 * Append a stream to the counted queue.
 * Returns 0 on success, or -1 (after a TRACE_INFO warning) when the queue
 * already holds `size` entries.
 */
int
StreamInternalEnqueue(stream_queue_int *sq, struct tcp_stream *stream)
{
	if (sq->count < sq->size) {
		sq->array[sq->last] = stream;
		sq->last++;
		sq->count++;

		/* wrap the write position at the end of the array */
		if (sq->last >= sq->size) {
			sq->last = 0;
		}
		assert (sq->count <= sq->size);

		return 0;
	}

	/* queue is full */
	TRACE_INFO("[WARNING] Queue overflow. Set larger queue size! "
			"count: %d, size: %d\n", sq->count, sq->size);
	return -1;
}
/*----------------------------------------------------------------------------*/
/*
 * Remove and return the oldest stream in the counted queue.
 * Returns NULL when the queue is empty. A dequeued entry is asserted to be
 * non-NULL.
 */
struct tcp_stream *
StreamInternalDequeue(stream_queue_int *sq)
{
	struct tcp_stream *oldest;

	if (sq->count <= 0) {
		return NULL;
	}

	oldest = sq->array[sq->first];
	sq->first++;
	assert(oldest != NULL);

	/* wrap the read position at the end of the array */
	if (sq->first >= sq->size) {
		sq->first = 0;
	}

	sq->count--;
	assert(sq->count >= 0);

	return oldest;
}
/*---------------------------------------------------------------------------*/
/* Return the slot index following i, wrapping from _capacity back to 0. */
static inline index_type
NextIndex(stream_queue_t sq, index_type i)
{
	if (i == sq->_capacity) {
		return 0;
	}

	return i + 1;
}
/*---------------------------------------------------------------------------*/
/* Return the slot index preceding i, wrapping from 0 back to _capacity. */
static inline index_type
PrevIndex(stream_queue_t sq, index_type i)
{
	if (i == 0) {
		return sq->_capacity;
	}

	return i - 1;
}
/*---------------------------------------------------------------------------*/
/* Return 1 when the queue holds no entries (_head equals _tail), else 0. */
int
StreamQueueIsEmpty(stream_queue_t sq)
{
	if (sq->_head != sq->_tail) {
		return 0;
	}

	return 1;
}
/*---------------------------------------------------------------------------*/
/* Ordering hint placed between the slot access and the index update in
 * StreamEnqueue()/StreamDequeue(). The asm template is empty, and both
 * arguments are by-value copies listed as "m" inputs.
 * NOTE(review): there is no "memory" clobber and no hardware fence here --
 * confirm this is sufficient for the intended cross-thread use. */
static inline void
StreamMemoryBarrier(tcp_stream * volatile stream, volatile index_type index)
{
__asm__ volatile("" : : "m" (stream), "m" (index));
}
/*---------------------------------------------------------------------------*/
stream_queue_t
CreateStreamQueue(int capacity)
{
stream_queue_t sq;
sq = (stream_queue_t)calloc(1, sizeof(struct stream_queue));
if (!sq)
return NULL;
sq->_q = (tcp_stream **)calloc(capacity + 1, sizeof(tcp_stream *));
if (!sq->_q) {
free(sq);
return NULL;
}
sq->_capacity = capacity;
sq->_head = sq->_tail = 0;
return sq;
}
/*---------------------------------------------------------------------------*/
/*
 * Release a queue created by CreateStreamQueue(). A NULL queue is ignored.
 * Only the slot array and the queue header are freed; the streams the slots
 * point to are not touched.
 */
void
DestroyStreamQueue(stream_queue_t sq)
{
	if (sq != NULL) {
		if (sq->_q != NULL) {
			free((void *)sq->_q);
			sq->_q = NULL;
		}

		free(sq);
	}
}
/*---------------------------------------------------------------------------*/
/* Append stream at the tail of the queue.
 * Returns 0 on success. Returns -1, after logging through TRACE_ERROR, when
 * advancing the tail would make it equal to the head (queue full).
 * NOTE(review): looks designed for one enqueuing and one dequeuing thread --
 * confirm against the callers. */
int
StreamEnqueue(stream_queue_t sq, tcp_stream *stream)
{
index_type h = sq->_head;
index_type t = sq->_tail;
index_type nt = NextIndex(sq, t);
if (nt != h) {
/* fill the slot first, then publish it by moving _tail */
sq->_q[t] = stream;
StreamMemoryBarrier(sq->_q[t], sq->_tail);
sq->_tail = nt;
return 0;
}
TRACE_ERROR("Exceed capacity of stream queue!\n");
return -1;
}
/*---------------------------------------------------------------------------*/
/* Remove and return the stream at the head of the queue.
 * Returns NULL when the queue is empty (_head == _tail). A dequeued entry is
 * asserted to be non-NULL. */
tcp_stream *
StreamDequeue(stream_queue_t sq)
{
index_type h = sq->_head;
index_type t = sq->_tail;
if (h != t) {
/* read the slot first, then release it by moving _head */
tcp_stream *stream = sq->_q[h];
StreamMemoryBarrier(sq->_q[h], sq->_head);
sq->_head = NextIndex(sq, h);
assert(stream);
return stream;
}
return NULL;
}
/*---------------------------------------------------------------------------*/

Some files were not shown because too many files have changed in this diff Show More