From 529b2a4458161aa125a5c7ea68c0f1c878207e44 Mon Sep 17 00:00:00 2001 From: "Johann George (QLogic)" Date: Mon, 24 Sep 2007 11:11:24 -0700 Subject: [PATCH] Initial commit --- COPYING | 339 +++++++ Makefile | 18 + configure | 55 + help.txt | 708 +++++++++++++ ib.c | 2231 ++++++++++++++++++++++++++++++++++++++++ ip.c | 837 +++++++++++++++ mkhelp | 113 +++ qperf.c | 2913 +++++++++++++++++++++++++++++++++++++++++++++++++++++ qperf.h | 316 ++++++ 9 files changed, 7530 insertions(+) create mode 100644 COPYING create mode 100644 Makefile create mode 100755 configure create mode 100644 help.txt create mode 100644 ib.c create mode 100644 ip.c create mode 100755 mkhelp create mode 100644 qperf.c create mode 100644 qperf.h diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d511905 --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4fe250b --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +CC = gcc +CFLAGS = -Wall -O -DRDMA + +all: qperf + +qperf: qperf.o ip.o ib.o help.o + $(CC) -DRDMA -o $@ $^ -libverbs + +help.c: help.txt + ./mkhelp RDMA + +.PHONY: clean +clean: + rm -f *.o help.c qperf + +.PHONY: install +install: + cp qperf /usr/local/bin diff --git a/configure b/configure new file mode 100755 index 0000000..b983a08 --- /dev/null +++ b/configure @@ -0,0 +1,55 @@ +#!/bin/sh +# +LIBIBV=libibverbs.so.1 + +# Look for InfiniBand verbs library. +# +for DIR in /lib /usr/lib64 /usr/lib /usr/local/lib /usr/local/ibed/lib64 ""; do + [ -e $DIR/$LIBIBV ] && + break +done + + +# Set up parameters. +# +if [ "`uname -m`" = ppc64 ]; then + K_M64=-m64 + echo "Making PowerPC version" +fi +if [ -z "$DIR" ]; then + echo "Could not find $LIBIBV; making non-RDMA version" +else + echo "Found $LIBIBV in $DIR" + K_IB_O="ib.o" + K_DEF_IB="-DRDMA" + if [ "$DIR" = "/usr/lib64" ]; then + K_LIBS="-libverbs" + else + K_LIBS="-lsysfs -Wl,--rpath -Wl,$DIR $DIR/$LIBIBV" + fi + K_IB=RDMA +fi + + +# Produce Makefile +# +cat <<EOF >Makefile +CC = gcc $K_M64 +CFLAGS = -Wall -O $K_DEF_IB + +all: qperf + +qperf: qperf.o ip.o $K_IB_O help.o + \$(CC) $K_DEF_IB -o \$@ \$^ $K_LIBS + +help.c: help.txt + ./mkhelp $K_IB + +.PHONY: clean +clean: + rm -f *.o help.c qperf + +.PHONY: install +install: + cp qperf /usr/local/bin +EOF diff --git a/help.txt b/help.txt new file mode 100644 index 0000000..d253b68 --- /dev/null +++ b/help.txt @@ -0,0 +1,708 @@ +Main + Usage: + qperf [options] ... + Description + One typically runs qperf with no arguments on the server machine. On a + client machine, one specifies the hostname of the server machine + followed by a list of tests that might be run.
+ More Information + qperf --help examples Some examples of using qperf + qperf --help opts Summary of options + qperf --help options Description of options + qperf --help tests Short summary and description of tests + qperf --help <Test> More information on a particular test +Opts + --access_recv Mode (-Ar) Access receive data + --affinity PN (-a) Set processor affinity + --loc_affinity PN (-la) Set local processor affinity + --rem_affinity PN (-ra) Set remote processor affinity + --flip Mode (-f) Flip sender and receiver + --help Topic (-h) Get more information on a topic + --host Node (-H) Identify server node + --id Device:Port (-i) Set IB device and port + --loc_id Device:Port (-li) Set local IB device and port + --rem_id Device:Port (-ri) Set remote IB device and port + --listen_port Port (-lp) Set server listen port + --msg_size Size (-m) Set message size + --mtu_size Size (-M) Set MTU size (IB only) + --no_msgs Count (-n) Send Count messages + --poll Mode (-P) Set polling mode on/off + --loc_poll Mode (-lP) Set local polling mode on/off + --rem_poll Mode (-rP) Set remote polling mode on/off + --port Port (-p) Set TCP port used for tests + --precision Digits (-e) Set precision reported + --rate (-r) Set IB static rate + --loc_rate (-lr) Set local IB static rate + --rem_rate (-rr) Set remote IB static rate + --rd_atomic Count (-R) Set RDMA read/atomic count + --loc_rd_atomic Count (-lR) Set local RDMA read/atomic count + --rem_rd_atomic Count (-rR) Set remote RDMA read/atomic count + --sock_buf_size Size (-S) Set socket buffer size + --loc_sock_buf_size Size (-lS) Set local socket buffer size + --rem_sock_buf_size Size (-rS) Set remote socket buffer size + --time (-t) Set test duration + --timeout Time (-T) Set timeout + --loc_timeout Time (-lT) Set local timeout + --rem_timeout Time (-rT) Set remote timeout + --unify_nodes (-U) Unify nodes + --unify_units (-u) Unify units + --verbose (-v) Verbose; turn on all of -v[cstu] + --verbose_conf (-vc) Show configuration
information + --verbose_stat (-vs) Show statistical information + --verbose_time (-vt) Show timing information + --verbose_used (-vu) Show information on parameters + --verbose_more (-vv) More verbose; turn on all of -v[CSTU] + --verbose_more_conf (-vC) Show more configuration information + --verbose_more_stat (-vS) Show more statistical information + --verbose_more_time (-vT) Show more timing information + --verbose_more_used (-vU) Show more information on parameters + --version (-V) Print out version + --wait Time (-W) Set time to wait for server +Options + --access_recv Mode (-Ar) + If Mode is non-zero, data is accessed once received. Otherwise, data + is ignored. By default, Mode is 0. + --affinity PN (-a) + Set processor affinity to PN. Processors are numbered sequentially + from 0. If PN is "any", any processor is allowed; otherwise the + processor is limited to the one specified. + --loc_affinity PN (-la) + Set local processor affinity to PN. + --rem_affinity PN (-ra) + Set remote processor affinity to PN. + --flip Mode (-f) + Cause sender and receiver to play opposite roles. + --help Topic (-h) + Print out information about Topic. To see the list of topics, type + qperf --help + --host Host (-H) + Run test between the current node and the qperf running on node Host. + This can also be specified as the first non-option argument. + --id Device:Port (-i) + Use InfiniBand Device and Port. + --loc_id Device:Port (-li) + Use local InfiniBand Device and Port. + --rem_id Device:Port (-ri) + Use remote InfiniBand Device and Port. + --listen_port Port (-lp) + Set the port we listen on to Port. This must be set to the + same port on both the server and client machines. The default value + is 19765. + --msg_size Size (-m) + Set the message size to Size. The default value varies by test.
It + is assumed that the value is specified in bytes; however, a trailing + kib or K, mib or M, or gib or G indicates that the size is being + specified in kibibytes, mebibytes or gibibytes respectively, while a + trailing kb or k, mb or m, or gb or g indicates kilobytes, megabytes + or gigabytes respectively. + --mtu_size Size (-M) + Set the MTU size. Only relevant to the RDMA UC/RC tests. Units are + specified in the same manner as the --msg_size option. + --no_msgs Count (-n) + Set test duration by number of messages sent instead of time. + --poll Mode (-P) + Turn polling mode on or off. This is only relevant to the RDMA tests + and determines whether they poll or wait. If Mode is 0, they wait; + otherwise they poll. + --loc_poll Mode (-lP) + Locally turn polling mode on or off. + --rem_poll Mode (-rP) + Remotely turn polling mode on or off. + --port Port (-p) + Use Port to run the socket tests. This is different from + --listen_port which is used for synchronization. This is only + relevant for the socket tests and refers to the TCP/UDP/SDP/RDS port + that the test is run on. + --precision Digits (-e) + Set the number of significant digits that are used to report results. + --rate Rate (-r) + Force InfiniBand static rate. Rate can be one of: 2.5, 5, 10, 20, + 30, 40, 60, 80, 120, 1xSDR (2.5 Gbps), 1xDDR (5 Gbps), 1xQDR (10 + Gbps), 4xSDR (10 Gbps), 4xDDR (20 Gbps), 4xQDR (40 Gbps), 8xSDR (20 + Gbps), 8xDDR (40 Gbps), 8xQDR (80 Gbps). + --loc_rate (-lr) + Force local InfiniBand static rate. + --rem_rate (-rr) + Force remote InfiniBand static rate. + --rd_atomic Count (-R) + Set the number of in-flight operations that can be handled for an RDMA + read or atomic operation. This is only relevant to the RDMA tests. + --loc_rd_atomic Count (-lR) + Set local read/atomic count. + --rem_rd_atomic Count (-rR) + Set remote read/atomic count. + --sock_buf_size Size (-S) + Set the socket buffer size. This is only relevant to the socket + tests.
+ --loc_sock_buf_size Size (-lS) + Set local socket buffer size. + --rem_sock_buf_size Size (-rS) + Set remote socket buffer size. + --time Time (-t) + Set test duration to Time. Specified in seconds; however, a trailing + m, h or d indicates that the time is specified in minutes, hours or + days respectively. + --timeout Time (-T) + Set timeout to Time. This is the timeout used for various things + such as exchanging messages. The default is 5 seconds. + --loc_timeout Time (-lT) + Set local timeout to Time. + --rem_timeout Time (-rT) + Set remote timeout to Time. + --unify_nodes (-U) + Unify the nodes. Describe them in terms of local and remote rather + than send and receive. + --unify_units (-u) + Unify the units that results are shown in. Uses the lowest common + denominator. Helpful for scripts. + --verbose (-v) + Provide more detailed output. Turns on -vc, -vs, -vt and -vu. + --verbose_conf (-vc) + Provide information on configuration. + --verbose_stat (-vs) + Provide information on statistics. + --verbose_time (-vt) + Provide information on timing. + --verbose_used (-vu) + Provide information on parameters used. + --verbose_more (-vv) + Provide even more detailed output. Turns on -vC, -vS, -vT and -vU. + --verbose_more_conf (-vC) + Provide more information on configuration. + --verbose_more_stat (-vS) + Provide more information on statistics. + --verbose_more_time (-vT) + Provide more information on timing. + --verbose_more_used (-vU) + Provide more information on parameters used. + --version (-V) + The current version of qperf is printed. + --wait Time (-W) + If the server is not ready, continue to try connecting for Time + seconds before giving up. +Examples + For these examples, we assume that qperf is running on a machine called + myserver in server mode. To run qperf in server mode, run it with no + arguments. In all the subsequent examples, we run qperf on another machine + and connect to the server which we assume has a hostname of myserver.
+ * To run a TCP bandwidth and latency test: + qperf myserver tcp_bw tcp_lat + * To run an SDP bandwidth test for 10 seconds: + qperf -t 10 myserver sdp_bw + * To run a UDP latency test and then cause the server to terminate: + qperf myserver udp_lat quit + * To measure the RDMA UD latency and bandwidth: + qperf myserver ud_lat ud_bw + * To measure RDMA UC bi-directional bandwidth: + qperf myserver uc_bi_bw +Tests -RDMA + Miscellaneous + conf Show configuration + quit Cause the server to quit + Socket Based + tcp_bw TCP streaming one way bandwidth + tcp_lat TCP one way latency + udp_bw UDP streaming one way bandwidth + udp_lat UDP one way latency + sdp_bw SDP streaming one way bandwidth + sdp_lat SDP one way latency + rds_bw RDS streaming one way bandwidth + rds_lat RDS one way latency +Tests +RDMA + Miscellaneous + conf Show configuration + quit Cause the server to quit + Socket Based + tcp_bw TCP streaming one way bandwidth + tcp_lat TCP one way latency + udp_bw UDP streaming one way bandwidth + udp_lat UDP one way latency + sdp_bw SDP streaming one way bandwidth + sdp_lat SDP one way latency + rds_bw RDS streaming one way bandwidth + rds_lat RDS one way latency + RDMA Send/Receive + ud_bw UD streaming one way bandwidth + ud_bi_bw UD streaming two way bandwidth + ud_lat UD one way latency + rc_bw RC streaming one way bandwidth + rc_bi_bw RC streaming two way bandwidth + rc_lat RC one way latency + uc_bw UC streaming one way bandwidth + uc_bi_bw UC streaming two way bandwidth + uc_lat UC one way latency + RDMA + rc_rdma_read_bw RC RDMA read streaming one way bandwidth + rc_rdma_read_lat RC RDMA read one way latency + rc_rdma_write_bw RC RDMA write streaming one way bandwidth + rc_rdma_write_lat RC RDMA write one way latency + rc_rdma_write_poll_lat RC RDMA write one way polling latency + uc_rdma_write_bw UC RDMA write streaming one way bandwidth + uc_rdma_write_lat UC RDMA write one way latency + uc_rdma_write_poll_lat UC RDMA write one way polling latency +
InfiniBand Atomics + rc_compare_swap_mr RC compare and swap messaging rate + rc_fetch_add_mr RC fetch and add messaging rate + Verification + ver_rc_compare_swap verify RC compare and swap + ver_rc_fetch_add verify RC fetch and add +conf + Purpose + Show configuration + Common Options + None + Description + Shows the node name, CPUs and OS of both nodes being used. +quit + Purpose + Quit + Common Options + None + Description + Causes the server to quit. +tcp_bw + Purpose + TCP streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. +tcp_lat + Purpose + TCP one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using TCP sockets. +udp_bw + Purpose + UDP streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. 
+udp_lat + Purpose + UDP one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using UDP sockets. +sdp_bw + Purpose + SDP streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. +sdp_lat + Purpose + SDP one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using SDP sockets. +rds_bw + Purpose + RDS streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. 
+rds_lat + Purpose + RDS one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using RDS sockets. +ud_bw +RDMA + Purpose + UD streaming one way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client sends messages to the server who notes how many it received. + The UD Send/Receive mechanism is used. +ud_bi_bw +RDMA + Purpose + UD streaming two way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Both the client and server exchange messages with each other using the + UD Send/Receive mechanism and note how many were received. 
+ud_lat +RDMA + Purpose + UD one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using UD Send/Receive. +rc_bw +RDMA + Purpose + RC streaming one way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client sends messages to the server who notes how many it received. + The RC Send/Receive mechanism is used. +rc_bi_bw +RDMA + Purpose + RC streaming two way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Both the client and server exchange messages with each other using the + RC Send/Receive mechanism and note how many were received. 
+rc_lat +RDMA + Purpose + RC one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using RC Send/Receive. +uc_bw +RDMA + Purpose + UC streaming one way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client sends messages to the server who notes how many it received. + The UC Send/Receive mechanism is used. +uc_bi_bw +RDMA + Purpose + UC streaming two way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Both the client and server exchange messages with each other using the + UC Send/Receive mechanism and note how many were received. 
+uc_lat +RDMA + Purpose + UC one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using UC Send/Receive. +rc_rdma_read_bw +RDMA + Purpose + RC RDMA read streaming one way bandwidth + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs RC RDMA Read operations and notes how + many of them complete. +rc_rdma_read_lat +RDMA + Purpose + RC RDMA read one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs RC RDMA Read operations waiting for + completion before starting the next one. 
+rc_rdma_write_bw +RDMA + Purpose + RC RDMA write streaming one way bandwidth + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs RC RDMA Write operations and notes how + many of them complete. +rc_rdma_write_lat +RDMA + Purpose + RC RDMA write one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + using RC RDMA write operations. +rc_rdma_write_poll_lat +RDMA + Purpose + RC RDMA write one way polling latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test using RC RDMA Write operations. First the + client performs an RDMA Write while the server stays in a tight loop + waiting for the memory buffer to change. The first and last bytes of + the memory buffer are tested to ensure that the entire message was + received. This is then repeated with both sides playing opposite + roles. Since this always polls, the -P (--poll) flag has no effect. 
+uc_rdma_write_bw +RDMA + Purpose + UC RDMA write streaming one way bandwidth + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs UC RDMA Write operations and notes how + many of them complete. +uc_rdma_write_lat +RDMA + Purpose + UC RDMA write one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + using UC RDMA write operations. +uc_rdma_write_poll_lat +RDMA + Purpose + UC RDMA write one way polling latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test using UC RDMA Write operations. First the + client performs an RDMA Write while the server stays in a tight loop + waiting for the memory buffer to change. The first and last bytes of + the memory buffer are tested to ensure that the entire message was + received. This is then repeated with both sides playing opposite + roles. Since this always polls, the -P (--poll) flag has no effect. 
+rc_compare_swap_mr +RDMA + Purpose + RC compare and swap messaging rate + Common Options + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs the RC Atomic Compare and Swap operation + and determines how many of them complete. +rc_fetch_add_mr +RDMA + Purpose + RC fetch and add messaging rate + Common Options + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs the RC Atomic Fetch and Add operation + and determines how many of them complete. +ver_rc_compare_swap +RDMA + Purpose + Verify RC compare and swap + Common Options + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Test the RC Compare and Swap Atomic operation. The server's memory + location starts with zero and the client successively exchanges, 0 for + 1, 1 for 2, etc. The results are checked for correctness. 
+ver_rc_fetch_add +RDMA + Purpose + Verify RC fetch and add + Common Options + --affinity PN (-a) Set processor affinity + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Tests the RC Fetch and Add Atomic operation. The server's memory + location starts with zero and the client successively adds one. The + results are checked for correctness. diff --git a/ib.c b/ib.c new file mode 100644 index 0000000..233fbc6 --- /dev/null +++ b/ib.c @@ -0,0 +1,2231 @@ +/* + * qperf - handle RDMA tests. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "qperf.h" + + +/* + * RDMA parameters. + */ +#define QKEY 0x11111111 /* Q_Key */ +#define NCQE 1024 /* Number of CQ entries */ +#define GRH_SIZE 40 /* IB GRH size */ +#define MTU_SIZE 2048 /* Default MTU Size */ +#define RETRY_CNT 7 /* RC/UC retry count */ +#define RNR_RETRY 7 /* RC/UC RNR retry count */ +#define RNR_TIMER 12 /* RC/UC RNR timeout */ +#define TIMEOUT 14 /* RC/UC timeout */ + + +/* + * Work request IDs. + */ +#define WRID_SEND 1 /* Send */ +#define WRID_RECV 2 /* Receive */ +#define WRID_RDMA 3 /* RDMA */ + + +/* + * Constants. + */ +#define K2 (2*1024) +#define K64 (64*1024) + + +/* + * For convenience. + */ +typedef enum ibv_wr_opcode OPCODE; + + +/* + * Atomics. + */ +typedef enum ATOMIC { + FETCH_ADD, /* Fetch and add */ + COMPARE_SWAP /* Compare and swap */ +} ATOMIC; + + +/* + * IO Mode. + */ +typedef enum IOMODE { + IO_SR, /* Send/Receive */ + IO_RDMA /* RDMA */ +} IOMODE; + + +/* + * RDMA connection context. + */ +typedef struct IBCON { + uint32_t lid; /* Local ID */ + uint32_t qpn; /* Queue pair number */ + uint32_t psn; /* Packet sequence number */ + uint32_t rkey; /* Remote key */ + uint64_t vaddr; /* Virtual address */ +} IBCON; + + +/* + * RDMA descriptor. 
+ */ +typedef struct IBDEV { + IBCON lcon; /* Local context */ + IBCON rcon; /* Remote context */ + int mtu; /* MTU */ + int port; /* Port */ + int rate; /* Rate */ + int trans; /* QP transport */ + int maxinline; /* Maximum amount of inline data */ + char *buffer; /* Buffer */ + struct ibv_device **devlist; /* Device list */ + struct ibv_context *context; /* Context */ + struct ibv_comp_channel *channel; /* Channel */ + struct ibv_pd *pd; /* Protection domain */ + struct ibv_mr *mr; /* Memory region */ + struct ibv_cq *cq; /* Completion queue */ + struct ibv_qp *qp; /* QPair */ + struct ibv_ah *ah; /* Address handle */ +} IBDEV; + + +/* + * Names associated with a value. + */ +typedef struct NAMES { + int value; /* Value */ + char *name; /* Name */ +} NAMES; + + +/* + * RDMA speeds and names. + */ +typedef struct RATES { + const char *name; /* Name */ + uint32_t rate; /* Rate */ +} RATES; + + +/* + * Function prototypes. + */ +static void cq_error(int status); +static void dec_ibcon(IBCON *host); +static int do_error(int status, uint64_t *errors); +static void enc_ibcon(IBCON *host); +static void ib_bi_bw(int transport); +static void ib_client_atomic(ATOMIC atomic); +static void ib_client_bw(int transport); +static void ib_client_rdma_bw(int transport, OPCODE opcode); +static void ib_client_rdma_read_lat(int transport); +static void ib_close(IBDEV *ibdev); +static void ib_debug_info(IBDEV *ibdev); +static int ib_init(IBDEV *ibdev); +static int ib_mralloc(IBDEV *ibdev, int size); +static int ib_open(IBDEV *ibdev, int trans, int maxSendWR, int maxRecvWR); +static void ib_params_atomics(void); +static void ib_params_msgs(long msgSize, int use_poll_mode); +static int ib_poll(IBDEV *ibdev, struct ibv_wc *wc, int nwc); +static int ib_post_rdma(IBDEV *ibdev, OPCODE opcode, int n); +static int ib_post_compare_swap(IBDEV *ibdev, + int wrid, int offset, uint64_t compare, uint64_t swap); +static int ib_post_fetch_add(IBDEV *ibdev, + int wrid, int offset, uint64_t add); 
+static int ib_post_recv(IBDEV *ibdev, int n); +static int ib_post_send(IBDEV *ibdev, int n); +static void ib_pp_lat(int transport, IOMODE iomode); +static void ib_pp_lat_loop(IBDEV *ibdev, IOMODE iomode); +static int ib_prepare(IBDEV *ibdev); +static void ib_rdma_write_poll_lat(int transport); +static void ib_server_def(int transport); +static void ib_server_nop(int transport); +static char *opcode_name(int opcode); + + +/* + * List of errors we can get from a CQE. + */ +NAMES CQErrors[] ={ + { IBV_WC_SUCCESS, "Success" }, + { IBV_WC_LOC_LEN_ERR, "Local length error" }, + { IBV_WC_LOC_QP_OP_ERR, "Local QP operation failure" }, + { IBV_WC_LOC_EEC_OP_ERR, "Local EEC operation failure" }, + { IBV_WC_LOC_PROT_ERR, "Local protection error" }, + { IBV_WC_WR_FLUSH_ERR, "WR flush failure" }, + { IBV_WC_MW_BIND_ERR, "Memory window bind failure" }, + { IBV_WC_BAD_RESP_ERR, "Bad response" }, + { IBV_WC_LOC_ACCESS_ERR, "Local access failure" }, + { IBV_WC_REM_INV_REQ_ERR, "Remote invalid request" }, + { IBV_WC_REM_ACCESS_ERR, "Remote access failure" }, + { IBV_WC_REM_OP_ERR, "Remote operation failure" }, + { IBV_WC_RETRY_EXC_ERR, "Retries exceeded" }, + { IBV_WC_RNR_RETRY_EXC_ERR, "RNR retry exceeded" }, + { IBV_WC_LOC_RDD_VIOL_ERR, "Local RDD violation" }, + { IBV_WC_REM_INV_RD_REQ_ERR, "Remote invalid read request" }, + { IBV_WC_REM_ABORT_ERR, "Remote abort" }, + { IBV_WC_INV_EECN_ERR, "Invalid EECN" }, + { IBV_WC_INV_EEC_STATE_ERR, "Invalid EEC state" }, + { IBV_WC_FATAL_ERR, "Fatal error" }, + { IBV_WC_RESP_TIMEOUT_ERR, "Responder timeout" }, + { IBV_WC_GENERAL_ERR, "General error" }, +}; + + +/* + * Opcodes. 
+ */ +NAMES Opcodes[] ={ + { IBV_WR_ATOMIC_CMP_AND_SWP, "compare and swap" }, + { IBV_WR_ATOMIC_FETCH_AND_ADD, "fetch and add" }, + { IBV_WR_RDMA_READ, "rdma read" }, + { IBV_WR_RDMA_WRITE, "rdma write" }, + { IBV_WR_RDMA_WRITE_WITH_IMM, "rdma write with immediate" }, + { IBV_WR_SEND, "send" }, + { IBV_WR_SEND_WITH_IMM, "send with immediate" }, +}; + + +/* + * RDMA rate names and the IBV_RATE_* value each one maps to. + */ +RATES Rates[] ={ + { "", IBV_RATE_MAX }, + { "max", IBV_RATE_MAX }, + { "1xSDR", IBV_RATE_2_5_GBPS }, + { "1xDDR", IBV_RATE_5_GBPS }, + { "1xQDR", IBV_RATE_10_GBPS }, + { "4xSDR", IBV_RATE_10_GBPS }, + { "4xDDR", IBV_RATE_20_GBPS }, + { "4xQDR", IBV_RATE_40_GBPS }, + { "8xSDR", IBV_RATE_20_GBPS }, + { "8xDDR", IBV_RATE_40_GBPS }, + { "8xQDR", IBV_RATE_80_GBPS }, + { "2.5", IBV_RATE_2_5_GBPS }, + { "5", IBV_RATE_5_GBPS }, + { "10", IBV_RATE_10_GBPS }, + { "20", IBV_RATE_20_GBPS }, + { "30", IBV_RATE_30_GBPS }, + { "40", IBV_RATE_40_GBPS }, + { "60", IBV_RATE_60_GBPS }, + { "80", IBV_RATE_80_GBPS }, + { "120", IBV_RATE_120_GBPS }, +}; + + +/* + * Experimental (client side). + */ +void +run_client_experimental(void) +{ + IBDEV ibdev; + + ib_params_msgs(K64, 1); + if (!ib_open(&ibdev, IBV_QPT_UC, 1, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1)) + goto err; + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Experimental (server side).
+ */ +void +run_server_experimental(void) +{ + IBDEV ibdev; + int found = 0; + + if (!ib_open(&ibdev, IBV_QPT_UC, 0, 1)) + return; + if (!ib_init(&ibdev)) + goto err; + if (!ib_post_recv(&ibdev, 1)) + goto err; + if (!synchronize()) + goto err; + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (n < 0) + goto err; + if (n) { + found = 1; + break; + } + } + if (found) + printf("Received immediate data\n"); + else + printf("Failed to received immediate data\n"); + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Measure RC bi-directional bandwidth (client side). + */ +void +run_client_rc_bi_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + ib_params_msgs(K64, 1); + ib_bi_bw(IBV_QPT_RC); + show_results(BANDWIDTH); +} + + +/* + * Measure RC bi-directional bandwidth (server side). + */ +void +run_server_rc_bi_bw(void) +{ + ib_bi_bw(IBV_QPT_RC); +} + + +/* + * Measure RC bandwidth (client side). + */ +void +run_client_rc_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + par_use(L_NO_MSGS); + par_use(R_NO_MSGS); + ib_params_msgs(K64, 1); + ib_client_bw(IBV_QPT_RC); + show_results(BANDWIDTH); +} + + +/* + * Measure RC bandwidth (server side). + */ +void +run_server_rc_bw(void) +{ + ib_server_def(IBV_QPT_RC); +} + + +/* + * Measure RC compare and swap messaging rate (client side). + */ +void +run_client_rc_compare_swap_mr(void) +{ + ib_client_atomic(COMPARE_SWAP); +} + + +/* + * Measure RC compare and swap messaging rate (server side). + */ +void +run_server_rc_compare_swap_mr(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC fetch and add messaging rate (client side). + */ +void +run_client_rc_fetch_add_mr(void) +{ + ib_client_atomic(FETCH_ADD); +} + + +/* + * Measure RC fetch and add messaging rate (server side). + */ +void +run_server_rc_fetch_add_mr(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC latency (client side). 
+ */ +void +run_client_rc_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_RC, IO_SR); +} + + +/* + * Measure RC latency (server side). + */ +void +run_server_rc_lat(void) +{ + ib_pp_lat(IBV_QPT_RC, IO_SR); +} + + +/* + * Measure RC RDMA read bandwidth (client side). + */ +void +run_client_rc_rdma_read_bw(void) +{ + par_use(L_RD_ATOMIC); + par_use(R_RD_ATOMIC); + ib_params_msgs(K64, 1); + ib_client_rdma_bw(IBV_QPT_RC, IBV_WR_RDMA_READ); + show_results(BANDWIDTH); +} + + +/* + * Measure RC RDMA read bandwidth (server side). + */ +void +run_server_rc_rdma_read_bw(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA read latency (client side). + */ +void +run_client_rc_rdma_read_lat(void) +{ + ib_params_msgs(1, 1); + ib_client_rdma_read_lat(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA read latency (server side). + */ +void +run_server_rc_rdma_read_lat(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA write bandwidth (client side). + */ +void +run_client_rc_rdma_write_bw(void) +{ + ib_params_msgs(K64, 1); + ib_client_rdma_bw(IBV_QPT_RC, IBV_WR_RDMA_WRITE_WITH_IMM); + show_results(BANDWIDTH); +} + + +/* + * Measure RC RDMA write bandwidth (server side). + */ +void +run_server_rc_rdma_write_bw(void) +{ + ib_server_def(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA write latency (client side). + */ +void +run_client_rc_rdma_write_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_RC, IO_RDMA); +} + + +/* + * Measure RC RDMA write latency (server side). + */ +void +run_server_rc_rdma_write_lat(void) +{ + ib_pp_lat(IBV_QPT_RC, IO_RDMA); +} + + +/* + * Measure RC RDMA write polling latency (client side). + */ +void +run_client_rc_rdma_write_poll_lat(void) +{ + ib_params_msgs(1, 0); + ib_rdma_write_poll_lat(IBV_QPT_RC); + show_results(LATENCY); +} + + +/* + * Measure RC RDMA write polling latency (server side). 
+ */ +void +run_server_rc_rdma_write_poll_lat(void) +{ + ib_rdma_write_poll_lat(IBV_QPT_RC); +} + + +/* + * Measure UC bi-directional bandwidth (client side). + */ +void +run_client_uc_bi_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + ib_params_msgs(K64, 1); + ib_bi_bw(IBV_QPT_UC); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UC bi-directional bandwidth (server side). + */ +void +run_server_uc_bi_bw(void) +{ + ib_bi_bw(IBV_QPT_UC); +} + + +/* + * Measure UC bandwidth (client side). + */ +void +run_client_uc_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + par_use(L_NO_MSGS); + par_use(R_NO_MSGS); + ib_params_msgs(K64, 1); + ib_client_bw(IBV_QPT_UC); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UC bandwidth (server side). + */ +void +run_server_uc_bw(void) +{ + ib_server_def(IBV_QPT_UC); +} + + +/* + * Measure UC latency (client side). + */ +void +run_client_uc_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_UC, IO_SR); +} + + +/* + * Measure UC latency (server side). + */ +void +run_server_uc_lat(void) +{ + ib_pp_lat(IBV_QPT_UC, IO_SR); +} + + +/* + * Measure UC RDMA write bandwidth (client side). + */ +void +run_client_uc_rdma_write_bw(void) +{ + ib_params_msgs(K64, 1); + ib_client_rdma_bw(IBV_QPT_UC, IBV_WR_RDMA_WRITE_WITH_IMM); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UC RDMA write bandwidth (server side). + */ +void +run_server_uc_rdma_write_bw(void) +{ + ib_server_def(IBV_QPT_UC); +} + + +/* + * Measure UC RDMA write latency (client side). + */ +void +run_client_uc_rdma_write_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_UC, IO_RDMA); +} + + +/* + * Measure UC RDMA write latency (server side). + */ +void +run_server_uc_rdma_write_lat(void) +{ + ib_pp_lat(IBV_QPT_UC, IO_RDMA); +} + + +/* + * Measure UC RDMA write polling latency (client side). 
+ */ +void +run_client_uc_rdma_write_poll_lat(void) +{ + /* This test always polls, so pass use_poll_mode = 0 exactly as the RC + * variant (run_client_rc_rdma_write_poll_lat) does. */ + ib_params_msgs(1, 0); + ib_rdma_write_poll_lat(IBV_QPT_UC); + show_results(LATENCY); +} + + +/* + * Measure UC RDMA write polling latency (server side). + */ +void +run_server_uc_rdma_write_poll_lat(void) +{ + ib_rdma_write_poll_lat(IBV_QPT_UC); +} + + +/* + * Measure UD bi-directional bandwidth (client side). + */ +void +run_client_ud_bi_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + ib_params_msgs(K2, 1); + ib_bi_bw(IBV_QPT_UD); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UD bi-directional bandwidth (server side). + */ +void +run_server_ud_bi_bw(void) +{ + ib_bi_bw(IBV_QPT_UD); +} + + +/* + * Measure UD bandwidth (client side). + */ +void +run_client_ud_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + par_use(L_NO_MSGS); + par_use(R_NO_MSGS); + ib_params_msgs(K2, 1); + ib_client_bw(IBV_QPT_UD); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UD bandwidth (server side). + */ +void +run_server_ud_bw(void) +{ + ib_server_def(IBV_QPT_UD); +} + + +/* + * Measure UD latency (client side). + */ +void +run_client_ud_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_UD, IO_SR); +} + + +/* + * Measure UD latency (server side). + */ +void +run_server_ud_lat(void) +{ + ib_pp_lat(IBV_QPT_UD, IO_SR); +} + +/* + * Verify RC compare and swap (client side).
+ */ +void +run_client_ver_rc_compare_swap(void) +{ + IBDEV ibdev; + uint64_t *result; + uint64_t last = 0; + uint64_t cur = 0; + uint64_t next = 0x0123456789abcdefULL; + int i; + int size; + + ib_params_atomics(); + if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0)) + goto err; + size = Req.rd_atomic * sizeof(uint64_t); + setv_u32(L_MSG_SIZE, size); + setv_u32(R_MSG_SIZE, size); + ib_mralloc(&ibdev, size); + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + for (i = 0; i < Req.rd_atomic; ++i) { + if (!ib_post_compare_swap(&ibdev, i, i*sizeof(uint64_t), cur, next)) + goto err; + cur = next; + next = cur + 1; + } + result = (uint64_t *) ibdev.buffer; + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + uint64_t res; + + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int x = wc[i].wr_id; + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.rem_r.no_bytes += sizeof(uint64_t); + LStat.rem_r.no_msgs++; + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + res = result[x]; + if (last != res) { + error("compare and swap doesn't match (expected %llx vs. %llx)", + (long long)last, (long long)res); + goto err; + } + if (last) + last++; + else + last = 0x0123456789abcdefULL; + next = cur + 1; + if (!ib_post_compare_swap(&ibdev, x, x*sizeof(uint64_t), + cur, next)) + goto err; + cur = next; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(MSG_RATE); +} + + +/* + * Verify RC compare and swap (server side). + */ +void +run_server_ver_rc_compare_swap(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Verify RC fetch and add (client side). 
+ */ +void +run_client_ver_rc_fetch_add(void) +{ + IBDEV ibdev; + uint64_t *result; + uint64_t last = 0; + int i; + int size; + + ib_params_atomics(); + if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0)) + goto err; + size = Req.rd_atomic * sizeof(uint64_t); + setv_u32(L_MSG_SIZE, size); + setv_u32(R_MSG_SIZE, size); + ib_mralloc(&ibdev, size); + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + for (i = 0; i < Req.rd_atomic; ++i) { + if (!ib_post_fetch_add(&ibdev, i, i*sizeof(uint64_t), 1)) + goto err; + } + result = (uint64_t *) ibdev.buffer; + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + uint64_t res; + + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int x = wc[i].wr_id; + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.rem_r.no_bytes += sizeof(uint64_t); + LStat.rem_r.no_msgs++; + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + res = result[x]; + if (last != res) { + error("fetch and add doesn't match (expected %llx vs. %llx)", + (long long)last, (long long)res); + goto err; + } + last++; + if (!ib_post_fetch_add(&ibdev, x, x*sizeof(uint64_t), 1)) + goto err; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(MSG_RATE); +} + + +/* + * Verify RC fetch and add (server side). + */ +void +run_server_ver_rc_fetch_add(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure messaging rate for an atomic operation. + */ +static void +ib_client_atomic(ATOMIC atomic) +{ + int i; + int r; + IBDEV ibdev; + + ib_params_atomics(); + if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0)) + goto err; + setv_u32(L_MSG_SIZE, sizeof(uint64_t)); + setv_u32(R_MSG_SIZE, sizeof(uint64_t)); + ib_mralloc(&ibdev, sizeof(uint64_t)); + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + for (i = 0; i < Req.rd_atomic; ++i) { + r = (atomic == FETCH_ADD) + ? 
ib_post_fetch_add(&ibdev, 0, 0, 0) + : ib_post_compare_swap(&ibdev, 0, 0, 0, 0); + if (!r) + goto err; + } + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.rem_r.no_bytes += sizeof(uint64_t); + LStat.rem_r.no_msgs++; + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + r = (atomic == FETCH_ADD) + ? ib_post_fetch_add(&ibdev, 0, 0, 0) + : ib_post_compare_swap(&ibdev, 0, 0, 0, 0); + if (!r) + goto err; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(MSG_RATE); +} + + +/* + * Measure IB bandwidth (client side). + */ +static void +ib_client_bw(int transport) +{ + IBDEV ibdev; + + long sent = 0; + if (!ib_open(&ibdev, transport, NCQE, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_send(&ibdev, left_to_send(&sent, NCQE))) + goto err; + sent = NCQE; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (n > LStat.max_cqes) + LStat.max_cqes = n; + if (n < 0) + goto err; + if (Finished) + break; + for (i = 0; i < n; ++i) { + int id = wc[i].wr_id; + int status = wc[i].status; + if (id != WRID_SEND) + debug("bad WR ID %d", id); + else if (status != IBV_WC_SUCCESS) + if (!do_error(status, &LStat.s.no_errs)) + goto err; + } + if (Req.no_msgs) { + if (LStat.s.no_msgs + LStat.s.no_errs >= Req.no_msgs) + break; + n = left_to_send(&sent, n); + } + if (!ib_post_send(&ibdev, n)) + goto err; + sent += n; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Default action for the server is to post receive buffers and whenever it + * gets a completion entry, compute statistics and post more buffers. 
+ */ +static void +ib_server_def(int transport) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, 0, NCQE)) + return; + if (!ib_init(&ibdev)) + goto err; + if (!ib_post_recv(&ibdev, NCQE)) + goto err; + if (!synchronize()) + goto err; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + if (n < 0) + goto err; + for (i = 0; i < n; ++i) { + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + if (Req.access_recv) + touch_data(ibdev.buffer, Req.msg_size); + } else if (!do_error(status, &LStat.r.no_errs)) + goto err; + } + if (Req.no_msgs) + if (LStat.r.no_msgs + LStat.r.no_errs >= Req.no_msgs) + break; + if (!ib_post_recv(&ibdev, n)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Measure bi-directional IB bandwidth. + */ +static void +ib_bi_bw(int transport) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, NCQE, NCQE)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!ib_post_recv(&ibdev, NCQE)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_send(&ibdev, NCQE)) + goto err; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + int noSend = 0; + int noRecv = 0; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + if (n < 0) + goto err; + for (i = 0; i < n; ++i) { + int id = wc[i].wr_id; + int status = wc[i].status; + switch (id) { + case WRID_SEND: + if (status != IBV_WC_SUCCESS) + if (!do_error(status, &LStat.s.no_errs)) + goto err; + ++noSend; + break; + case WRID_RECV: + if (status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + if (Req.access_recv) + touch_data(ibdev.buffer, Req.msg_size); + } else if (!do_error(status, &LStat.r.no_errs)) + goto err; + ++noRecv; + break; + default: + 
debug("bad WR ID %d", id); + } + } + if (noRecv) + if (!ib_post_recv(&ibdev, noRecv)) + goto err; + if (noSend) + if (!ib_post_send(&ibdev, noSend)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Measure ping-pong latency (client and server side). + */ +static void +ib_pp_lat(int transport, IOMODE iomode) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, 1, 1)) + goto err; + if (!ib_init(&ibdev)) + goto err; + ib_pp_lat_loop(&ibdev, iomode); +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + if (is_client()) + show_results(LATENCY); +} + + +/* + * Loop sending packets back and forth to measure ping-pong latency. + */ +static void +ib_pp_lat_loop(IBDEV *ibdev, IOMODE iomode) +{ + int done = 1; + if (!ib_post_recv(ibdev, 1)) + return; + if (!synchronize()) + return; + if (is_client()) { + if (iomode == IO_SR) { + if (!ib_post_send(ibdev, 1)) + return; + } else { + if (!ib_post_rdma(ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1)) + return; + } + done = 0; + } + + while (!Finished) { + int i; + struct ibv_wc wc[2]; + int n = ib_poll(ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n < 0) + return; + for (i = 0; i < n; ++i) { + int id = wc[i].wr_id; + int status = wc[i].status; + switch (id) { + case WRID_SEND: + case WRID_RDMA: + if (status != IBV_WC_SUCCESS) + if (!do_error(status, &LStat.s.no_errs)) + return; + done |= 1; + continue; + case WRID_RECV: + if (status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + if (!ib_post_recv(ibdev, 1)) + return; + } else if (!do_error(status, &LStat.r.no_errs)) + return; + done |= 2; + continue; + default: + debug("bad WR ID %d", id); + continue; + } + break; + } + if (done == 3) { + if (iomode == IO_SR) { + if (!ib_post_send(ibdev, 1)) + return; + } else { + if (!ib_post_rdma(ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1)) + return; + } + done = 0; + } + } + Successful = 1; +} + + +/* + * Loop sending packets back and forth 
using RDMA Write and polling to measure
 * latency.  Note that if we increase the number of entries of wc to be NCQE,
 * on the PS HCA, the latency is much longer.
 */
static void
ib_rdma_write_poll_lat(int transport)
{
    IBDEV ibdev;
    volatile char *head;
    volatile char *tail;
    int sending = is_client() ? 1 : 0;
    int myMark = sending;
    int peerMark = !myMark;

    if (!ib_open(&ibdev, transport, NCQE, 0))
        goto err;
    if (!ib_init(&ibdev))
        goto err;
    if (!synchronize())
        goto err;

    /* The first and last bytes of the message act as arrival markers */
    head = &ibdev.buffer[0];
    tail = &ibdev.buffer[Req.msg_size-1];
    while (!Finished) {
        *head = myMark;
        *tail = myMark;
        if (sending) {
            struct ibv_wc wc[2];
            int k;
            int n;

            if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_WRITE, 1))
                goto err;
            if (Finished)
                break;

            /* Reap whatever has completed so far; no waiting here */
            n = ibv_poll_cq(ibdev.cq, cardof(wc), wc);
            if (n < 0) {
                syserror("CQ poll failed");
                goto err;
            }
            for (k = 0; k < n; ++k) {
                int id = wc[k].wr_id;
                int status = wc[k].status;

                if (id != WRID_RDMA) {
                    debug("bad WR ID %d", id);
                    continue;
                }
                if (status != IBV_WC_SUCCESS &&
                    !do_error(status, &LStat.s.no_errs))
                    goto err;
            }
        }

        /* Spin until both markers carry the peer's value */
        while (!Finished && !(*head == peerMark && *tail == peerMark))
            ;
        LStat.r.no_bytes += Req.msg_size;
        LStat.r.no_msgs++;
        sending = 1;
    }
    Successful = 1;
err:
    stop_timing();
    exchange_results();
    ib_close(&ibdev);
}


/*
 * Measure RDMA Read latency (client side).
+ */ +static void +ib_client_rdma_read_lat(int transport) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, 1, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_READ, 1)) + goto err; + while (!Finished) { + struct ibv_wc wc; + int n = ib_poll(&ibdev, &wc, 1); + if (n < 0) + goto err; + if (n == 0) + continue; + if (Finished) + break; + if (wc.wr_id != WRID_RDMA) { + debug("bad WR ID %d", (int)wc.wr_id); + continue; + } + if (wc.status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + LStat.rem_s.no_bytes += Req.msg_size; + LStat.rem_s.no_msgs++; + } else if (!do_error(wc.status, &LStat.s.no_errs)) + goto err; + if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_READ, 1)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(LATENCY); +} + + +/* + * Measure RDMA bandwidth (client side). + */ +static void +ib_client_rdma_bw(int transport, OPCODE opcode) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, NCQE, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_rdma(&ibdev, opcode, NCQE)) + goto err; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + if (opcode == IBV_WR_RDMA_READ) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + LStat.rem_s.no_bytes += Req.msg_size; + LStat.rem_s.no_msgs++; + } + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + } + if (!ib_post_rdma(&ibdev, opcode, n)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Server just waits and lets driver take care of any requests. 
+ */ +static void +ib_server_nop(int transport) +{ + IBDEV ibdev; + + /* workaround: Size of RQ should be 0; bug in Mellanox driver */ + if (!ib_open(&ibdev, transport, 0, 1)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + while (!Finished) + pause(); + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Set default IB parameters for tests that use messages. + */ +static void +ib_params_msgs(long msgSize, int use_poll_mode) +{ + setp_u32(0, L_MSG_SIZE, msgSize); + setp_u32(0, R_MSG_SIZE, msgSize); + setp_u32(0, L_MTU_SIZE, MTU_SIZE); + setp_u32(0, R_MTU_SIZE, MTU_SIZE); + par_use(L_ID); + par_use(R_ID); + par_use(L_MTU_SIZE); + par_use(R_MTU_SIZE); + par_use(L_RATE); + par_use(R_RATE); + if (use_poll_mode) { + par_use(L_POLL_MODE); + par_use(R_POLL_MODE); + } + opt_check(); +} + + +/* + * Set default IB parameters for tests that use atomics. + */ +static void +ib_params_atomics(void) +{ + setp_u32(0, L_MTU_SIZE, MTU_SIZE); + setp_u32(0, R_MTU_SIZE, MTU_SIZE); + par_use(L_ID); + par_use(R_ID); + par_use(L_POLL_MODE); + par_use(R_POLL_MODE); + par_use(L_RATE); + par_use(R_RATE); + par_use(L_RD_ATOMIC); + par_use(R_RD_ATOMIC); + opt_check(); + + setv_u32(L_MSG_SIZE, 0); +} + + +/* + * IB initialization. 
+ */ +static int +ib_init(IBDEV *ibdev) +{ + IBCON ibcon; + + if (is_client()) { + client_send_request(); + enc_init(&ibcon); + enc_ibcon(&ibdev->lcon); + if (!send_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + if (!recv_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + dec_init(&ibcon); + dec_ibcon(&ibdev->rcon); + } else { + if (!recv_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + dec_init(&ibcon); + dec_ibcon(&ibdev->rcon); + enc_init(&ibcon); + enc_ibcon(&ibdev->lcon); + if (!send_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + } + if (!ib_prepare(ibdev)) + return 0; + ib_debug_info(ibdev); + return 1; +} + + +/* + * Show debugging information. + */ +static void +ib_debug_info(IBDEV *ibdev) +{ + debug("L: lid=%04x qpn=%06x psn=%06x rkey=%08x vaddr=%010x", + ibdev->lcon.lid, ibdev->lcon.qpn, ibdev->lcon.psn, + ibdev->lcon.rkey, ibdev->lcon.vaddr); + debug("R: lid=%04x qpn=%06x psn=%06x rkey=%08x vaddr=%010x", + ibdev->rcon.lid, ibdev->rcon.qpn, ibdev->rcon.psn, + ibdev->rcon.rkey, ibdev->rcon.vaddr); +} + + +/* + * Open a RDMA device. 
+ */ +static int +ib_open(IBDEV *ibdev, int trans, int maxSendWR, int maxRecvWR) +{ + /* Clear structure */ + memset(ibdev, 0, sizeof(*ibdev)); + + /* Check and set MTU */ + { + int mtu = Req.mtu_size; + if (mtu == 256) + ibdev->mtu = IBV_MTU_256; + else if (mtu == 512) + ibdev->mtu = IBV_MTU_512; + else if (mtu == 1024) + ibdev->mtu = IBV_MTU_1024; + else if (mtu == 2048) + ibdev->mtu = IBV_MTU_2048; + else if (mtu == 4096) + ibdev->mtu = IBV_MTU_4096; + else + error_die("Bad MTU: %d; must be 256/512/1K/2K/4K", mtu); + } + + /* Set transport type */ + ibdev->trans = trans; + + /* Set port */ + { + int port = 1; + char *p = index(Req.id, ':'); + if (p) { + *p++ = '\0'; + port = atoi(p); + if (port < 1) + error_die("Bad IB port: %d; must be at least 1", port); + } + ibdev->port = port; + } + + /* Set rate */ + { + RATES *q = Rates; + RATES *r = q + cardof(Rates); + + for (;; ++q) { + if (q >= r) { + syserror("Bad rate: %s", Req.rate); + goto err; + } + if (streq(Req.rate, q->name)) { + ibdev->rate = q->rate; + break; + } + } + } + + /* Determine device and open */ + { + struct ibv_device *device; + char *name = Req.id[0] ? 
Req.id : 0; + + ibdev->devlist = ibv_get_device_list(0); + if (!ibdev->devlist) { + syserror("Failed to find any IB devices"); + goto err; + } + if (!name) + device = *ibdev->devlist; + else { + struct ibv_device **d = ibdev->devlist; + while ((device = *d++)) + if (streq(ibv_get_device_name(device), name)) + break; + } + if (!device) { + syserror("Failed to find IB device"); + goto err; + } + ibdev->context = ibv_open_device(device); + if (!ibdev->context) { + syserror("Failed to open device %s", ibv_get_device_name(device)); + goto err; + } + } + + /* Allocate completion channel */ + ibdev->channel = ibv_create_comp_channel(ibdev->context); + if (!ibdev->channel) { + syserror("Failed to create completion channel"); + goto err; + } + + /* Allocate protection domain */ + ibdev->pd = ibv_alloc_pd(ibdev->context); + if (!ibdev->pd) { + syserror("Failed to allocate protection domain"); + goto err; + } + + /* Allocate message buffer and memory region */ + { + int bufSize = Req.msg_size; + int pageSize = sysconf(_SC_PAGESIZE); + if (trans == IBV_QPT_UD) + bufSize += GRH_SIZE; + if (bufSize == 0) + bufSize = 1; + if (posix_memalign((void **)&ibdev->buffer, pageSize, bufSize) != 0) { + syserror("Failed to allocate memory"); + goto err; + } + memset(ibdev->buffer, 0, bufSize); + int flags = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC; + ibdev->mr = ibv_reg_mr(ibdev->pd, ibdev->buffer, bufSize, flags); + if (!ibdev->mr) { + syserror("Failed to allocate memory region"); + goto err; + } + } + + /* Create completion queue */ + ibdev->cq = ibv_create_cq(ibdev->context, + maxSendWR+maxRecvWR, 0, ibdev->channel, 0); + if (!ibdev->cq) { + syserror("Failed to create completion queue"); + goto err; + } + + /* Create queue pair */ + { + struct ibv_qp_init_attr attr ={ + .send_cq = ibdev->cq, + .recv_cq = ibdev->cq, + .cap ={ + .max_send_wr = maxSendWR, + .max_recv_wr = maxRecvWR, + .max_send_sge = 1, + .max_recv_sge = 1, 
+ .max_inline_data = 0, + }, + .qp_type = ibdev->trans, + }; + ibdev->qp = ibv_create_qp(ibdev->pd, &attr); + if (!ibdev->qp) { + syserror("Failed to create QP"); + goto err; + } + } + + /* Modify queue pair to INIT state */ + { + struct ibv_qp_attr attr ={ + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = ibdev->port + }; + int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT; + + if (ibdev->trans == IBV_QPT_UD) { + flags |= IBV_QP_QKEY; + attr.qkey = QKEY; + } else if (ibdev->trans == IBV_QPT_RC) { + flags |= IBV_QP_ACCESS_FLAGS; + attr.qp_access_flags = + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC; + } else if (ibdev->trans == IBV_QPT_UC) { + flags |= IBV_QP_ACCESS_FLAGS; + attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE; + } + if (ibv_modify_qp(ibdev->qp, &attr, flags) != SUCCESS0) { + syserror("Failed to modify QP to INIT state"); + goto err; + } + } + + /* Get QP attributes */ + { + struct ibv_qp_attr qp_attr; + struct ibv_qp_init_attr qp_init_attr; + + if (ibv_query_qp(ibdev->qp, &qp_attr, 0, &qp_init_attr) != SUCCESS0) { + syserror("Query QP failed"); + goto err; + } + ibdev->maxinline = qp_attr.cap.max_inline_data; + } + + /* Get device properties */ + { + struct ibv_device_attr dev_attr; + + if (ibv_query_device(ibdev->context, &dev_attr) != SUCCESS0) { + syserror("Query device failed"); + goto err; + } + if (Req.rd_atomic == 0) + Req.rd_atomic = dev_attr.max_qp_rd_atom; + else if (Req.rd_atomic > dev_attr.max_qp_rd_atom) + error("Device only supports %d (< %d) RDMA reads or atomic ops", + dev_attr.max_qp_rd_atom, Req.rd_atomic); + } + + /* Set up local context */ + { + struct ibv_port_attr port_attr; + + int stat = ibv_query_port(ibdev->context, ibdev->port, &port_attr); + if (stat != SUCCESS0) { + syserror("Query port failed"); + goto err; + } + srand48(getpid()*time(0)); + + ibdev->lcon.lid = port_attr.lid; + ibdev->lcon.qpn = ibdev->qp->qp_num; + ibdev->lcon.psn = lrand48() & 0xffffff; + 
ibdev->lcon.rkey = 0; + ibdev->lcon.vaddr = 0; + } + + /* Allocate memory region */ + if (!ib_mralloc(ibdev, Req.msg_size)) + goto err; + return 1; + +err: + ib_close(ibdev); + return 0; +} + + +/* + * Allocate a memory region. + */ +static int +ib_mralloc(IBDEV *ibdev, int size) +{ + int pageSize; + + if (size == 0) + return 1; + if (ibdev->trans == IBV_QPT_UD) + size += GRH_SIZE; + pageSize = sysconf(_SC_PAGESIZE); + if (posix_memalign((void **)&ibdev->buffer, pageSize, size) != 0) { + syserror("Failed to allocate memory"); + goto err; + } + memset(ibdev->buffer, 0, size); + int flags = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC; + ibdev->mr = ibv_reg_mr(ibdev->pd, ibdev->buffer, size, flags); + if (!ibdev->mr) { + syserror("Failed to allocate memory region"); + goto err; + } + + ibdev->lcon.rkey = ibdev->mr->rkey; + ibdev->lcon.vaddr = (unsigned long)ibdev->buffer; + return 1; + +err: + if (ibdev->buffer) { + free(ibdev->buffer); + ibdev->buffer = 0; + } + return 0; +} + + +/* + * Prepare the IB device for receiving and sending. 
+ */ +static int +ib_prepare(IBDEV *ibdev) +{ + int flags; + struct ibv_qp_attr rtr_attr ={ + .qp_state = IBV_QPS_RTR, + .path_mtu = ibdev->mtu, + .dest_qp_num = ibdev->rcon.qpn, + .rq_psn = ibdev->rcon.psn, + .min_rnr_timer = RNR_TIMER, + .max_dest_rd_atomic = Req.rd_atomic, + .ah_attr = { + .dlid = ibdev->rcon.lid, + .port_num = ibdev->port, + .static_rate = ibdev->rate + } + }; + struct ibv_qp_attr rts_attr ={ + .qp_state = IBV_QPS_RTS, + .timeout = TIMEOUT, + .retry_cnt = RETRY_CNT, + .rnr_retry = RNR_RETRY, + .sq_psn = ibdev->lcon.psn, + .max_rd_atomic = Req.rd_atomic + }; + struct ibv_ah_attr ah_attr ={ + .dlid = ibdev->rcon.lid, + .port_num = ibdev->port, + .static_rate = ibdev->rate + }; + + if (ibdev->trans == IBV_QPT_UD) { + /* Modify queue pair to RTR */ + flags = IBV_QP_STATE; + if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTR"); + + /* Modify queue pair to RTS */ + flags = IBV_QP_STATE | IBV_QP_SQ_PSN; + if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTS"); + + /* Create address handle */ + ibdev->ah = ibv_create_ah(ibdev->pd, &ah_attr); + if (!ibdev->ah) + return syserror("Failed to create address handle"); + } else if (ibdev->trans == IBV_QPT_RC) { + /* Modify queue pair to RTR */ + flags = IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER; + if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTR"); + + /* Modify queue pair to RTS */ + flags = IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC; + if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTS"); + } else if (ibdev->trans == IBV_QPT_UC) { + /* Modify queue pair to RTR */ + flags = IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU 
| + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN; + if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTR"); + + /* Modify queue pair to RTS */ + flags = IBV_QP_STATE | + IBV_QP_SQ_PSN; + if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTS"); + } + if (!Req.poll_mode) { + if (ibv_req_notify_cq(ibdev->cq, 0) != SUCCESS0) + return syserror("Failed to request CQ notification"); + } + return 1; +} + + +/* + * Close a RDMA device. We ust destroy the CQ before the QP otherwise the + * ibv_destroy_qp call might hang. + */ +static void +ib_close(IBDEV *ibdev) +{ + if (ibdev->ah) + ibv_destroy_ah(ibdev->ah); + if (ibdev->cq) + ibv_destroy_cq(ibdev->cq); + if (ibdev->qp) + ibv_destroy_qp(ibdev->qp); + if (ibdev->mr) + ibv_dereg_mr(ibdev->mr); + if (ibdev->pd) + ibv_dealloc_pd(ibdev->pd); + if (ibdev->channel) + ibv_destroy_comp_channel(ibdev->channel); + if (ibdev->context) + ibv_close_device(ibdev->context); + if (ibdev->buffer) + free(ibdev->buffer); + if (ibdev->devlist) + free(ibdev->devlist); + memset(ibdev, 0, sizeof(*ibdev)); +} + + +/* + * Post a compare and swap request. 
+ */ +static int +ib_post_compare_swap(IBDEV *ibdev, + int wrid, int offset, uint64_t compare, uint64_t swap) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t)ibdev->buffer + offset, + .length = sizeof(uint64_t), + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = wrid, + .sg_list = &sge, + .num_sge = 1, + .opcode = IBV_WR_ATOMIC_CMP_AND_SWP, + .send_flags = IBV_SEND_SIGNALED, + .wr = { + .atomic = { + .remote_addr = ibdev->rcon.vaddr, + .rkey = ibdev->rcon.rkey, + .compare_add = compare, + .swap = swap + } + } + }; + struct ibv_send_wr *badWR; + + errno = 0; + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post compare and swap"); + } + + LStat.s.no_bytes += sizeof(uint64_t); + LStat.s.no_msgs++; + return 1; +} + + +/* + * Post a fetch and add request. + */ +static int +ib_post_fetch_add(IBDEV *ibdev, int wrid, int offset, uint64_t add) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer + offset, + .length = sizeof(uint64_t), + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = wrid, + .sg_list = &sge, + .num_sge = 1, + .opcode = IBV_WR_ATOMIC_FETCH_AND_ADD, + .send_flags = IBV_SEND_SIGNALED, + .wr = { + .atomic = { + .remote_addr = ibdev->rcon.vaddr, + .rkey = ibdev->rcon.rkey, + .compare_add = add + } + } + }; + struct ibv_send_wr *badWR; + + errno = 0; + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post fetch and add"); + } + + LStat.s.no_bytes += sizeof(uint64_t); + LStat.s.no_msgs++; + return 1; +} + + +/* + * Post n sends. 
+ */ +static int +ib_post_send(IBDEV *ibdev, int n) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer, + .length = Req.msg_size, + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = WRID_SEND, + .sg_list = &sge, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, + }; + struct ibv_send_wr *badWR; + + if (ibdev->trans == IBV_QPT_UD) { + wr.wr.ud.ah = ibdev->ah; + wr.wr.ud.remote_qpn = ibdev->rcon.qpn; + wr.wr.ud.remote_qkey = QKEY; + } + if (Req.msg_size <= ibdev->maxinline) + wr.send_flags |= IBV_SEND_INLINE; + errno = 0; + while (n-- > 0) { + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post send"); + } + LStat.s.no_bytes += Req.msg_size; + LStat.s.no_msgs++; + } + + return 1; +} + + +/* + * Post n receives. + */ +static int +ib_post_recv(IBDEV *ibdev, int n) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer, + .length = Req.msg_size, + .lkey = ibdev->mr->lkey + }; + struct ibv_recv_wr wr ={ + .wr_id = WRID_RECV, + .sg_list = &sge, + .num_sge = 1, + }; + struct ibv_recv_wr *badWR; + + if (ibdev->trans == IBV_QPT_UD) + sge.length += GRH_SIZE; + + errno = 0; + while (n-- > 0) { + if (ibv_post_recv(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post receive"); + } + } + return 1; +} + + +/* + * Post n RDMA requests. 
+ */ +static int +ib_post_rdma(IBDEV *ibdev, OPCODE opcode, int n) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer, + .length = Req.msg_size, + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = WRID_RDMA, + .sg_list = &sge, + .num_sge = 1, + .opcode = opcode, + .send_flags = IBV_SEND_SIGNALED, + .wr = { + .rdma = { + .remote_addr = ibdev->rcon.vaddr, + .rkey = ibdev->rcon.rkey + } + } + }; + struct ibv_send_wr *badWR; + + if (opcode != IBV_WR_RDMA_READ && Req.msg_size <= ibdev->maxinline) + wr.send_flags |= IBV_SEND_INLINE; + errno = 0; + while (n--) { + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post %s", opcode_name(wr.opcode)); + } + if (opcode != IBV_WR_RDMA_READ) { + LStat.s.no_bytes += Req.msg_size; + LStat.s.no_msgs++; + } + } + return 1; +} + + +/* + * Poll the completion queue. + */ +static int +ib_poll(IBDEV *ibdev, struct ibv_wc *wc, int nwc) +{ + int n; + char *msg; + + if (!Req.poll_mode && !Finished) { + void *ectx; + struct ibv_cq *ecq; + + if (ibv_get_cq_event(ibdev->channel, &ecq, &ectx) != SUCCESS0) + {msg = "failed to get CQ event"; goto err;} + if (ecq != ibdev->cq) + {msg = "CQ event for unknown CQ"; goto err;} + if (ibv_req_notify_cq(ibdev->cq, 0) != SUCCESS0) + {msg = "failed to request CQ notification"; goto err;} + } + n = ibv_poll_cq(ibdev->cq, nwc, wc); + if (n < 0) + {msg = "CQ poll failed"; goto err;} + return n; + +err: + if (Finished && errno == EINTR) + return 0; + syserror(msg); + return -1; +} + + +/* + * Encode a IBCON structure into a data stream. + */ +static void +enc_ibcon(IBCON *host) +{ + enc_int(host->lid, sizeof(host->lid)); + enc_int(host->qpn, sizeof(host->qpn)); + enc_int(host->psn, sizeof(host->psn)); + enc_int(host->rkey, sizeof(host->rkey)); + enc_int(host->vaddr, sizeof(host->vaddr)); +} + + +/* + * Decode a IBCON structure from a data stream. 
+ */ +static void +dec_ibcon(IBCON *host) +{ + host->lid = dec_int(sizeof(host->lid)); + host->qpn = dec_int(sizeof(host->qpn)); + host->psn = dec_int(sizeof(host->psn)); + host->rkey = dec_int(sizeof(host->rkey)); + host->vaddr = dec_int(sizeof(host->vaddr)); +} + + +/* + * Handle a CQ error: count it in *errors and report it through cq_error(). + * The return value is meant to tell the caller whether the error is + * recoverable; this implementation always returns 0, so every caller that + * tests the result bails out. + */ +static int +do_error(int status, uint64_t *errors) +{ + ++*errors; + cq_error(status); + return 0; +} + + +/* + * Print out a CQ error given a status. The status is looked up in CQErrors + * and reported by name together with TestName; a status that is not in the + * table is reported by number. + */ +static void +cq_error(int status) +{ + int i; + + for (i = 0; i < cardof(CQErrors); ++i) { + if (CQErrors[i].value == status) { + error("%s failed: %s", TestName, CQErrors[i].name); + return; + } + } + error("%s failed: CQ error %d", TestName, status); +} + + +/* + * Return the name of an opcode as listed in Opcodes, or the string + * "unknown operation" if the opcode is not in the table. + */ +static char * +opcode_name(int opcode) +{ + int i; + + for (i = 0; i < cardof(Opcodes); ++i) + if (Opcodes[i].value == opcode) + return Opcodes[i].name; + return "unknown operation"; +} diff --git a/ip.c b/ip.c new file mode 100644 index 0000000..8b048de --- /dev/null +++ b/ip.c @@ -0,0 +1,837 @@ +/* + * qperf - handle socket tests. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include "qperf.h" + + +/* + * Parameters. + */ +#define AF_INET_SDP 27 /* Family for SDP */ +#define AF_INET_RDS 30 /* Family for RDS */ + + +/* + * Function prototypes. + */ +static void datagram_client_bw(int domain); +static void datagram_client_init(int *fd, int domain, + struct sockaddr_in *addr); +static void datagram_client_lat(int domain); +static void datagram_server_bw(int domain); +static int datagram_server_init(int *fd, int domain); +static void datagram_server_lat(int domain); +static uint32_t decode_port(uint32_t *p); +static void encode_port(uint32_t *p, uint32_t port); +static void ip_parameters(long msgSize); +static int recv_full(int fd, void *ptr, int len); +static int send_full(int fd, void *ptr, int len); +static int set_socket_buffer_size(int fd); +static void socket_client_bw(int domain); +static void socket_client_init(int *fd, int domain); +static void socket_client_lat(int domain); +static void socket_server_bw(int domain); +static int socket_server_init(int *fd, int domain); +static void socket_server_lat(int domain); + + +/* + * Measure RDS bandwidth (client side). 
+ */ +void +run_client_rds_bw(void) +{ + ip_parameters(8*1024); + datagram_client_bw(AF_INET_RDS); +} + + +/* + * Measure RDS bandwidth (server side): datagram bandwidth server on + * AF_INET_RDS. + */ +void +run_server_rds_bw(void) +{ + datagram_server_bw(AF_INET_RDS); +} + + +/* + * Measure RDS latency (client side): passes 1 as msgSize to ip_parameters, + * then runs the datagram latency client on AF_INET_RDS. + */ +void +run_client_rds_lat(void) +{ + ip_parameters(1); + datagram_client_lat(AF_INET_RDS); +} + + +/* + * Measure RDS latency (server side): datagram latency server on AF_INET_RDS. + */ +void +run_server_rds_lat(void) +{ + datagram_server_lat(AF_INET_RDS); +} + + +/* + * Measure UDP bandwidth (client side): passes 32*1024 as msgSize to + * ip_parameters, then runs the datagram bandwidth client on AF_INET. + */ +void +run_client_udp_bw(void) +{ + ip_parameters(32*1024); + datagram_client_bw(AF_INET); +} + + +/* + * Measure UDP bandwidth (server side): datagram bandwidth server on AF_INET. + */ +void +run_server_udp_bw(void) +{ + datagram_server_bw(AF_INET); +} + + +/* + * Measure UDP latency (client side): passes 1 as msgSize to ip_parameters, + * then runs the datagram latency client on AF_INET. + */ +void +run_client_udp_lat(void) +{ + ip_parameters(1); + datagram_client_lat(AF_INET); +} + + +/* + * Measure UDP latency (server side): datagram latency server on AF_INET. + */ +void +run_server_udp_lat(void) +{ + datagram_server_lat(AF_INET); +} + + +/* + * Measure SDP bandwidth (client side): passes 64*1024 as msgSize to + * ip_parameters, then runs the stream bandwidth client on AF_INET_SDP. + */ +void +run_client_sdp_bw(void) +{ + ip_parameters(64*1024); + socket_client_bw(AF_INET_SDP); +} + + +/* + * Measure SDP bandwidth (server side): stream bandwidth server on + * AF_INET_SDP. 
+ */ +void +run_server_sdp_bw(void) +{ + socket_server_bw(AF_INET_SDP); +} + + +/* + * Measure SDP latency (client side): passes 1 as msgSize to ip_parameters, + * then runs the stream latency client on AF_INET_SDP. + */ +void +run_client_sdp_lat(void) +{ + ip_parameters(1); + socket_client_lat(AF_INET_SDP); +} + + +/* + * Measure SDP latency (server side): stream latency server on AF_INET_SDP. + */ +void +run_server_sdp_lat(void) +{ + socket_server_lat(AF_INET_SDP); +} + + +/* + * Measure TCP bandwidth (client side): passes 64*1024 as msgSize to + * ip_parameters, then runs the stream bandwidth client on AF_INET. + */ +void +run_client_tcp_bw(void) +{ + ip_parameters(64*1024); + socket_client_bw(AF_INET); +} + + +/* + * Measure TCP bandwidth (server side): stream bandwidth server on AF_INET. + */ +void +run_server_tcp_bw(void) +{ + socket_server_bw(AF_INET); +} + + +/* + * Measure TCP latency (client side). 
+ */ +void +run_client_tcp_lat(void) +{ + ip_parameters(1); + socket_client_lat(AF_INET); +} + + +/* + * Measure TCP latency (server side). + */ +void +run_server_tcp_lat(void) +{ + socket_server_lat(AF_INET); +} + + +/* + * Measure socket bandwidth (client side). + */ +static void +socket_client_bw(int domain) +{ + char *buf; + int sockFD; + + socket_client_init(&sockFD, domain); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = send_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(BANDWIDTH); +} + + +/* + * Measure socket bandwidth (server side). + */ +static void +socket_server_bw(int domain) +{ + int sockFD; + char *buf = 0; + + if (!socket_server_init(&sockFD, domain)) + return; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + int n = recv_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +/* + * Measure socket latency (client side). 
+ */ +static void +socket_client_lat(int domain) +{ + char *buf; + int sockFD; + + socket_client_init(&sockFD, domain); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = send_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + + n = recv_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(LATENCY); +} + + +/* + * Measure socket latency (server side). + */ +static void +socket_server_lat(int domain) +{ + int sockFD; + char *buf = 0; + + if (!socket_server_init(&sockFD, domain)) + return; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + int n = recv_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + + n = send_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +/* + * Socket client initialization. 
+ */ +static void +socket_client_init(int *fd, int domain) +{ + uint32_t port; + struct hostent *host; + struct sockaddr_in clientAddr; + struct sockaddr_in serverAddr; + socklen_t clientLen = sizeof(clientAddr); + + client_send_request(); + *fd = socket(domain, SOCK_STREAM, 0); + if (*fd < 0) + syserror_die("socket failed"); + clientAddr.sin_family = AF_INET; + clientAddr.sin_addr.s_addr = htonl(INADDR_ANY); + clientAddr.sin_port = htons(0); + if (bind(*fd, (struct sockaddr *)&clientAddr, clientLen) < 0) + syserror_die("bind failed"); + if (getsockname(*fd, (struct sockaddr *)&clientAddr, &clientLen) < 0) + syserror_die("getsockname failed"); + if (!set_socket_buffer_size(*fd)) + die(); + + host = gethostbyname(ServerName); + if (!host) + error_die("cannot find machine %s", ServerName); + serverAddr.sin_family = AF_INET; + if (host->h_length > sizeof(serverAddr.sin_addr)) + error_die("address too large to handle"); + memcpy(&serverAddr.sin_addr.s_addr, host->h_addr, host->h_length); + if (!recv_mesg(&port, sizeof(port), "port")) + die(); + port = decode_port(&port); + debug("sending from %s port %d to %d", + domain == AF_INET_SDP ? "SDP" : "TCP", + ntohs(clientAddr.sin_port), port); + serverAddr.sin_port = htons(port); + if (connect(*fd, &serverAddr, sizeof(serverAddr)) < 0) + syserror_die("connect failed"); +} + + +/* + * Socket server initialization. 
+ */ +static int +socket_server_init(int *fd, int domain) +{ + uint32_t port; + int listenFD; + struct sockaddr_in addr; + socklen_t len = sizeof(addr); + int stat = 0; + int one = 1; + + listenFD = socket(domain, SOCK_STREAM, 0); + if (listenFD < 0) + return syserror("socket failed"); + if (setsockopt(listenFD, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) + return syserror("failed to reuse address on socket"); + memset(&addr, 0, len); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(Req.port); + if (bind(listenFD, (struct sockaddr *)&addr, len) < 0) { + syserror("bind failed"); + goto err; + } + if (getsockname(listenFD, (struct sockaddr *)&addr, &len) < 0) { + syserror("getsockname failed"); + goto err; + } + port = ntohs(addr.sin_port); + if (listen(listenFD, 1) < 0) { + syserror("listen failed"); + goto err; + } + encode_port(&port, port); + if (!send_mesg(&port, sizeof(port), "port")) + goto err; + len = sizeof(addr); + *fd = accept(listenFD, (struct sockaddr *)&addr, &len); + if (*fd < 0) { + syserror("accept failed"); + goto err; + } + debug("accepted connection"); + if (!set_socket_buffer_size(*fd)) { + close(*fd); + goto err; + } + stat = 1; +err: + close(listenFD); + return stat; +} + + +/* + * Set both the send and receive socket buffer sizes. 
+ */ +static int +set_socket_buffer_size(int fd) +{ + int size = Req.sock_buf_size; + + if (!size) + return 1; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)) < 0) + return syserror("failed to set send buffer size on socket"); + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)) < 0) + return syserror("failed to set receive buffer size on socket"); + return 1; +} + + +static void +datagram_client_bw(int domain) +{ + char *buf; + int sockFD; + struct sockaddr_in serverAddr; + + datagram_client_init(&sockFD, domain, &serverAddr); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = sendto(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&serverAddr, sizeof(serverAddr)); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(BANDWIDTH_SR); +} + + +static void +datagram_server_bw(int domain) +{ + int sockFD; + char *buf = 0; + + if (!datagram_server_init(&sockFD, domain)) + return; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + int n = recv(sockFD, buf, Req.msg_size, 0); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +static void +datagram_client_lat(int domain) +{ + char *buf; + int sockFD; + struct sockaddr_in addr; + + datagram_client_init(&sockFD, domain, &addr); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = sendto(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&addr, sizeof(addr)); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + + n = 
recv(sockFD, buf, Req.msg_size, 0); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(LATENCY); +} + + +/* + * Set default IP parameters and ensure that any that are set are being used. + */ +static void +ip_parameters(long msgSize) +{ + setp_u32(0, L_MSG_SIZE, msgSize); + setp_u32(0, R_MSG_SIZE, msgSize); + par_use(L_PORT); + par_use(R_PORT); + par_use(L_SOCK_BUF_SIZE); + par_use(R_SOCK_BUF_SIZE); + opt_check(); +} + + +static void +datagram_server_lat(int domain) +{ + int sockFD; + char *buf = 0; + + if (!datagram_server_init(&sockFD, domain)) + goto err; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + struct sockaddr_in clientAddr; + socklen_t clientLen = sizeof(clientAddr); + int n = recvfrom(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&clientAddr, &clientLen); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + + n = sendto(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&clientAddr, clientLen); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +/* + * Datagram client initialization. 
+ */ +static void +datagram_client_init(int *fd, int domain, struct sockaddr_in *serverAddr) +{ + uint32_t port; + struct hostent *host; + struct sockaddr_in clientAddr; + socklen_t clientLen = sizeof(clientAddr); + + client_send_request(); + *fd = socket(domain, SOCK_DGRAM, 0); + if (*fd < 0) + syserror_die("socket failed"); + clientAddr.sin_family = AF_INET; + clientAddr.sin_addr.s_addr = htonl(INADDR_ANY); + clientAddr.sin_port = htons(0); + if (bind(*fd, (struct sockaddr *)&clientAddr, clientLen) < 0) + syserror_die("bind failed"); + if (getsockname(*fd, (struct sockaddr *)&clientAddr, &clientLen) < 0) + syserror_die("getsockname failed"); + if (!set_socket_buffer_size(*fd)) + die(); + + host = gethostbyname(ServerName); + if (!host) + error_die("cannot find machine %s", ServerName); + serverAddr->sin_family = AF_INET; + if (host->h_length > sizeof(serverAddr->sin_addr)) + error_die("address too large to handle"); + memcpy(&serverAddr->sin_addr.s_addr, host->h_addr, host->h_length); + if (!recv_mesg(&port, sizeof(port), "port")) + die(); + port = decode_port(&port); + debug("sending from %s port %d to %d", + domain == AF_INET ? "UDP" : "RDS", ntohs(clientAddr.sin_port), port); + serverAddr->sin_port = htons(port); +} + + +/* + * Datagram server initialization. 
+ */ +static int +datagram_server_init(int *fd, int domain) +{ + uint32_t port; + struct sockaddr_in addr; + socklen_t len = sizeof(addr); + + *fd = socket(domain, SOCK_DGRAM, 0); + if (*fd < 0) + return syserror("socket failed"); + memset(&addr, 0, len); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(Req.port); + if (bind(*fd, (struct sockaddr *)&addr, len) < 0) { + syserror("bind failed"); + goto err; + } + if (getsockname(*fd, (struct sockaddr *)&addr, &len) < 0) { + syserror("getsockname failed"); + goto err; + } + if (!set_socket_buffer_size(*fd)) + goto err; + encode_port(&port, ntohs(addr.sin_port)); + if (!send_mesg(&port, sizeof(port), "port")) + goto err; + return 1; + +err: + close(*fd); + return 0; +} + + +/* + * Send a complete message to a socket. A zero byte write indicates an end of + * file which suggests that we are finished. + */ +static int +send_full(int fd, void *ptr, int len) +{ + int n = len; + while (!Finished && n) { + int i = write(fd, ptr, n); + if (i < 0) + return i; + ptr += i; + n -= i; + if (i == 0) + set_finished(); + } + return len-n; +} + + +/* + * Receive a complete message from a socket. A zero byte read indicates an end + * of file which suggests that we are finished. + */ +static int +recv_full(int fd, void *ptr, int len) +{ + int n = len; + while (!Finished && n) { + int i = read(fd, ptr, n); + if (i < 0) + return i; + ptr += i; + n -= i; + if (i == 0) + set_finished(); + } + return len-n; +} + + +/* + * Encode a port which is stored as a 32 bit unsigned. + */ +static void +encode_port(uint32_t *p, uint32_t port) +{ + enc_init(p); + enc_int(port, sizeof(port)); +} + + +/* + * Decode a port which is stored as a 32 bit unsigned. 
+ */ +static uint32_t +decode_port(uint32_t *p) +{ + dec_init(p); + return dec_int(sizeof(uint32_t)); +} diff --git a/mkhelp b/mkhelp new file mode 100755 index 0000000..2e7c8dc --- /dev/null +++ b/mkhelp @@ -0,0 +1,113 @@ +#!/usr/bin/env perl +# +use strict; +use warnings; +use diagnostics; + +my $help_txt = "help.txt"; +my $help_c = "help.c"; +my $top = " +/* + * This was generated from $help_txt. Do not modify directly. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +char *Usage[] ={ +"; +my $end = " + 0, +}; +"; + +sub panic { + print STDERR @_, "\n"; + exit 1; +} + +sub main() { + my %defs; + $defs{$_} = 1 for (@ARGV); + my $iFile; + open($iFile, "<", $help_txt) or + panic("cannot find $help_txt"); + my $str = ""; + my $keep = 1; + while (<$iFile>) { + chomp; + s/\s+$//; + if (/^ /) { + if ($keep) { + s///; + s/(["\\])/\\$1/g; + s/$/\\n/; + if (/^(.{68}(?>[^\\]?))(..*)/) { + $str .= " "x8 . "\"$1\"\n"; + $str .= " "x12 . "\"$2\"\n"; + } else { + $str .= " "x8 . "\"$_\"\n"; + } + } + } else { + my @args = split; + my $arg0 = lc(shift @args); + $keep = 1; + for (@args) { + if (/^\+(.*)/) { + $keep = 0 unless ($defs{$1}); + } elsif (/^-(.*)/) { + $keep = 0 if ($defs{$1}); + } + } + if ($keep) { + if ($str) { + chop $str; + $str .= ",\n"; + } + $str .= " "x4 . "\"$arg0\",\n"; + } + } + } + close $iFile; + if ($str) { + chop $str; + $str .= ",\n"; + } + $top =~ s/^\n//; + $end =~ s/^\n//; + my $oFile; + open($oFile, ">", $help_c) or + panic("cannot create $help_c"); + print $oFile $top, $str, $end; + close $oFile; +} + +main(); diff --git a/qperf.c b/qperf.c new file mode 100644 index 0000000..43f4873 --- /dev/null +++ b/qperf.c @@ -0,0 +1,2913 @@ +/* + * qperf - main. + * Run performance tests over TCP/IP and RDMA. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qperf.h" + + +/* + * Configurable parameters. If your change makes this version of qperf + * incompatible with previous versions (usually a change to the Req structure), + * increment VER_MIN and set VER_INC to 0. Otherwise, just increment VER_INC. + * VER_MAJ is reserved for major changes. 
+ */ +#define VER_MAJ 0 /* Major version */ +#define VER_MIN 2 /* Minor version */ +#define VER_INC 0 /* Incremental version */ +#define LISTENQ 5 /* Size of listen queue */ +#define BUFSIZE 1024 /* Size of buffers */ +#define SYNCMESG "SyN" /* Synchronize message */ +#define SYNCSIZE sizeof(SYNCMESG) /* Size of synchronize message */ + + +/* + * For convenience. + */ +#define with(c) |(c<<8) + + +/* + * Option list. + */ +typedef struct OPTION { + char *name; /* Name of option */ + short server_valid; /* Option valid on server */ + void (*func)(); /* Function to call */ + int arg1; /* First argument */ + int arg2; /* Second argument */ +} OPTION; + + +/* + * Parameter information. + */ +typedef struct PAR_INFO { + PAR_INDEX index; /* Index into parameter table */ + int type; /* Type */ + void *ptr; /* Pointer to value */ + char *name; /* Option name */ + int set; /* Parameter has been set */ + int used; /* Parameter has been used */ + int inuse; /* Parameter is in use */ +} PAR_INFO; + + +/* + * Parameter name association. + */ +typedef struct PAR_NAME { + char *name; /* Name */ + PAR_INDEX loc_i; /* Local index */ + PAR_INDEX rem_i; /* Remote index */ +} PAR_NAME; + + +/* + * Test prototype. + */ +typedef struct TEST { + char *name; /* Test name */ + void (*client)(void); /* Client function */ + void (*server)(void); /* Server function */ +} TEST; + + +/* + * Used to save output data for formatting. + */ +typedef struct SHOW { + char *pref; /* Name prefix */ + char *name; /* Name */ + char *data; /* Data */ + char *unit; /* Unit */ + char *altn; /* Alternative value */ +} SHOW; + + +/* + * Configuration information. + */ +typedef struct CONF { + char node[STRSIZE]; /* Node */ + char cpu[STRSIZE]; /* CPU */ + char os[STRSIZE]; /* Operating System */ + char qperf[STRSIZE]; /* Qperf version */ +} CONF; + + +/* + * Function prototypes. 
/*
 * Function prototypes.  Forward declarations for the functions that are
 * private to this file, in roughly alphabetical order.
 */
static void add_ustat(USTAT *l, USTAT *r);
static long arg_long(char ***argvp);
static long arg_size(char ***argvp);
static char *arg_strn(char ***argvp);
static long arg_time(char ***argvp);
static void bug_die(char *fmt, ...);
static void calc_node(RESN *resn, STAT *stat);
static void calc_results(void);
static void client(TEST *test);
static int cmpsub(char *s2, char *s1);
static char *commify(char *data);
static void dec_req(REQ *host);
static void dec_stat(STAT *host);
static void dec_ustat(USTAT *host);
static void do_args(char *args[]);
static void enc_req(REQ *host);
static void enc_stat(STAT *host);
static void enc_ustat(USTAT *host);
static TEST *find_test(char *name);
static OPTION *find_option(char *name);
static void get_conf(CONF *conf);
static void get_cpu(CONF *conf);
static double get_seconds(void);
static void get_times(CLOCK timex[T_N]);
static void initialize(void);
static void init_lstat(void);
static void init_vars(void);
static int nice_1024(char *pref, char *name, long long value);
static void opt_help(OPTION *option, char ***argvp);
static void opt_misc(OPTION *option, char ***argvp);
static void opt_strn(OPTION *option, char ***argvp);
static void opt_long(OPTION *option, char ***argvp);
static void opt_size(OPTION *option, char ***argvp);
static void opt_time(OPTION *option, char ***argvp);
static void opt_vers(OPTION *option, char ***argvp);
static PAR_INFO *par_info(PAR_INDEX index);
static PAR_INFO *par_set(char *name, PAR_INDEX index);
static int par_isset(PAR_INDEX index);
static void place_any(char *pref, char *name, char *unit, char *data,
                      char *altn);
static void place_show(void);
static void place_val(char *pref, char *name, char *unit, double value);
static char *qasprintf(char *fmt, ...);
static int recv_sync(void);
static void run_client_conf(void);
static void run_client_quit(void);
static void run_server_conf(void);
static void run_server_quit(void);
static int send_recv_mesg(int sr, char *item, int fd, char *buf, int len);
static int send_sync(void);
static void server(void);
static void server_listen(void);
static int server_recv_request(void);
static void set_affinity(void);
static int set_nonblock(int fd);
static void set_signals(void);
static void show_debug(void);
static void show_info(MEASURE measure);
static void show_rest(void);
static void show_used(void);
static void sig_alrm(int signo, siginfo_t *siginfo, void *ucontext);
static char *skip_colon(char *s);
static void start_timing(int seconds);
static void strncopy(char *d, char *s, int n);
static int verbose(int type, double value);
static void view_band(int type, char *pref, char *name, double value);
static void view_cost(int type, char *pref, char *name, double value);
static void view_cpus(int type, char *pref, char *name, double value);
static void view_rate(int type, char *pref, char *name, double value);
static void view_long(int type, char *pref, char *name, long long value);
static void view_size(int type, char *pref, char *name, long long value);
static void view_strn(int type, char *pref, char *name, char *value);
static void view_time(int type, char *pref, char *name, double value);


/*
 * Configurable variables.  These hold the built-in defaults and can be
 * overridden from the command line: --listen_port sets ListenPort,
 * --precision sets Precision and --server_timeout sets ServerTimeout.
 */
static int ListenPort = 19765;
static int Precision = 3;
static int ServerTimeout = 5;


/*
 * Static variables.  Debug, UnifyUnits, UnifyNodes, Wait and the Verbose*
 * levels are set by the miscellaneous command line options; the Verbose*
 * levels are 1 for the plain form of an option and 2 for the "more" form.
 * ProcStatFD is the descriptor opened on /proc/stat at start up.
 */
static REQ RReq;
static int Debug;
static uint8_t *DecodePtr;
static int ExitStatus;
static uint8_t *EncodePtr;
static STAT IStat;
static int ListenFD;
static int ProcStatFD;
static int RemoteFD;
static STAT RStat;
static int ShowIndex;
static SHOW ShowTable[256];
static int UnifyUnits;
static int UnifyNodes;
static int VerboseConf;
static int VerboseStat;
static int VerboseTime;
static int VerboseUsed;
static int Wait;
/*
 * Global variables.  These are shared with the other source files; Finished
 * is volatile because it is raised from the signal handler while the test
 * loops are polling it.
 */
RES Res;
REQ Req;
STAT LStat;
char *TestName;
char *ServerName;
int Successful;
volatile int Finished;


/*
 * Parameter names.  This is used to print out the names of the parameters that
 * have been set.  Each entry ties a name to the indices of its local and
 * remote parameter.
 * NOTE(review): there is no entry for the rate parameter (L_RATE/R_RATE)
 * although it appears in ParInfo below; confirm whether that is intended.
 */
PAR_NAME ParName[] ={
    { "access_recv",    L_ACCESS_RECV,    R_ACCESS_RECV   },
    { "affinity",       L_AFFINITY,       R_AFFINITY      },
    { "flip",           L_FLIP,           R_FLIP          },
    { "id",             L_ID,             R_ID            },
    { "msg_size",       L_MSG_SIZE,       R_MSG_SIZE      },
    { "mtu_size",       L_MTU_SIZE,       R_MTU_SIZE      },
    { "no_msgs",        L_NO_MSGS,        R_NO_MSGS       },
    { "poll_mode",      L_POLL_MODE,      R_POLL_MODE     },
    { "port",           L_PORT,           R_PORT          },
    { "rd_atomic",      L_RD_ATOMIC,      R_RD_ATOMIC     },
    { "sock_buf_size",  L_SOCK_BUF_SIZE,  R_SOCK_BUF_SIZE },
    { "time",           L_TIME,           R_TIME          },
    { "timeout",        L_TIMEOUT,        R_TIMEOUT       },
};


/*
 * Parameters.  These must be listed in the same order as the indices are
 * defined; init_vars() verifies this at start up.  Each entry gives the
 * index, a type letter and the location of the value: local parameters live
 * in Req and remote parameters in RReq.
 */
PAR_INFO ParInfo[P_N] ={
    { P_NULL,                                       },
    { L_ACCESS_RECV,    'l',  &Req.access_recv      },
    { R_ACCESS_RECV,    'l',  &RReq.access_recv     },
    { L_AFFINITY,       'l',  &Req.affinity         },
    { R_AFFINITY,       'l',  &RReq.affinity        },
    { L_FLIP,           'l',  &Req.flip             },
    { R_FLIP,           'l',  &RReq.flip            },
    { L_ID,             'p',  &Req.id               },
    { R_ID,             'p',  &RReq.id              },
    { L_MSG_SIZE,       's',  &Req.msg_size         },
    { R_MSG_SIZE,       's',  &RReq.msg_size        },
    { L_MTU_SIZE,       's',  &Req.mtu_size         },
    { R_MTU_SIZE,       's',  &RReq.mtu_size        },
    { L_NO_MSGS,        'l',  &Req.no_msgs          },
    { R_NO_MSGS,        'l',  &RReq.no_msgs         },
    { L_POLL_MODE,      'l',  &Req.poll_mode        },
    { R_POLL_MODE,      'l',  &RReq.poll_mode       },
    { L_PORT,           'l',  &Req.port             },
    { R_PORT,           'l',  &RReq.port            },
    { L_RATE,           'p',  &Req.rate             },
    { R_RATE,           'p',  &RReq.rate            },
    { L_RD_ATOMIC,      'l',  &Req.rd_atomic        },
    { R_RD_ATOMIC,      'l',  &RReq.rd_atomic       },
    { L_SOCK_BUF_SIZE,  's',  &Req.sock_buf_size    },
    { R_SOCK_BUF_SIZE,  's',  &RReq.sock_buf_size   },
    { L_TIME,           't',  &Req.time             },
    { R_TIME,           't',  &RReq.time            },
    { L_TIMEOUT,        't',  &Req.timeout          },
    { R_TIMEOUT,        't',  &RReq.timeout         },
};
+ */ +OPTION Options[] ={ + { "--access_recv", 0, &opt_long, L_ACCESS_RECV, R_ACCESS_RECV }, + { "-Ar", 0, &opt_long, L_ACCESS_RECV, R_ACCESS_RECV }, + { "--affinity", 0, &opt_long, L_AFFINITY, R_AFFINITY }, + { "-a", 0, &opt_long, L_AFFINITY, R_AFFINITY }, + { "--loc_affinity", 0, &opt_long, L_AFFINITY, }, + { "-la", 0, &opt_long, L_AFFINITY, }, + { "--rem_affinity", 0, &opt_long, R_AFFINITY }, + { "-ra", 0, &opt_long, R_AFFINITY }, + { "--debug", 1, &opt_misc, 'D', }, + { "-D", 1, &opt_misc, 'D', }, + { "--flip", 0, &opt_long, L_FLIP, R_FLIP }, + { "-f", 0, &opt_long, L_FLIP, R_FLIP }, + { "--help", 0, &opt_help }, + { "-h", 0, &opt_help }, + { "--host", 0, &opt_misc, 'H', }, + { "-H", 0, &opt_misc, 'H', }, + { "--id", 0, &opt_strn, L_ID, R_ID }, + { "-i", 0, &opt_strn, L_ID, R_ID }, + { "--loc_id", 0, &opt_strn, L_ID, }, + { "-li", 0, &opt_strn, L_ID, }, + { "--rem_id", 0, &opt_strn, R_ID }, + { "-ri", 0, &opt_strn, R_ID }, + { "--listen_port", 1, &opt_misc, 'l','p' }, + { "-lp", 1, &opt_misc, 'l','p' }, + { "--msg_size", 0, &opt_size, L_MSG_SIZE, R_MSG_SIZE }, + { "-m", 0, &opt_size, L_MSG_SIZE, R_MSG_SIZE }, + { "--mtu_size", 0, &opt_size, L_MTU_SIZE, R_MTU_SIZE }, + { "-M", 0, &opt_size, L_MTU_SIZE, R_MTU_SIZE }, + { "--no_msgs", 0, &opt_long, L_NO_MSGS, R_NO_MSGS }, + { "-n", 0, &opt_long, L_NO_MSGS, R_NO_MSGS }, + { "--poll", 0, &opt_long, L_POLL_MODE, R_POLL_MODE }, + { "-P", 0, &opt_long, L_POLL_MODE, R_POLL_MODE }, + { "--loc_poll", 0, &opt_long, L_POLL_MODE, }, + { "-lP", 0, &opt_long, L_POLL_MODE, }, + { "--rem_poll", 0, &opt_long, R_POLL_MODE }, + { "-rP", 0, &opt_long, R_POLL_MODE }, + { "--port", 0, &opt_long, L_PORT, R_PORT }, + { "-p", 0, &opt_long, L_PORT, R_PORT }, + { "--precision", 0, &opt_misc, 'e', }, + { "-e", 0, &opt_misc, 'e', }, + { "--rate", 0, &opt_strn, L_RATE, R_RATE }, + { "-r", 0, &opt_strn, L_RATE, R_RATE }, + { "--loc_rate", 0, &opt_strn, L_RATE }, + { "-lr", 0, &opt_strn, L_RATE }, + { "--rem_rate", 0, &opt_strn, R_RATE }, + { 
"-rr", 0, &opt_strn, R_RATE }, + { "-rd_atomic", 0, &opt_long, L_RD_ATOMIC, R_RD_ATOMIC }, + { "-R", 0, &opt_long, L_RD_ATOMIC, R_RD_ATOMIC }, + { "--loc_rd_atomic", 0, &opt_long, L_RD_ATOMIC, }, + { "-lR", 0, &opt_long, L_RD_ATOMIC, }, + { "--rem_rd_atomic", 0, &opt_long, R_RD_ATOMIC }, + { "-rR", 0, &opt_long, R_RD_ATOMIC }, + { "--sock_buf_size", 0, &opt_size, L_SOCK_BUF_SIZE, R_SOCK_BUF_SIZE }, + { "-S", 0, &opt_size, L_SOCK_BUF_SIZE, R_SOCK_BUF_SIZE }, + { "--loc_sock_buf_size", 0, &opt_size, L_SOCK_BUF_SIZE }, + { "-lS", 0, &opt_size, L_SOCK_BUF_SIZE }, + { "--rem_sock_buf_size", 0, &opt_size, R_SOCK_BUF_SIZE }, + { "-rS", 0, &opt_size, R_SOCK_BUF_SIZE }, + { "--time", 0, &opt_time, L_TIME, R_TIME }, + { "-t", 0, &opt_time, L_TIME, R_TIME }, + { "--timeout", 0, &opt_time, L_TIMEOUT, R_TIMEOUT }, + { "-T", 0, &opt_time, L_TIMEOUT, R_TIMEOUT }, + { "--loc_timeout", 0, &opt_time, L_TIMEOUT }, + { "-lT", 0, &opt_time, L_TIMEOUT }, + { "--rem_timeout", 0, &opt_time, R_TIMEOUT }, + { "-rT", 0, &opt_time, R_TIMEOUT }, + { "--server_timeout", 0, &opt_misc, 's', 't' }, + { "-st", 0, &opt_misc, 's', 't' }, + { "--unify_nodes", 0, &opt_misc, 'U' }, + { "-U", 0, &opt_misc, 'U' }, + { "--unify_units", 0, &opt_misc, 'u' }, + { "-u", 0, &opt_misc, 'u' }, + { "--verbose", 0, &opt_misc, 'v' }, + { "-v", 0, &opt_misc, 'v' }, + { "--verbose_conf", 0, &opt_misc, 'v', 'c' }, + { "-vc", 0, &opt_misc, 'v', 'c' }, + { "--verbose_stat", 0, &opt_misc, 'v', 's' }, + { "-vs", 0, &opt_misc, 'v', 's' }, + { "--verbose_time", 0, &opt_misc, 'v', 't' }, + { "-vt", 0, &opt_misc, 'v', 't' }, + { "--verbose_used", 0, &opt_misc, 'v', 'u' }, + { "-vu", 0, &opt_misc, 'v', 'u' }, + { "--verbose_more", 0, &opt_misc, 'v', 'v' }, + { "-vv", 0, &opt_misc, 'v', 'v' }, + { "--verbose_more_conf", 0, &opt_misc, 'v', 'c' }, + { "-vC", 0, &opt_misc, 'v', 'C' }, + { "--verbose_more_stat", 0, &opt_misc, 'v', 's' }, + { "-vS", 0, &opt_misc, 'v', 'S' }, + { "--verbose_more_time", 0, &opt_misc, 'v', 't' }, + { 
"-vT", 0, &opt_misc, 'v', 'T' }, + { "--verbose_more_used", 0, &opt_misc, 'v', 'u' }, + { "-vU", 0, &opt_misc, 'v', 'U' }, + { "--version", 0, &opt_vers, }, + { "-V", 0, &opt_vers, }, + { "--wait", 0, &opt_misc, 'W', }, + { "-W", 0, &opt_misc, 'W', }, +}; + + +/* + * Tests. + */ +#define test(n) { #n, run_client_##n, run_server_##n } +TEST Tests[] ={ + test(conf), + test(quit), + test(rds_bw), + test(rds_lat), + test(sdp_bw), + test(sdp_lat), + test(tcp_bw), + test(tcp_lat), + test(udp_bw), + test(udp_lat), +#ifdef RDMA + test(rc_bi_bw), + test(rc_bw), + test(rc_compare_swap_mr), + test(rc_fetch_add_mr), + test(rc_lat), + test(rc_rdma_read_bw), + test(rc_rdma_read_lat), + test(rc_rdma_write_bw), + test(rc_rdma_write_lat), + test(rc_rdma_write_poll_lat), + test(uc_bi_bw), + test(uc_bw), + test(uc_lat), + test(uc_rdma_write_bw), + test(uc_rdma_write_lat), + test(uc_rdma_write_poll_lat), + test(ud_bi_bw), + test(ud_bw), + test(ud_lat), + test(ver_rc_compare_swap), + test(ver_rc_fetch_add), +#endif +}; + + +int +main(int argc, char *argv[]) +{ + initialize(); + set_signals(); + do_args(&argv[1]); + return ExitStatus; +} + + +/* + * Initialize. + */ +static void +initialize(void) +{ + init_vars(); +} + + +/* + * Initialize variables. + */ +static void +init_vars(void) +{ + int i; + + for (i = 0; i < P_N; ++i) + if (ParInfo[i].index != i) + bug_die("initialize: ParInfo: out of order: %d", i); + ProcStatFD = open("/proc/stat", 0); + if (ProcStatFD < 0) + syserror_die("Cannot open /proc/stat"); + IStat.no_cpus = sysconf(_SC_NPROCESSORS_ONLN); + IStat.no_ticks = sysconf(_SC_CLK_TCK); +} + + +/* + * Look for a colon and skip past it and any spaces. + */ +static char * +skip_colon(char *s) +{ + for (;;) { + int c = *s++; + if (c == ':') + break; + if (c == '\0') + return 0; + } + while (*s == ' ') + s++; + return s; +} + + +/* + * A case insensitive string compare. s2 must at least contain all of s1 but + * can be longer. 
+ */ +static int +cmpsub(char *s2, char *s1) +{ + for (;;) { + int c1 = *s1++; + int c2 = *s2++; + if (c1 == '\0') + return 1; + if (c2 == '\0') + return 0; + if (tolower(c1) != tolower(c2)) + return 0; + } +} + + +/* + * Set up signal handlers. + */ +static void +set_signals(void) +{ + struct sigaction alrm ={ .sa_sigaction = sig_alrm }; + sigaction(SIGALRM, &alrm, 0); + sigaction(SIGPIPE, &alrm, 0); +} + + +/* + * Note that time is up. + */ +static void +sig_alrm(int signo, siginfo_t *siginfo, void *ucontext) +{ + set_finished(); +} + + +/* + * Parse arguments. + */ +static void +do_args(char *args[]) +{ + int isClient = 0; + int testSpecified = 0; + + while (*args) { + char *arg = *args; + if (arg[0] == '-') { + OPTION *option = find_option(arg); + if (!option) + error_die("%s: bad option; try qperf --help", arg); + if (!option->server_valid) + isClient = 1; + option->func(option, &args); + } else { + isClient = 1; + if (!ServerName) + ServerName = arg; + else { + TEST *p = find_test(arg); + if (!p) + error_die("%s: bad test; try qperf --help", arg); + client(p); + testSpecified = 1; + } + ++args; + } + } + if (!isClient) + server(); + else if (!testSpecified) { + if (!ServerName) + error_die("You used a client only option but did not specify the " + "server name.\nDo you want to be a client or server?"); + if (find_test(ServerName)) + error_die("Must specify host name first; try qperf --help"); + error_die("Must specify a test type; try qperf --help"); + } +} + + +/* + * Given the name of an option, find it. + */ +static OPTION * +find_option(char *name) +{ + int n = cardof(Options); + OPTION *p = Options; + for (; n--; ++p) + if (streq(name, p->name)) + return p; + return 0; +} + + +/* + * Given the name of a test, find it. + */ +static TEST * +find_test(char *name) +{ + int n = cardof(Tests); + TEST *p = Tests; + for (; n--; ++p) + if (streq(name, p->name)) + return p; + return 0; +} + + +/* + * Print out a help message. 
+ */ +static void +opt_help(OPTION *option, char ***argvp) +{ + char **usage; + char *category = (*argvp)[1]; + + if (!category) + category = "main"; + for (usage = Usage; *usage; usage += 2) + if (streq(*usage, category)) + break; + if (!*usage) + error_die("Cannot find help category %s; try: qperf --help"); + printf("%s", usage[1]); + exit(0); +} + + +/* + * Handle options requiring a long argument. + */ +static void +opt_long(OPTION *option, char ***argvp) +{ + long l = arg_long(argvp); + setp_u32(option->name, option->arg1, l); + setp_u32(option->name, option->arg2, l); +} + + +/* + * Handle miscellaneous options. + */ +static void +opt_misc(OPTION *option, char ***argvp) +{ + switch (option->arg1 with (option->arg2)) { + case 'e': + Precision = arg_long(argvp); + return; + case 'u': + UnifyUnits = 1; + break; + case 'v': + VerboseConf = 1; + VerboseStat = 1; + VerboseTime = 1; + VerboseUsed = 1; + break; + case 'D': + Debug = 1; + break; + case 'H': + ServerName = arg_strn(argvp); + return; + case 'U': + UnifyNodes = 1; + break; + case 'W': + Wait = arg_time(argvp); + return; + case ('l') with ('p'): + ListenPort = arg_long(argvp); + return; + case ('s') with ('t'): + ServerTimeout = arg_time(argvp); + return; + case ('v') with ('c'): + VerboseConf = 1; + break; + case ('v') with ('s'): + VerboseStat = 1; + break; + case ('v') with ('t'): + VerboseTime = 1; + break; + case ('v') with ('u'): + VerboseUsed = 1; + break; + case ('v') with ('v'): + VerboseConf = 2; + VerboseStat = 2; + VerboseTime = 2; + VerboseUsed = 2; + break; + case ('v') with ('C'): + VerboseConf = 2; + break; + case ('v') with ('S'): + VerboseStat = 2; + break; + case ('v') with ('T'): + VerboseTime = 2; + break; + case ('v') with ('U'): + VerboseUsed = 2; + break; + default: + bug_die("opt_misc: unknown argument: %s", option->name); + } + *argvp += 1; +} + + +/* + * Handle options requiring a size argument. 
+ */ +static void +opt_size(OPTION *option, char ***argvp) +{ + long l = arg_size(argvp); + setp_u32(option->name, option->arg1, l); + setp_u32(option->name, option->arg2, l); +} + + +/* + * Handle options requiring a string argument. + */ +static void +opt_strn(OPTION *option, char ***argvp) +{ + char *s = arg_strn(argvp); + setp_str(option->name, option->arg1, s); + setp_str(option->name, option->arg2, s); +} + + +/* + * Handle options requiring a time argument. + */ +static void +opt_time(OPTION *option, char ***argvp) +{ + long l = arg_time(argvp); + setp_u32(option->name, option->arg1, l); + setp_u32(option->name, option->arg2, l); +} + + +/* + * Print out our current version. + */ +static void +opt_vers(OPTION *option, char ***argvp) +{ + printf("qperf %d.%d.%d\n", VER_MAJ, VER_MIN, VER_INC); + exit(0); +} + + +/* + * If any options were set but were not used, print out a warning message for + * the user. + */ +void +opt_check(void) +{ + PAR_INFO *p; + PAR_INFO *q; + PAR_INFO *r = endof(ParInfo); + + for (p = ParInfo; p < r; ++p) { + if (p->used || !p->set) + continue; + error("warning: %s set but not used in test %s", p->name, TestName); + for (q = p+1; q < r; ++q) + if (q->set && q->name == p->name) + q->set = 0; + } +} + + +/* + * Return the value of a long argument. It must be non-negative. + */ +static long +arg_long(char ***argvp) +{ + char **argv = *argvp; + char *p; + long l; + + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + l = strtol(argv[1], &p, 10); + if (p[0] != '\0') + error_die("Bad argument: %s", argv[1]); + if (l < 0) + error_die("%s requires a non-negative number", argv[0]); + *argvp += 2; + return l; +} + + +/* + * Return the value of a size argument. 
+ */ +static long +arg_size(char ***argvp) +{ + char *p; + long double d; + long l = 0; + char **argv = *argvp; + + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + d = strtold(argv[1], &p); + if (d < 0) + error_die("%s requires a non-negative number", argv[0]); + + if (p[0] == '\0') + l = d; + else { + if (streq(p, "kb") || streq(p, "k")) + l = (long)(d * (1000)); + else if (streq(p, "mb") || streq(p, "m")) + l = (long)(d * (1000 * 1000)); + else if (streq(p, "gb") || streq(p, "g")) + l = (long)(d * (1000 * 1000 * 1000)); + else if (streq(p, "kib") || streq(p, "K")) + l = (long)(d * (1024)); + else if (streq(p, "mib") || streq(p, "M")) + l = (long)(d * (1024 * 1024)); + else if (streq(p, "gib") || streq(p, "G")) + l = (long)(d * (1024 * 1024 * 1024)); + else + error_die("Bad argument: %s", argv[1]); + } + *argvp += 2; + return l; +} + + +/* + * Return the value of a string argument. + */ +static char * +arg_strn(char ***argvp) +{ + char **argv = *argvp; + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + *argvp += 2; + return argv[1]; +} + + +/* + * Return the value of a size argument. + */ +static long +arg_time(char ***argvp) +{ + char *p; + long double d; + + long l = 0; + char **argv = *argvp; + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + d = strtold(argv[1], &p); + if (d < 0) + error_die("%s requires a non-negative number", argv[0]); + + if (p[0] == '\0') + l = (long)d; + else { + int u = *p; + if (p[1] != '\0') + error_die("Bad argument: %s", argv[1]); + if (u == 's' || u == 'S') + l = (long)d; + else if (u == 'm' || u == 'M') + l = (long)(d * (60)); + else if (u == 'h' || u == 'H') + l = (long)(d * (60 * 60)); + else if (u == 'd' || u == 'D') + l = (long)(d * (60 * 60 * 24)); + else + error_die("Bad argument: %s", argv[1]); + } + *argvp += 2; + return l; +} + + +/* + * Set a value stored in a 32 bit value without letting anyone know we set it. 
+ */ +void +setv_u32(PAR_INDEX index, uint32_t l) +{ + PAR_INFO *p = par_info(index); + *((uint32_t *)p->ptr) = l; +} + + +/* + * Set an option stored in a 32 bit value. + */ +void +setp_u32(char *name, PAR_INDEX index, uint32_t l) +{ + PAR_INFO *p = par_set(name, index); + if (!p) + return; + *((uint32_t *)p->ptr) = l; +} + + +/* + * Set an option stored in a string vector. + */ +void +setp_str(char *name, PAR_INDEX index, char *s) +{ + PAR_INFO *p = par_set(name, index); + if (!p) + return; + if (strlen(s) >= STRSIZE) + error_die("%s: too long", s); + strcpy(p->ptr, s); +} + + +/* + * Note a parameter as being used. + */ +void +par_use(PAR_INDEX index) +{ + PAR_INFO *p = par_info(index); + p->used = 1; + p->inuse = 1; +} + + +/* + * Set the PAR_INFO.name value. + */ +static PAR_INFO * +par_set(char *name, PAR_INDEX index) +{ + PAR_INFO *p = par_info(index); + if (index == P_NULL) + return 0; + if (name) { + p->name = name; + p->set = 1; + } else { + p->used = 1; + p->inuse = 1; + if (p->name) + return 0; + } + return p; +} + + +/* + * Determine if a parameter is set. + */ +static int +par_isset(PAR_INDEX index) +{ + return par_info(index)->name != 0; +} + + +/* + * Index the ParInfo table. + */ +static PAR_INFO * +par_info(PAR_INDEX index) +{ + PAR_INFO *p = &ParInfo[index]; + + if (index != p->index) + bug_die("par_info: table out of order: %d != %d", index, p-index); + return p; +} + + +/* + * Server. 
+ */ +static void +server(void) +{ + pid_t pid; + + server_listen(); + for (;;) { + TEST *test; + + debug("waiting for request"); + if (!server_recv_request()) + continue; + if (Req.ver_maj != VER_MAJ || Req.ver_min != VER_MIN) { + int h_maj = Req.ver_maj; + int h_min = Req.ver_min; + int h_inc = Req.ver_inc; + int l_maj = VER_MAJ; + int l_min = VER_MIN; + int l_inc = VER_INC; + char *msg = "upgrade %s from %d.%d.%d to %d.%d.%d"; + char *low = "client"; + + if (l_maj > h_maj || (l_maj == h_maj && l_min > h_min)) { + h_maj = VER_MAJ; + h_min = VER_MIN; + h_inc = VER_INC; + l_maj = Req.ver_maj; + l_min = Req.ver_min; + l_inc = Req.ver_inc; + low = "server"; + } + error(msg, low, l_maj, l_min, l_inc, h_maj, h_min, h_inc); + continue; + } + if (Req.req_index >= cardof(Tests)) { + error("server: bad request index: %d", Req.req_index); + continue; + } + test = &Tests[Req.req_index]; + TestName = test->name; + debug("request is %s", TestName); + pid = fork(); + if (pid == 0) { + init_lstat(); + Finished = 0; + Successful = 0; + set_affinity(); + (test->server)(); + stop_timing(); + exit(0); + } else + waitpid(pid, 0, 0); + close(RemoteFD); + } + close(ListenFD); +} + + +/* + * Listen for any requests. 
+ */ +static void +server_listen(void) +{ + int stat; + char *service; + struct addrinfo *r; + struct addrinfo *res; + struct addrinfo hints ={ + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + + service = qasprintf("%d", ListenPort); + stat = getaddrinfo(0, service, &hints, &res); + if (stat != SUCCESS0) + error_die("getaddrinfo failed: %s", gai_strerror(stat)); + free(service); + + ListenFD = -1; + for (r = res; r; r = r->ai_next) { + ListenFD = socket(r->ai_family, r->ai_socktype, r->ai_protocol); + if (ListenFD >= 0) { + int one = 1; + stat = setsockopt(ListenFD, SOL_SOCKET, SO_REUSEADDR, + &one, sizeof(one)); + if (stat < 0) + syserror_die("setsockopt failed"); + if (bind(ListenFD, r->ai_addr, r->ai_addrlen) == SUCCESS0) + break; + close(ListenFD); + ListenFD = -1; + } + } + freeaddrinfo(res); + if (ListenFD < 0) + error_die("Unable to bind to listen port"); + + Req.timeout = ServerTimeout; + if (listen(ListenFD, LISTENQ) < 0) + syserror_die("listen failed"); +} + + +/* + * Accept a request from a client. + */ +static int +server_recv_request(void) +{ + REQ req; + socklen_t clientLen; + struct sockaddr_in clientAddr; + + clientLen = sizeof(clientAddr); + RemoteFD = accept(ListenFD, (struct sockaddr *)&clientAddr, &clientLen); + if (RemoteFD < 0) + return syserror("accept failed"); + if (!set_nonblock(RemoteFD)) + goto err; + if (!recv_mesg(&req, sizeof(req), "request data")) + goto err; + dec_init(&req); + dec_req(&Req); + return 1; + +err: + close(RemoteFD); + return 0; +} + + +/* + * Client. 
+ */ +static void +client(TEST *test) +{ + int i; + + for (i = 0; i < P_N; ++i) + ParInfo[i].inuse = 0; + if (!par_isset(L_NO_MSGS)) + setp_u32(0, L_TIME, 2); + if (!par_isset(R_NO_MSGS)) + setp_u32(0, R_TIME, 2); + setp_u32(0, L_TIMEOUT, 5); + setp_u32(0, R_TIMEOUT, 5); + par_use(L_AFFINITY); + par_use(R_AFFINITY); + par_use(L_TIME); + par_use(R_TIME); + + set_affinity(); + RReq.ver_maj = VER_MAJ; + RReq.ver_min = VER_MIN; + RReq.ver_inc = VER_INC; + RReq.req_index = test - Tests; + TestName = test->name; + debug("sending request %s", TestName); + init_lstat(); + printf("%s:\n", TestName); + Finished = 0; + Successful = 0; + (*test->client)(); + close(RemoteFD); + if (!Successful) + ExitStatus = 1; + place_show(); +} + + +/* + * Send a request to the server. + */ +void +client_send_request(void) +{ + REQ req; + int stat; + char *service; + struct addrinfo *r; + struct addrinfo *res; + struct addrinfo hints ={ + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + + service = qasprintf("%d", ListenPort); + stat = getaddrinfo(ServerName, service, &hints, &res); + if (stat != SUCCESS0) + error_die("getaddrinfo failed: %s", gai_strerror(stat)); + free(service); + + RemoteFD = -1; + if (Wait) + start_timing(Wait); + for (;;) { + for (r = res; r; r = r->ai_next) { + RemoteFD = socket(r->ai_family, r->ai_socktype, r->ai_protocol); + if (RemoteFD >= 0) { + if (connect(RemoteFD, r->ai_addr, r->ai_addrlen) == SUCCESS0) + break; + close(RemoteFD); + RemoteFD = -1; + } + } + if (RemoteFD >= 0 || !Wait || Finished) + break; + sleep(1); + } + if (Wait) + stop_timing(); + freeaddrinfo(res); + if (RemoteFD < 0) + error_die("Failed to connect"); + if (!set_nonblock(RemoteFD)) + die(); + enc_init(&req); + enc_req(&RReq); + if (!send_mesg(&req, sizeof(req), "request data")) + die(); +} + + +/* + * Set a file descriptor to non-blocking. 
+ */ +static int +set_nonblock(int fd) +{ + int one = 1; + if (ioctl(fd, FIONBIO, &one) < 0) + return syserror("failed to set to non-blocking"); + return 1; +} + + +/* + * Synchronize the client and server. + */ +int +synchronize(void) +{ + if (is_client()) { + if (!send_sync()) + return 0; + if (!recv_sync()) + return 0; + } else { + if (!recv_sync()) + return 0; + if (!send_sync()) + return 0; + } + debug("sync completed"); + start_timing(Req.time); + return 1; +} + + +/* + * Exchange results. We sync up only to ensure that the client is out of its + * loop so we can close our socket or whatever communication medium we are + * using. + */ +void +exchange_results(void) +{ + STAT stat; + + if (!Successful) + return; + Successful = 0; + if (is_client()) { + if (!recv_mesg(&stat, sizeof(stat), "results")) + return; + dec_init(&stat); + dec_stat(&RStat); + if (!send_sync()) + return; + } else { + enc_init(&stat); + enc_stat(&LStat); + if (!send_mesg(&stat, sizeof(stat), "results")) + return; + if (!recv_sync()) + return; + } + Successful = 1; +} + + +/* + * Send a synchronize message. + */ +static int +send_sync(void) +{ + return send_mesg(SYNCMESG, SYNCSIZE, "sync"); +} + + +/* + * Receive a synchronize message. + */ +static int +recv_sync(void) +{ + char data[SYNCSIZE]; + + if (!recv_mesg(data, sizeof(data), "sync")) + return 0; + if (memcmp(data, SYNCMESG, SYNCSIZE) != SUCCESS0) + return error("sync failure: data does not match"); + return 1; +} + + +/* + * Send a message to the client. + */ +int +send_mesg(void *ptr, int len, char *item) +{ + debug("sending %s", item); + return send_recv_mesg('s', item, RemoteFD, ptr, len); +} + + +/* + * Receive a response from the server. + */ +int +recv_mesg(void *ptr, int len, char *item) +{ + debug("waiting for %s", item); + return send_recv_mesg('r', item, RemoteFD, ptr, len); +} + + +/* + * Send or receive a message to a file descriptor timing out after a certain + * amount of time. 
+ */ +static int +send_recv_mesg(int sr, char *item, int fd, char *buf, int len) +{ + typedef ssize_t (IO)(int fd, void *buf, size_t count); + double etime; + fd_set *fdset; + fd_set rfdset; + fd_set wfdset; + char *action; + IO *func; + + if (sr == 'r') { + func = (IO *)read; + fdset = &rfdset; + action = "receive"; + } else { + func = (IO *)write; + fdset = &wfdset; + action = "send"; + } + + etime = get_seconds() + Req.timeout; + while (len) { + int n; + double time; + struct timeval timeval; + + errno = 0; + time = etime - get_seconds(); + if (time <= 0) + return error("failed to %s %s: timed out", action, item); + n = time += 1.0 / (1000*1000); + timeval.tv_sec = n; + timeval.tv_usec = (time-n) * 1000*1000; + + FD_ZERO(&rfdset); + FD_ZERO(&wfdset); + FD_SET(fd, fdset); + if (select(fd+1, &rfdset, &wfdset, 0, &timeval) < 0) + return syserror("failed to %s %s: select failed", action, item); + if (!FD_ISSET(fd, fdset)) + continue; + n = func(fd, buf, len); + if (n < 0) + return syserror("failed to %s %s", action, item); + if (n == 0) { + char *side = is_client() ? "server" : "client"; + return syserror("failed to %s %s: %s not responding", + action, item, side); + } + len -= n; + } + return 1; +} + + +/* + * Initialize local status information. + */ +static void +init_lstat(void) +{ + memcpy(&LStat, &IStat, sizeof(LStat)); +} + + +/* + * Show configuration (client side). + */ +static void +run_client_conf(void) +{ + CONF lconf; + CONF rconf; + + client_send_request(); + if (!recv_mesg(&rconf, sizeof(rconf), "configuration")) + return; + get_conf(&lconf); + view_strn('a', "", "loc_node", lconf.node); + view_strn('a', "", "loc_cpu", lconf.cpu); + view_strn('a', "", "loc_os", lconf.os); + view_strn('a', "", "loc_qperf", lconf.qperf); + view_strn('a', "", "rem_node", rconf.node); + view_strn('a', "", "rem_cpu", rconf.cpu); + view_strn('a', "", "rem_os", rconf.os); + view_strn('a', "", "rem_qperf", rconf.qperf); +} + + +/* + * Show configuration (server side). 
+ */ +static void +run_server_conf(void) +{ + CONF conf; + get_conf(&conf); + send_mesg(&conf, sizeof(conf), "configuration"); +} + + +/* + * Get configuration. + */ +static void +get_conf(CONF *conf) +{ + struct utsname utsname; + + uname(&utsname); + strncopy(conf->node, utsname.nodename, sizeof(conf->node)); + snprintf(conf->os, sizeof(conf->os), "%s %s", utsname.sysname, + utsname.release); + get_cpu(conf); + snprintf(conf->qperf, sizeof(conf->qperf), "%d.%d.%d", + VER_MAJ, VER_MIN, VER_INC); +} + + +/* + * Get CPU information. + */ +static void +get_cpu(CONF *conf) +{ + char count[STRSIZE]; + char speed[STRSIZE]; + char buf[BUFSIZE]; + char cpu[BUFSIZE]; + char mhz[BUFSIZE]; + + int cpus = 0; + int mixed = 0; + FILE *fp = fopen("/proc/cpuinfo", "r"); + if (!fp) + error_die("Cannot open /proc/cpuinfo"); + cpu[0] = '\0'; + mhz[0] = '\0'; + while (fgets(buf, sizeof(buf), fp)) { + int n = strlen(buf); + if (cmpsub(buf, "model name")) { + ++cpus; + if (!mixed) { + if (cpu[0] == '\0') + strncopy(cpu, buf, sizeof(cpu)); + else if (!streq(buf, cpu)) + mixed = 1; + } + } else if (cmpsub(buf, "cpu MHz")) { + if (!mixed) { + if (mhz[0] == '\0') + strncopy(mhz, buf, sizeof(mhz)); + else if (!streq(buf, mhz)) + mixed = 1; + } + } + while (n && buf[n-1] != '\n') { + if (!fgets(buf, sizeof(buf), fp)) + break; + n = strlen(buf); + } + } + fclose(fp); + + /* CPU name */ + if (mixed) + strncopy(cpu, "Mixed CPUs", sizeof(cpu)); + else { + char *p = cpu; + char *q = skip_colon(cpu); + if (!q) + return; + for (;;) { + if (*q == '(' && cmpsub(q, "(r)")) + q += 3; + else if (*q == '(' && cmpsub(q, "(tm)")) + q += 4; + if (tolower(*q) == 'c' && cmpsub(q, "cpu ")) + q += 4; + if (tolower(*q) == 'p' && cmpsub(q, "processor ")) + q += 10; + else if (q[0] == ' ' && q[1] == ' ') + q += 1; + else if (q[0] == '\n') + q += 1; + else if (!(*p++ = *q++)) + break; + } + } + + /* CPU speed */ + speed[0] = '\0'; + if (!mixed) { + int n = strlen(cpu); + if (n < 3 || cpu[n-2] != 'H' || cpu[n-1] != 
'z') { + char *q = skip_colon(mhz); + if (q) { + int freq = atoi(q); + if (freq < 1000) + snprintf(speed, sizeof(speed), " %dMHz", freq); + else + snprintf(speed, sizeof(speed), " %.1fGHz", freq/1000.0); + } + } + } + + /* Number of CPUs */ + if (cpus == 1) + count[0] = '\0'; + else if (cpus == 2) + snprintf(count, sizeof(count), "Dual-Core "); + else if (cpus == 4) + snprintf(count, sizeof(count), "Quad-Core "); + else + snprintf(count, sizeof(count), "%d-Core ", cpus); + + snprintf(conf->cpu, sizeof(conf->cpu), "%s%s%s", count, cpu, speed); +} + + +/* + * Quit (client side). + */ +static void +run_client_quit(void) +{ + opt_check(); + client_send_request(); + synchronize(); + exit(0); +} + + +/* + * Quit (server side). The read is to ensure that the client first quits to + * ensure that everything closes down cleanly. + */ +static void +run_server_quit(void) +{ + char buf[1]; + + synchronize(); + read(RemoteFD, buf, sizeof(buf)); + exit(0); +} + + +/* + * Start timing. + */ +static void +start_timing(int seconds) +{ + struct itimerval itimerval = {{0}}; + + get_times(LStat.time_s); + setitimer(ITIMER_REAL, &itimerval, 0); + if (!seconds) + return; + + debug("starting timer"); + itimerval.it_value.tv_sec = seconds; + itimerval.it_interval.tv_usec = 1; + setitimer(ITIMER_REAL, &itimerval, 0); +} + + +/* + * Stop timing. Note that the end time is obtained by the first call to + * set_finished. In the tests, usually, when SIGALRM goes off, it is executing + * a read or write system call which gets interrupted. If SIGALRM goes off + * after Finished is checked but before the system call is performed, the + * system call will be executed and it will take the second SIGALRM call + * generated by the interval timer to wake it up. Hence, we save the end times + * in sig_alrm. Note that if Finished is set, we reject any packets that are + * sent or arrive in order not to cheat. 
+ */ +void +stop_timing(void) +{ + struct itimerval itimerval = {{0}}; + + set_finished(); + setitimer(ITIMER_REAL, &itimerval, 0); + debug("stopping timer"); +} + + +/* + * Establish the current test as finished. + */ +void +set_finished(void) +{ + if (Finished++ == 0) + get_times(LStat.time_e); +} + + +/* + * Show results. + */ +void +show_results(MEASURE measure) +{ + calc_results(); + show_info(measure); +} + + +/* + * Calculate results. + */ +static void +calc_results(void) +{ + double no_msgs; + double locTime; + double remTime; + double midTime; + double gB = 1000 * 1000 * 1000; + + if (!Successful) + return; + + add_ustat(&LStat.s, &RStat.rem_s); + add_ustat(&LStat.r, &RStat.rem_r); + add_ustat(&RStat.s, &LStat.rem_s); + add_ustat(&RStat.r, &LStat.rem_r); + + memset(&Res, 0, sizeof(Res)); + calc_node(&Res.l, &LStat); + calc_node(&Res.r, &RStat); + no_msgs = LStat.r.no_msgs + RStat.r.no_msgs; + if (no_msgs) + Res.latency = Res.l.time_real / no_msgs; + + locTime = Res.l.time_real; + remTime = Res.r.time_real; + midTime = (locTime + remTime) / 2; + + if (locTime == 0 || remTime == 0) + return; + + /* Calculate messaging rate */ + if (!RStat.r.no_msgs) + Res.msg_rate = LStat.r.no_msgs / remTime; + else if (!LStat.r.no_msgs) + Res.msg_rate = RStat.r.no_msgs / locTime; + else + Res.msg_rate = (LStat.r.no_msgs + RStat.r.no_msgs) / midTime; + + /* Calculate send bandwidth */ + if (!RStat.s.no_bytes) + Res.send_bw = LStat.s.no_bytes / locTime; + else if (!LStat.s.no_bytes) + Res.send_bw = RStat.s.no_bytes / remTime; + else + Res.send_bw = (LStat.s.no_bytes + RStat.s.no_bytes) / midTime; + + /* Calculate receive bandwidth. 
*/ + if (!RStat.r.no_bytes) + Res.recv_bw = LStat.r.no_bytes / locTime; + else if (!LStat.r.no_bytes) + Res.recv_bw = RStat.r.no_bytes / remTime; + else + Res.recv_bw = (LStat.r.no_bytes + RStat.r.no_bytes) / midTime; + + /* Calculate costs */ + if (LStat.s.no_bytes && !LStat.r.no_bytes && !RStat.s.no_bytes) + Res.send_cost = Res.l.time_cpu*gB / LStat.s.no_bytes; + else if (RStat.s.no_bytes && !RStat.r.no_bytes && !LStat.s.no_bytes) + Res.send_cost = Res.r.time_cpu*gB / RStat.s.no_bytes; + if (RStat.r.no_bytes && !RStat.s.no_bytes && !LStat.r.no_bytes) + Res.recv_cost = Res.r.time_cpu*gB / RStat.r.no_bytes; + else if (LStat.r.no_bytes && !LStat.s.no_bytes && !RStat.r.no_bytes) + Res.recv_cost = Res.l.time_cpu*gB / LStat.r.no_bytes; +} + + +/* + * Determine the number of packets left to send. + */ +int +left_to_send(long *sentp, int room) +{ + int n; + + if (!Req.no_msgs) + return room; + n = Req.no_msgs - *sentp; + if (n <= 0) + return 0; + if (n > room) + return room; + return n; +} + + +/* + * Touch data. + */ +void +touch_data(void *p, int n) +{ + uint64_t a; + volatile uint64_t *p64 = p; + + while (n >= sizeof(*p64)) { + a = *p64++; + n -= sizeof(*p64); + } + if (n) { + volatile uint8_t *p8 = (uint8_t *)p64; + while (n >= sizeof(*p8)) { + a = *p8++; + n -= sizeof(*p8); + } + } +} + + +/* + * Combine statistics that the remote node kept track of with those that the + * local node kept. + */ +static void +add_ustat(USTAT *l, USTAT *r) +{ + l->no_bytes += r->no_bytes; + l->no_msgs += r->no_msgs; + l->no_errs += r->no_errs; +} + + +/* + * Calculate time values for a node. 
+ */ +static void +calc_node(RESN *resn, STAT *stat) +{ + int i; + CLOCK cpu; + double s = stat->time_e[T_REAL] - stat->time_s[T_REAL]; + + memset(resn, 0, sizeof(*resn)); + if (s == 0) + return; + if (stat->no_ticks == 0) + return; + + resn->time_real = s / stat->no_ticks; + + cpu = 0; + for (i = 0; i < T_N; ++i) + if (i != T_REAL && i != T_IDLE) + cpu += stat->time_e[i] - stat->time_s[i]; + resn->time_cpu = (float) cpu / stat->no_ticks; + + resn->cpu_user = (stat->time_e[T_USER] - stat->time_s[T_USER] + + stat->time_e[T_NICE] - stat->time_s[T_NICE]) / s; + + resn->cpu_intr = (stat->time_e[T_IRQ] - stat->time_s[T_IRQ] + + stat->time_e[T_SOFTIRQ] - stat->time_s[T_SOFTIRQ]) / s; + + resn->cpu_idle = (stat->time_e[T_IDLE] - stat->time_s[T_IDLE]) / s; + + resn->cpu_kernel = (stat->time_e[T_KERNEL] - stat->time_s[T_KERNEL] + + stat->time_e[T_STEAL] - stat->time_s[T_STEAL]) / s; + + resn->cpu_io_wait = (stat->time_e[T_IOWAIT] - stat->time_s[T_IOWAIT]) / s; + + resn->cpu_total = resn->cpu_user + resn->cpu_intr + + resn->cpu_kernel + resn->cpu_io_wait; +} + + +/* + * Show relevant values. + */ +static void +show_info(MEASURE measure) +{ + if (!Successful) + return; + if (measure == LATENCY) { + view_time('a', "", "latency", Res.latency); + view_rate('s', "", "msg_rate", Res.msg_rate); + } else if (measure == MSG_RATE) { + view_rate('a', "", "msg_rate", Res.msg_rate); + } else if (measure == BANDWIDTH) { + view_band('a', "", "bw", Res.recv_bw); + view_rate('s', "", "msg_rate", Res.msg_rate); + } else if (measure == BANDWIDTH_SR) { + view_band('a', "", "send_bw", Res.send_bw); + view_band('a', "", "recv_bw", Res.recv_bw); + view_rate('s', "", "msg_rate", Res.msg_rate); + } + show_used(); + view_cost('t', "", "send_cost", Res.send_cost); + view_cost('t', "", "recv_cost", Res.recv_cost); + show_rest(); + if (Debug) + show_debug(); +} + + +/* + * Show parameters the user set. 
+ */ +static void +show_used(void) +{ + PAR_NAME *p; + PAR_NAME *q = endof(ParName); + + if (!VerboseUsed) + return; + for (p = ParName; p < q; ++p) { + PAR_INFO *l = par_info(p->loc_i); + PAR_INFO *r = par_info(p->rem_i); + + if (!l->inuse && !r->inuse) + continue; + if (VerboseUsed < 2 && !l->set & !r->set) + continue; + if (l->type == 'l') { + uint32_t lv = *(uint32_t *)l->ptr; + uint32_t rv = *(uint32_t *)r->ptr; + if (lv == rv) + view_long('u', "", p->name, lv); + else { + view_long('u', "loc_", p->name, lv); + view_long('u', "rem_", p->name, rv); + } + } else if (l->type == 'p') { + if (streq(l->ptr, r->ptr)) + view_strn('u', "", p->name, l->ptr); + else { + view_strn('u', "loc_", p->name, l->ptr); + view_strn('u', "rem_", p->name, r->ptr); + } + } else if (l->type == 's') { + uint32_t lv = *(uint32_t *)l->ptr; + uint32_t rv = *(uint32_t *)r->ptr; + if (lv == rv) + view_size('u', "", p->name, lv); + else { + view_size('u', "loc_", p->name, lv); + view_size('u', "rem_", p->name, rv); + } + } else if (l->type == 't') { + uint32_t lv = *(uint32_t *)l->ptr; + uint32_t rv = *(uint32_t *)r->ptr; + if (lv == rv) + view_time('u', "", p->name, lv); + else { + view_time('u', "loc_", p->name, lv); + view_time('u', "rem_", p->name, rv); + } + } + } +} + + +/* + * Show the remaining parameters. 
+ */ +static void +show_rest(void) +{ + RESN *resnS; + RESN *resnR; + STAT *statS; + STAT *statR; + int srmode = 0; + + if (!UnifyNodes) { + uint64_t ls = LStat.s.no_bytes; + uint64_t lr = LStat.r.no_bytes; + uint64_t rs = RStat.s.no_bytes; + uint64_t rr = RStat.r.no_bytes; + + if (ls && !rs && rr && !lr) { + srmode = 1; + resnS = &Res.l; + resnR = &Res.r; + statS = &LStat; + statR = &RStat; + } else if (rs && !ls && lr && !rr) { + srmode = 1; + resnS = &Res.r; + resnR = &Res.l; + statS = &RStat; + statR = &LStat; + } + } + + if (srmode) { + view_cpus('t', "", "send_cpus_used", resnS->cpu_total); + view_cpus('T', "", "send_cpus_user", resnS->cpu_user); + view_cpus('T', "", "send_cpus_intr", resnS->cpu_intr); + view_cpus('T', "", "send_cpus_kernel", resnS->cpu_kernel); + view_cpus('T', "", "send_cpus_iowait", resnS->cpu_io_wait); + view_time('T', "", "send_real_time", resnS->time_real); + view_time('T', "", "send_cpu_time", resnS->time_cpu); + view_long('S', "", "send_errors", statS->s.no_errs); + view_size('S', "", "send_bytes", statS->s.no_bytes); + view_long('S', "", "send_msgs", statS->s.no_msgs); + view_long('S', "", "send_max_cqe", statS->max_cqes); + + view_cpus('t', "", "recv_cpus_used", resnR->cpu_total); + view_cpus('T', "", "recv_cpus_user", resnR->cpu_user); + view_cpus('T', "", "recv_cpus_intr", resnR->cpu_intr); + view_cpus('T', "", "recv_cpus_kernel", resnR->cpu_kernel); + view_cpus('T', "", "recv_cpus_iowait", resnR->cpu_io_wait); + view_time('T', "", "recv_real_time", resnR->time_real); + view_time('T', "", "recv_cpu_time", resnR->time_cpu); + view_long('S', "", "recv_errors", statR->r.no_errs); + view_size('S', "", "recv_bytes", statR->r.no_bytes); + view_long('S', "", "recv_msgs", statR->r.no_msgs); + view_long('S', "", "recv_max_cqe", statR->max_cqes); + } else { + view_cpus('t', "", "loc_cpus_used", Res.l.cpu_total); + view_cpus('T', "", "loc_cpus_user", Res.l.cpu_user); + view_cpus('T', "", "loc_cpus_intr", Res.l.cpu_intr); + view_cpus('T', "", 
"loc_cpus_kernel", Res.l.cpu_kernel); + view_cpus('T', "", "loc_cpus_iowait", Res.l.cpu_io_wait); + view_time('T', "", "loc_real_time", Res.l.time_real); + view_time('T', "", "loc_cpu_time", Res.l.time_cpu); + view_long('S', "", "loc_send_errors", LStat.s.no_errs); + view_long('S', "", "loc_recv_errors", LStat.r.no_errs); + view_size('S', "", "loc_send_bytes", LStat.s.no_bytes); + view_size('S', "", "loc_recv_bytes", LStat.r.no_bytes); + view_long('S', "", "loc_send_msgs", LStat.s.no_msgs); + view_long('S', "", "loc_recv_msgs", LStat.r.no_msgs); + view_long('S', "", "loc_max_cqe", LStat.max_cqes); + + view_cpus('t', "", "rem_cpus_used", Res.r.cpu_total); + view_cpus('T', "", "rem_cpus_user", Res.r.cpu_user); + view_cpus('T', "", "rem_cpus_intr", Res.r.cpu_intr); + view_cpus('T', "", "rem_cpus_kernel", Res.r.cpu_kernel); + view_cpus('T', "", "rem_cpus_iowait", Res.r.cpu_io_wait); + view_time('T', "", "rem_real_time", Res.r.time_real); + view_time('T', "", "rem_cpu_time", Res.r.time_cpu); + view_long('S', "", "rem_send_errors", RStat.s.no_errs); + view_long('S', "", "rem_recv_errors", RStat.r.no_errs); + view_size('S', "", "rem_send_bytes", RStat.s.no_bytes); + view_size('S', "", "rem_recv_bytes", RStat.r.no_bytes); + view_long('S', "", "rem_send_msgs", RStat.s.no_msgs); + view_long('S', "", "rem_recv_msgs", RStat.r.no_msgs); + view_long('S', "", "rem_max_cqe", RStat.max_cqes); + } +} + + +/* + * Show all values. 
+ */ +static void +show_debug(void) +{ + /* Local node */ + view_long('d', "", "l_no_cpus", LStat.no_cpus); + view_long('d', "", "l_no_ticks", LStat.no_ticks); + view_long('d', "", "l_max_cqes", LStat.max_cqes); + + if (LStat.no_ticks) { + double t = LStat.no_ticks; + CLOCK *s = LStat.time_s; + CLOCK *e = LStat.time_e; + double real = (e[T_REAL] - s[T_REAL]) / t; + double user = (e[T_USER] - s[T_USER]) / t; + double nice = (e[T_NICE] - s[T_NICE]) / t; + double system = (e[T_KERNEL] - s[T_KERNEL]) / t; + double idle = (e[T_IDLE] - s[T_IDLE]) / t; + double iowait = (e[T_IOWAIT] - s[T_IOWAIT]) / t; + double irq = (e[T_IRQ] - s[T_IRQ]) / t; + double softirq = (e[T_SOFTIRQ] - s[T_SOFTIRQ]) / t; + double steal = (e[T_STEAL] - s[T_STEAL]) / t; + + view_time('d', "", "l_timer_real", real); + view_time('d', "", "l_timer_user", user); + view_time('d', "", "l_timer_nice", nice); + view_time('d', "", "l_timer_system", system); + view_time('d', "", "l_timer_idle", idle); + view_time('d', "", "l_timer_iowait", iowait); + view_time('d', "", "l_timer_irq", irq); + view_time('d', "", "l_timer_softirq", softirq); + view_time('d', "", "l_timer_steal", steal); + } + + view_size('d', "", "l_s_no_bytes", LStat.s.no_bytes); + view_long('d', "", "l_s_no_msgs", LStat.s.no_msgs); + view_long('d', "", "l_s_no_errs", LStat.s.no_errs); + + view_size('d', "", "l_r_no_bytes", LStat.r.no_bytes); + view_long('d', "", "l_r_no_msgs", LStat.r.no_msgs); + view_long('d', "", "l_r_no_errs", LStat.r.no_errs); + + view_size('d', "", "l_rem_s_no_bytes", LStat.rem_s.no_bytes); + view_long('d', "", "l_rem_s_no_msgs", LStat.rem_s.no_msgs); + view_long('d', "", "l_rem_s_no_errs", LStat.rem_s.no_errs); + + view_size('d', "", "l_rem_r_no_bytes", LStat.rem_r.no_bytes); + view_long('d', "", "l_rem_r_no_msgs", LStat.rem_r.no_msgs); + view_long('d', "", "l_rem_r_no_errs", LStat.rem_r.no_errs); + + /* Remote node */ + view_long('d', "", "r_no_cpus", RStat.no_cpus); + view_long('d', "", "r_no_ticks", RStat.no_ticks); 
+ view_long('d', "", "r_max_cqes", RStat.max_cqes); + + if (RStat.no_ticks) { + double t = RStat.no_ticks; + CLOCK *s = RStat.time_s; + CLOCK *e = RStat.time_e; + + double real = (e[T_REAL] - s[T_REAL]) / t; + double user = (e[T_USER] - s[T_USER]) / t; + double nice = (e[T_NICE] - s[T_NICE]) / t; + double system = (e[T_KERNEL] - s[T_KERNEL]) / t; + double idle = (e[T_IDLE] - s[T_IDLE]) / t; + double iowait = (e[T_IOWAIT] - s[T_IOWAIT]) / t; + double irq = (e[T_IRQ] - s[T_IRQ]) / t; + double softirq = (e[T_SOFTIRQ] - s[T_SOFTIRQ]) / t; + double steal = (e[T_STEAL] - s[T_STEAL]) / t; + + view_time('d', "", "r_timer_real", real); + view_time('d', "", "r_timer_user", user); + view_time('d', "", "r_timer_nice", nice); + view_time('d', "", "r_timer_system", system); + view_time('d', "", "r_timer_idle", idle); + view_time('d', "", "r_timer_iowait", iowait); + view_time('d', "", "r_timer_irq", irq); + view_time('d', "", "r_timer_softirq", softirq); + view_time('d', "", "r_timer_steal", steal); + } + + view_size('d', "", "r_s_no_bytes", RStat.s.no_bytes); + view_long('d', "", "r_s_no_msgs", RStat.s.no_msgs); + view_long('d', "", "r_s_no_errs", RStat.s.no_errs); + + view_size('d', "", "r_r_no_bytes", RStat.r.no_bytes); + view_long('d', "", "r_r_no_msgs", RStat.r.no_msgs); + view_long('d', "", "r_r_no_errs", RStat.r.no_errs); + + view_size('d', "", "r_rem_s_no_bytes", RStat.rem_s.no_bytes); + view_long('d', "", "r_rem_s_no_msgs", RStat.rem_s.no_msgs); + view_long('d', "", "r_rem_s_no_errs", RStat.rem_s.no_errs); + + view_size('d', "", "r_rem_r_no_bytes", RStat.rem_r.no_bytes); + view_long('d', "", "r_rem_r_no_msgs", RStat.rem_r.no_msgs); + view_long('d', "", "r_rem_r_no_errs", RStat.rem_r.no_errs); +} + + +/* + * Show a cost in terms of seconds per gigabyte. 
+ */ +static void +view_cost(int type, char *pref, char *name, double value) +{ + int n = 0; + static char *tab[] ={ "ns/GB", "us/GB", "ms/GB", "sec/GB" }; + + value *= 1E9; + if (!verbose(type, value)) + return; + if (!UnifyUnits) { + while (value >= 1000 && n < (int)cardof(tab)-1) { + value /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], value); +} + + +/* + * Show the number of cpus. + */ +static void +view_cpus(int type, char *pref, char *name, double value) +{ + value *= 100; + if (!verbose(type, value)) + return; + place_val(pref, name, "% cpus", value); +} + + +/* + * Show a messaging rate. + */ +static void +view_rate(int type, char *pref, char *name, double value) +{ + int n = 0; + static char *tab[] ={ "/sec", "K/sec", "M/sec", "G/sec", "T/sec" }; + + if (!verbose(type, value)) + return; + if (!UnifyUnits) { + while (value >= 1000 && n < (int)cardof(tab)-1) { + value /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], value); +} + + +/* + * Show a number. + */ +static void +view_long(int type, char *pref, char *name, long long value) +{ + int n = 0; + double val = value; + static char *tab[] ={ "", "thousand", "million", "billion", "trillion" }; + + if (!verbose(type, val)) + return; + if (!UnifyUnits && val >= 1000*1000) { + while (val >= 1000 && n < (int)cardof(tab)-1) { + val /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], val); +} + + +/* + * Show a bandwidth value. + */ +static void +view_band(int type, char *pref, char *name, double value) +{ + int n = 0; + static char *tab[] ={ + "bytes/sec", "KB/sec", "MB/sec", "GB/sec", "TB/sec" + }; + + if (!verbose(type, value)) + return; + if (!UnifyUnits) { + while (value >= 1000 && n < (int)cardof(tab)-1) { + value /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], value); +} + + +/* + * Show a size. 
+ */
+static void
+view_size(int type, char *pref, char *name, long long value)
+{
+    int n = 0;
+    double val = value;
+    static char *tab[] ={ "bytes", "KB", "MB", "GB", "TB" };
+
+    if (!verbose(type, val))
+        return;
+    if (!UnifyUnits) {
+        /* Sizes that are a clean multiple of a power of 1024 are placed by
+         * nice_1024 in binary units; nothing more to do for those. */
+        if (nice_1024(pref, name, value))
+            return;
+        /* Everything else is shown in decimal (power of 1000) units. */
+        while (val >= 1000 && n < (int)cardof(tab)-1) {
+            val /= 1000;
+            ++n;
+        }
+    }
+    place_val(pref, name, tab[n], val);
+}
+
+
+/*
+ * Show a number if it can be expressed as a nice multiple of a power of 1024.
+ * Returns 1 if the value was placed and 0 if the caller must show it some
+ * other way.  A value qualifies if it is at least 1024 and every division by
+ * 1024 needed to bring it below 1024 (or to reach TiB) leaves no remainder.
+ * The entry placed holds the scaled count as its data and the original value
+ * as its alternate form.
+ */
+static int
+nice_1024(char *pref, char *name, long long value)
+{
+    char *data;
+    char *altn;
+    int n = 0;
+    long long val = value;
+    static char *tab[] ={ "KiB", "MiB", "GiB", "TiB" };
+
+    if (val < 1024 || val % 1024)
+        return 0;
+    val /= 1024;
+    while (val >= 1024 && n < (int)cardof(tab)-1) {
+        if (val % 1024)
+            return 0;
+        val /= 1024;
+        ++n;
+    }
+    data = qasprintf("%lld", val);
+    altn = qasprintf("%lld", value);
+    place_any(pref, name, tab[n], commify(data), commify(altn));
+    return 1;
+}
+
+
+/*
+ * Show a string.  Empty strings are only shown for type 'a'.  The string is
+ * copied because entries in the show table are freed by place_show.
+ */
+static void
+view_strn(int type, char *pref, char *name, char *value)
+{
+    char *copy;
+
+    if (!verbose(type, value[0] != '\0'))
+        return;
+
+    /* A null data pointer would be passed to strlen by place_show. */
+    copy = strdup(value);
+    if (!copy)
+        error_die("Out of space");
+    place_any(pref, name, 0, copy, 0);
+}
+
+
+/*
+ * Show a time.  The value is multiplied by 1E9 so that the first unit in the
+ * table (ns) applies and, unless UnifyUnits is set, is then divided by 1000
+ * until it is below 1000 or the largest unit (sec) is reached.
+ */
+static void
+view_time(int type, char *pref, char *name, double value)
+{
+    int n = 0;
+    static char *tab[] ={ "ns", "us", "ms", "sec" };
+
+    value *= 1E9;
+    if (!verbose(type, value))
+        return;
+    if (!UnifyUnits) {
+        while (value >= 1000 && n < (int)cardof(tab)-1) {
+            value /= 1000;
+            ++n;
+        }
+    }
+    place_val(pref, name, tab[n], value);
+}
+
+
+/*
+ * Determine if we are verbose enough to show a value.
+ */
+static int
+verbose(int type, double value)
+{
+    int shown = 0;
+
+    /* Type 'a' is shown unconditionally, whatever the value. */
+    if (type == 'a')
+        return 1;
+
+    /* Every other type is hidden unless the value is positive. */
+    if (value <= 0)
+        return 0;
+
+    /*
+     * 'd' follows Debug.  For the rest, a lower case letter needs a level of
+     * at least 1 in its category and an upper case letter at least 2.
+     */
+    switch (type) {
+    case 'd':
+        shown = Debug;
+        break;
+    case 'c':
+        shown = VerboseConf >= 1;
+        break;
+    case 's':
+        shown = VerboseStat >= 1;
+        break;
+    case 't':
+        shown = VerboseTime >= 1;
+        break;
+    case 'u':
+        shown = VerboseUsed >= 1;
+        break;
+    case 'C':
+        shown = VerboseConf >= 2;
+        break;
+    case 'S':
+        shown = VerboseStat >= 2;
+        break;
+    case 'T':
+        shown = VerboseTime >= 2;
+        break;
+    case 'U':
+        shown = VerboseUsed >= 2;
+        break;
+    default:
+        bug_die("verbose: bad type: %c (%o)", type, type);
+    }
+    return shown;
+}
+
+
+/*
+ * Format a number and place it in the show table.  Roughly Precision
+ * significant digits are kept: every digit in front of the decimal point
+ * uses one up and what is left becomes the number of decimals.  Trailing
+ * zeros after the point, and the point itself if nothing follows it, are
+ * removed.
+ */
+static void
+place_val(char *pref, char *name, char *unit, double value)
+{
+    char *text = qasprintf("%.0f", value);
+    char *scan = text;
+    int decimals = Precision;
+
+    /* Count the integer digits, ignoring a leading minus sign. */
+    if (*scan == '-')
+        ++scan;
+    for (; isdigit(*scan); ++scan)
+        --decimals;
+
+    if (decimals > 0) {
+        char *end;
+
+        /* Format again, this time with the decimals we have room for. */
+        free(text);
+        text = qasprintf("%.*f", decimals, value);
+
+        /* Back up over trailing zeros and then over a bare point. */
+        end = &text[strlen(text)];
+        while (end > text && *--end == '0')
+            continue;
+        if (end > text && *end == '.')
+            --end;
+        end[1] = '\0';
+    }
+    place_any(pref, name, unit, commify(text), 0);
+}
+
+
+/*
+ * Append an entry to the show table.  The pointers are stored as given;
+ * data and altn are later released with free by place_show.
+ */
+static void
+place_any(char *pref, char *name, char *unit, char *data, char *altn)
+{
+    SHOW *slot = &ShowTable[ShowIndex];
+
+    /* The slot is only written to once we know it lies inside the table. */
+    ++ShowIndex;
+    if (ShowIndex > cardof(ShowTable))
+        bug_die("Need to increase size of ShowTable");
+    slot->pref = pref;
+    slot->name = name;
+    slot->unit = unit;
+    slot->data = data;
+    slot->altn = altn;
+}
+
+
+/*
+ * Print every entry in the show table and then empty the table.  Names are
+ * padded to a common width and the data of entries that have a unit is right
+ * aligned to a common width.
+ */
+static void
+place_show(void)
+{
+    int idx;
+    int nameWidth = 0;
+    int dataWidth = 0;
+    int unitWidth = 0;
+
+    /* Pass 1: work out the column widths. */
+    for (idx = 0; idx < ShowIndex; ++idx) {
+        SHOW *ent = &ShowTable[idx];
+        int len = strlen(ent->name);
+
+        if (ent->pref)
+            len += strlen(ent->pref);
+        if (len > nameWidth)
+            nameWidth = len;
+
+        /* Only entries with a unit take part in the data and unit columns. */
+        if (!ent->unit)
+            continue;
+        len = strlen(ent->data);
+        if (len > dataWidth)
+            dataWidth = len;
+        len = strlen(ent->unit);
+        if (len > unitWidth)
+            unitWidth = len;
+    }
+
+    /* Pass 2: print each entry and release the strings it owns. */
+    for (idx = 0; idx < ShowIndex; ++idx) {
+        SHOW *ent = &ShowTable[idx];
+        int prefLen = 0;
+
+        printf(" ");
+        if (ent->pref) {
+            prefLen = strlen(ent->pref);
+            printf("%s", ent->pref);
+        }
+        printf("%-*s", nameWidth-prefLen, ent->name);
+        if (ent->unit) {
+            printf(" = %*s", dataWidth, ent->data);
+            printf(" %s", ent->unit);
+        } else {
+            printf(" = %s", ent->data);
+        }
+        if (ent->altn)
+            printf(" (%s)", ent->altn);
+        printf("\n");
+        free(ent->data);
+        free(ent->altn);
+    }
+    ShowIndex = 0;
+}
+
+
+/*
+ * Set the processor affinity of the calling process from Req.affinity.  A
+ * value of 0 leaves the affinity alone; any other value N selects cpu N-1.
+ */
+static void
+set_affinity(void)
+{
+    cpu_set_t mask;
+    int cpu = Req.affinity;
+
+    if (cpu == 0)
+        return;
+
+    /* The request counts cpus from 1; the cpu set counts them from 0. */
+    --cpu;
+    CPU_ZERO(&mask);
+    CPU_SET(cpu, &mask);
+    if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
+        syserror_die("Cannot set processor affinity (cpu %d)", cpu);
+}
+
+
+/*
+ * Encode a REQ structure into a data stream.
+ */
+static void
+enc_req(REQ *host)
+{
+    /* Naming each member once keeps the value and its size in step. */
+#define ENC_FIELD(member) enc_int(host->member, sizeof(host->member))
+
+    /*
+     * The order below is the wire format and must match dec_req.
+     * NOTE(review): REQ also has a rate member which is not written here;
+     * confirm that this is intended.
+     */
+    ENC_FIELD(ver_maj);
+    ENC_FIELD(ver_min);
+    ENC_FIELD(ver_inc);
+    ENC_FIELD(req_index);
+    ENC_FIELD(flip);
+    ENC_FIELD(access_recv);
+    ENC_FIELD(affinity);
+    ENC_FIELD(poll_mode);
+    ENC_FIELD(port);
+    ENC_FIELD(rd_atomic);
+    ENC_FIELD(timeout);
+    ENC_FIELD(msg_size);
+    ENC_FIELD(mtu_size);
+    ENC_FIELD(no_msgs);
+    ENC_FIELD(sock_buf_size);
+    ENC_FIELD(time);
+    enc_str(host->id, sizeof(host->id));
+
+#undef ENC_FIELD
+}
+
+
+/*
+ * Decode a REQ structure from a data stream.  Members are read in exactly
+ * the order in which enc_req writes them.
+ */
+static void
+dec_req(REQ *host)
+{
+#define DEC_FIELD(member) (host->member = dec_int(sizeof(host->member)))
+
+    DEC_FIELD(ver_maj);
+    DEC_FIELD(ver_min);
+    DEC_FIELD(ver_inc);
+    DEC_FIELD(req_index);
+    DEC_FIELD(flip);
+    DEC_FIELD(access_recv);
+    DEC_FIELD(affinity);
+    DEC_FIELD(poll_mode);
+    DEC_FIELD(port);
+    DEC_FIELD(rd_atomic);
+    DEC_FIELD(timeout);
+    DEC_FIELD(msg_size);
+    DEC_FIELD(mtu_size);
+    DEC_FIELD(no_msgs);
+    DEC_FIELD(sock_buf_size);
+    DEC_FIELD(time);
+    dec_str(host->id, sizeof(host->id));
+
+#undef DEC_FIELD
+}
+
+
+/*
+ * Encode a STAT structure into a data stream.
+ */
+static void
+enc_stat(STAT *host)
+{
+    int i;
+
+    /* The order below is the wire format and must match dec_stat. */
+    enc_int(host->no_cpus, sizeof(host->no_cpus));
+    enc_int(host->no_ticks, sizeof(host->no_ticks));
+    enc_int(host->max_cqes, sizeof(host->max_cqes));
+    for (i = 0; i < T_N; ++i)
+        enc_int(host->time_s[i], sizeof(host->time_s[i]));
+    for (i = 0; i < T_N; ++i)
+        enc_int(host->time_e[i], sizeof(host->time_e[i]));
+    enc_ustat(&host->s);
+    enc_ustat(&host->r);
+    enc_ustat(&host->rem_s);
+    enc_ustat(&host->rem_r);
+}
+
+
+/*
+ * Decode a STAT structure from a data stream.  Members are read in exactly
+ * the order in which enc_stat writes them.
+ */
+static void
+dec_stat(STAT *host)
+{
+    int i;
+
+    host->no_cpus = dec_int(sizeof(host->no_cpus));
+    host->no_ticks = dec_int(sizeof(host->no_ticks));
+    host->max_cqes = dec_int(sizeof(host->max_cqes));
+    for (i = 0; i < T_N; ++i)
+        host->time_s[i] = dec_int(sizeof(host->time_s[i]));
+    for (i = 0; i < T_N; ++i)
+        host->time_e[i] = dec_int(sizeof(host->time_e[i]));
+    dec_ustat(&host->s);
+    dec_ustat(&host->r);
+    dec_ustat(&host->rem_s);
+    dec_ustat(&host->rem_r);
+}
+
+
+/*
+ * Encode a USTAT structure into a data stream.
+ */
+static void
+enc_ustat(USTAT *host)
+{
+    enc_int(host->no_bytes, sizeof(host->no_bytes));
+    enc_int(host->no_msgs, sizeof(host->no_msgs));
+    enc_int(host->no_errs, sizeof(host->no_errs));
+}
+
+
+/*
+ * Decode a USTAT structure from a data stream.  Members are read in the
+ * order in which enc_ustat writes them.
+ */
+static void
+dec_ustat(USTAT *host)
+{
+    host->no_bytes = dec_int(sizeof(host->no_bytes));
+    host->no_msgs = dec_int(sizeof(host->no_msgs));
+    host->no_errs = dec_int(sizeof(host->no_errs));
+}
+
+
+/*
+ * Initialize encode pointer.  Later enc_int and enc_str calls write at p and
+ * advance from there.  No length is recorded, so the caller must supply a
+ * buffer that is large enough for everything it is going to encode.
+ */
+void
+enc_init(void *p)
+{
+    EncodePtr = p;
+}
+
+
+/*
+ * Initialize decode pointer.  Later dec_int and dec_str calls read from p
+ * and advance from there.  No length is recorded, so the caller must make
+ * sure the buffer holds everything it is going to decode.
+ */
+void
+dec_init(void *p)
+{
+    DecodePtr = p;
+}
+
+
+/*
+ * Encode a string.  Exactly n bytes starting at s are copied, whatever the
+ * length of the string, so s must refer to at least n readable bytes.
+ */
+void
+enc_str(char *s, int n)
+{
+    memcpy(EncodePtr, s, n);
+    EncodePtr += n;
+}
+
+
+/*
+ * Decode a string.  Exactly n bytes are copied into s and the decode pointer
+ * moves on by n.  The result is always null terminated within those n bytes
+ * so that a stream which lacks a terminator cannot make later string
+ * operations on s run past the end of the buffer.
+ */
+void
+dec_str(char *s, int n)
+{
+    memcpy(s, DecodePtr, n);
+    DecodePtr += n;
+    if (n > 0)
+        s[n-1] = '\0';
+}
+
+
+/*
+ * Encode an integer.
+ */
+void
+enc_int(int64_t l, int n)
+{
+    /* Shift an unsigned copy; shifting a negative value right is left to
+     * the implementation. */
+    uint64_t bits = (uint64_t)l;
+
+    /* The low n bytes are written least significant byte first. */
+    while (n--) {
+        *EncodePtr++ = bits & 0xFF;
+        bits >>= 8;
+    }
+}
+
+
+/*
+ * Decode an integer.  Reads the n bytes written by enc_int and moves the
+ * decode pointer past them.  The bytes are treated as unsigned, so a field
+ * narrower than 8 bytes is zero extended rather than sign extended.
+ */
+int64_t
+dec_int(int n)
+{
+    uint64_t l = 0;
+    uint8_t *p = (DecodePtr += n);
+
+    /* Walk backwards from the most significant byte to the least. */
+    while (n--)
+        l = (l << 8) | (*--p & 0xFF);
+    return l;
+}
+
+
+/*
+ * Get various temporal parameters.  timex[0] is set to the value returned by
+ * times(); the remaining entries are filled, in order, from the numbers on
+ * the first line of /proc/stat, which is re-read through ProcStatFD on every
+ * call.  Only the last entry may be missing from that line, in which case it
+ * is set to 0; any other deviation from the expected format is fatal.
+ */
+static void
+get_times(CLOCK timex[T_N])
+{
+    int n;
+    char *p;
+    char buf[BUFSIZE];
+    struct tms tms;
+
+    timex[0] = times(&tms);
+    if (lseek(ProcStatFD, 0, SEEK_SET) < 0)
+        syserror_die("Failed to seek /proc/stat");
+    n = read(ProcStatFD, buf, sizeof(buf)-1);
+
+    /* A failed read returns -1, which must not be used as an index. */
+    if (n < 0)
+        syserror_die("Failed to read /proc/stat");
+    buf[n] = '\0';
+    if (strncmp(buf, "cpu ", 4))
+        error_die("/proc/stat does not start with 'cpu '");
+
+    /* Start at the blank that follows "cpu" and take one number per entry. */
+    p = &buf[3];
+    for (n = 1; n < T_N; ++n) {
+        while (*p == ' ')
+            ++p;
+        if (!isdigit((unsigned char)*p)) {
+            /* Running out of numbers is only accepted at the end of the
+             * line and only for the last entry. */
+            if (*p != '\n' || n < T_N-1)
+                error_die("/proc/stat has bad format");
+            break;
+        }
+        timex[n] = strtoll(p, 0, 10);
+        while (*p != ' ' && *p != '\n' && *p != '\0')
+            ++p;
+    }
+
+    /* Entries that were not present on the line read as zero. */
+    while (n < T_N)
+        timex[n++] = 0;
+}
+
+
+/*
+ * Get the time of day in seconds as a floating point number.
+ */
+static double
+get_seconds(void)
+{
+    struct timeval timeval;
+
+    if (gettimeofday(&timeval, 0) < 0)
+        syserror_die("gettimeofday failed");
+    return timeval.tv_sec + timeval.tv_usec/(1000.0*1000.0);
+}
+
+
+/*
+ * Insert commas within a number for readability.
+ */
+static char *
+commify(char *data)
+{
+    int s;
+    int d;
+    int seqS;
+    int seqE;
+    int dataLen;
+    int noCommas;
+
+    /* data must come from malloc: it may be resized and is returned so
+     * that the caller can pick up the new address. */
+    if (!data)
+        return data;
+    if (UnifyUnits)
+        return data;
+
+    /* Find the run of digits at the end of the string. */
+    dataLen = strlen(data);
+    seqS = seqE = dataLen;
+    while (--seqS >= 0)
+        if (!isdigit((unsigned char)data[seqS]))
+            break;
+
+    /* If that run was a fraction, group the digits before the point. */
+    if (seqS >= 0 && data[seqS] == '.') {
+        seqE = seqS;
+        while (--seqS >= 0)
+            if (!isdigit((unsigned char)data[seqS]))
+                break;
+    }
+
+    /* From here on seqS and seqE index the first and last digit. */
+    noCommas = (--seqE - ++seqS) / 3;
+    if (noCommas == 0)
+        return data;
+    data = realloc(data, dataLen+noCommas+1);
+    if (!data)
+        error_die("Out of space");
+
+    /* Move characters up from the terminator backwards, dropping in a comma
+     * after every third digit, until all the commas have been placed. */
+    s = dataLen;
+    d = dataLen + noCommas;
+    for (;;) {
+        int n;
+        data[d--] = data[s--];
+        n = seqE - s;
+        if (n > 0 && n%3 == 0) {
+            data[d--] = ',';
+            if (--noCommas == 0)
+                break;
+        }
+    }
+    return data;
+}
+
+
+/*
+ * Like strncpy but ensures the destination is null terminated.  n is the
+ * size of the destination; nothing is written if it is not positive.
+ */
+static void
+strncopy(char *d, char *s, int n)
+{
+    /* With no room at all, d[n-1] would lie in front of the buffer. */
+    if (n <= 0)
+        return;
+    strncpy(d, s, n);
+    d[n-1] = '\0';
+}
+
+
+/*
+ * Call malloc and panic on error.
+ */
+void *
+qmalloc(long n)
+{
+    void *p = malloc(n);
+    if (!p)
+        error_die("Out of space");
+    return p;
+}
+
+
+/*
+ * Format into a newly allocated string and panic on error.  The caller owns
+ * the string returned and releases it with free.
+ */
+static char *
+qasprintf(char *fmt, ...)
+{
+    int stat;
+    char *str;
+    va_list alist;
+
+    va_start(alist, fmt);
+    stat = vasprintf(&str, fmt, alist);
+    va_end(alist);
+    if (stat < 0)
+        error_die("Out of space");
+    return str;
+}
+
+
+/*
+ * Print out a debug message.  Nothing is printed unless Debug is set.
+ */
+void
+debug(char *fmt, ...)
+{
+    va_list alist;
+
+    if (!Debug)
+        return;
+    va_start(alist, fmt);
+    vfprintf(stderr, fmt, alist);
+    va_end(alist);
+    fprintf(stderr, "\n");
+}
+
+
+/*
+ * Print out an error message.  Always returns 0.
+ */
+int
+error(char *fmt, ...)
+{
+    va_list alist;
+
+    va_start(alist, fmt);
+    vfprintf(stderr, fmt, alist);
+    va_end(alist);
+    fprintf(stderr, "\n");
+    return 0;
+}
+
+
+/*
+ * Print out an error message and exit.
+ */
+void
+error_die(char *fmt, ...)
+{
+    va_list alist;
+
+    va_start(alist, fmt);
+    vfprintf(stderr, fmt, alist);
+    va_end(alist);
+    fprintf(stderr, "\n");
+    die();
+}
+
+
+/*
+ * Print out a system error message.  If errno is set on entry, its
+ * description is appended to the message.  Always returns 0.
+ */
+int
+syserror(char *fmt, ...)
+{
+    /* Printing the message may itself change errno, so take a copy first. */
+    int err = errno;
+    va_list alist;
+
+    va_start(alist, fmt);
+    vfprintf(stderr, fmt, alist);
+    va_end(alist);
+    if (err)
+        fprintf(stderr, ": %s", strerror(err));
+    fprintf(stderr, "\n");
+    return 0;
+}
+
+/*
+ * Print out a system error message and exit.  If errno is set on entry, its
+ * description is appended to the message.
+ */
+void
+syserror_die(char *fmt, ...)
+{
+    /* Printing the message may itself change errno, so take a copy first. */
+    int err = errno;
+    va_list alist;
+
+    va_start(alist, fmt);
+    vfprintf(stderr, fmt, alist);
+    va_end(alist);
+    if (err)
+        fprintf(stderr, ": %s", strerror(err));
+    fprintf(stderr, "\n");
+    die();
+}
+
+
+/*
+ * Print out an internal error and exit.
+ */
+static void
+bug_die(char *fmt, ...)
+{
+    va_list alist;
+
+    fprintf(stderr, "internal error: ");
+    va_start(alist, fmt);
+    vfprintf(stderr, fmt, alist);
+    va_end(alist);
+    fprintf(stderr, "\n");
+    die();
+}
+
+
+/*
+ * Exit unsuccessfully.
+ */
+void
+die(void)
+{
+    exit(1);
+}
diff --git a/qperf.h b/qperf.h
new file mode 100644
index 0000000..0bf5774
--- /dev/null
+++ b/qperf.h
@@ -0,0 +1,316 @@
+/*
+ * qperf - general header file.
+ *
+ * Copyright (c) 2002-2007 Johann George. All rights reserved.
+ * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+/*
+ * Parameters.  STRSIZE is the size of the fixed length strings in REQ.
+ */
+#define STRSIZE 64
+
+
+/*
+ * For convenience and readability.  Macro arguments are parenthesized so
+ * that an expression may be passed as well as a plain name.
+ *
+ *  cardof(a)    number of elements in a; a must be an array, not a pointer
+ *  endof(a)     address just past the last element of the array a
+ *  streq(a, b)  true if the strings a and b are equal
+ *  is_client()  ServerName, which is non-null when a server name is set
+ */
+#define SUCCESS0 0
+#define cardof(a) (sizeof(a)/sizeof(*(a)))
+#define endof(a) (&(a)[cardof(a)])
+#define streq(a, b) (strcmp((a), (b)) == 0)
+#define is_client() (ServerName)
+
+typedef uint64_t CLOCK;
+
+
+/*
+ * Time indices.  These index arrays of T_N CLOCK values such as the time_s
+ * and time_e members of STAT.  T_N is not an index but the number of
+ * entries.
+ */
+typedef enum {
+    T_REAL,
+    T_USER,
+    T_NICE,
+    T_KERNEL,
+    T_IDLE,
+    T_IOWAIT,
+    T_IRQ,
+    T_SOFTIRQ,
+    T_STEAL,
+    T_N
+} TIME_INDEX;
+
+
+/*
+ * Parameter indices. P_NULL must be 0.  Apart from P_NULL, every parameter
+ * comes as an L_ and an R_ entry.  P_N is the number of entries.
+ */
+typedef enum {
+    P_NULL,
+    L_ACCESS_RECV,
+    R_ACCESS_RECV,
+    L_AFFINITY,
+    R_AFFINITY,
+    L_FLIP,
+    R_FLIP,
+    L_ID,
+    R_ID,
+    L_MSG_SIZE,
+    R_MSG_SIZE,
+    L_MTU_SIZE,
+    R_MTU_SIZE,
+    L_NO_MSGS,
+    R_NO_MSGS,
+    L_POLL_MODE,
+    R_POLL_MODE,
+    L_PORT,
+    R_PORT,
+    L_RATE,
+    R_RATE,
+    L_RD_ATOMIC,
+    R_RD_ATOMIC,
+    L_SOCK_BUF_SIZE,
+    R_SOCK_BUF_SIZE,
+    L_TIME,
+    R_TIME,
+    L_TIMEOUT,
+    R_TIMEOUT,
+    P_N
+} PAR_INDEX;
+
+
+/*
+ * What we are measuring.
+ */
+typedef enum {
+    LATENCY,
+    MSG_RATE,
+    BANDWIDTH,
+    BANDWIDTH_SR
+} MEASURE;
+
+
+/*
+ * Request to the server. Note that most of these must be of type uint32_t
+ * because of the way options are set. The minor version must be changed if
+ * there is a change to this data structure.
+ */
+typedef struct REQ {
+    uint16_t ver_maj; /* Major version */
+    uint16_t ver_min; /* Minor version */
+    uint16_t ver_inc; /* Incremental version */
+    uint16_t req_index; /* Request index (into Tests) */
+    uint32_t flip; /* Flip local/remote node functions */
+    uint32_t access_recv; /* Access data after receiving */
+    uint32_t affinity; /* Processor affinity (0: none, N: cpu N-1) */
+    uint32_t poll_mode; /* Poll mode */
+    uint32_t port; /* Port number requested */
+    uint32_t rd_atomic; /* Number of pending RDMA or atomics */
+    uint32_t timeout; /* Timeout for messages */
+    uint32_t msg_size; /* Message Size */
+    uint32_t mtu_size; /* MTU Size */
+    uint32_t no_msgs; /* Number of messages */
+    uint32_t sock_buf_size; /* Socket buffer size */
+    uint32_t time; /* Duration in seconds */
+    char id[STRSIZE]; /* Identifier */
+    char rate[STRSIZE]; /* Rate (not transferred by enc_req/dec_req) */
+} REQ;
+
+
+/*
+ * Transfer statistics.  One set of counters; STAT holds four of these, for
+ * local and remote sends and receives.
+ */
+typedef struct USTAT {
+    uint64_t no_bytes; /* Number of bytes transferred */
+    uint64_t no_msgs; /* Number of messages */
+    uint64_t no_errs; /* Number of errors */
+} USTAT;
+
+
+/*
+ * Statistics.  The time_s and time_e arrays hold one CLOCK value for each
+ * TIME_INDEX entry.
+ */
+typedef struct STAT {
+    uint32_t no_cpus; /* Number of processors */
+    uint32_t no_ticks; /* Ticks per second */
+    uint32_t max_cqes; /* Maximum CQ entries */
+    CLOCK time_s[T_N]; /* Start times */
+    CLOCK time_e[T_N]; /* End times */
+    USTAT s; /* Send statistics */
+    USTAT r; /* Receive statistics */
+    USTAT rem_s; /* Remote send statistics */
+    USTAT rem_r; /* Remote receive statistics */
+} STAT;
+
+
+/*
+ * Results per node.
+ */
+typedef struct RESN {
+    double time_real; /* Real (elapsed) time in seconds */
+    double time_cpu; /* Cpu time in seconds */
+    double cpu_total; /* Cpu time (as a fraction of a cpu) */
+    double cpu_user; /* User time (fraction of cpu) */
+    double cpu_intr; /* Interrupt time (fraction of cpu) */
+    double cpu_idle; /* Idle time (fraction of cpu) */
+    double cpu_kernel; /* Kernel time (fraction of cpu) */
+    double cpu_io_wait; /* IO wait time (fraction of cpu) */
+} RESN;
+
+
+/*
+ * Results.  Holds one RESN for each of the two nodes followed by the figures
+ * that describe the test as a whole.
+ */
+typedef struct RES {
+    RESN l; /* Local information */
+    RESN r; /* Remote information */
+    double send_bw; /* Send bandwidth */
+    double recv_bw; /* Receive bandwidth */
+    double msg_rate; /* Messaging rate */
+    double send_cost; /* Send cost */
+    double recv_cost; /* Receive cost */
+    double latency; /* Latency */
+} RES;
+
+
+/*
+ * Function prototypes.
+ */
+void client_send_request(void);
+void debug(char *fmt, ...);
+void dec_init(void *p);
+int64_t dec_int(int n);
+void dec_str(char *s, int n);
+void die(void);
+void enc_init(void *p);
+void enc_int(int64_t l, int n);
+void enc_str(char *s, int n);
+int error(char *fmt, ...);
+void error_die(char *fmt, ...);
+void exchange_results(void);
+int left_to_send(long *sentp, int room);
+void opt_check(void);
+void *qmalloc(long n);
+int recv_mesg(void *ptr, int len, char *item);
+int send_mesg(void *ptr, int len, char *item);
+void set_finished(void);
+void setp_u32(char *name, PAR_INDEX index, uint32_t l);
+void setp_str(char *name, PAR_INDEX index, char *s);
+void setv_u32(PAR_INDEX index, uint32_t l);
+void show_results(MEASURE measure);
+void stop_timing(void);
+int synchronize(void);
+int syserror(char *fmt, ...);
+void syserror_die(char *fmt, ...);
+void touch_data(void *p, int n);
+void par_use(PAR_INDEX index);
+
+
+/*
+ * Socket tests (ip.c).
+ */ +void run_client_rds_bw(void); +void run_server_rds_bw(void); +void run_client_rds_lat(void); +void run_server_rds_lat(void); +void run_client_sdp_bw(void); +void run_server_sdp_bw(void); +void run_client_sdp_lat(void); +void run_server_sdp_lat(void); +void run_client_tcp_bw(void); +void run_server_tcp_bw(void); +void run_client_tcp_lat(void); +void run_server_tcp_lat(void); +void run_client_udp_bw(void); +void run_server_udp_bw(void); +void run_client_udp_lat(void); +void run_server_udp_lat(void); + + +/* + * InfiniBand tests (ib.c). + */ +void run_client_bug(void); +void run_server_bug(void); +void run_client_rc_bi_bw(void); +void run_server_rc_bi_bw(void); +void run_client_rc_bw(void); +void run_server_rc_bw(void); +void run_client_rc_compare_swap_mr(void); +void run_server_rc_compare_swap_mr(void); +void run_client_rc_fetch_add_mr(void); +void run_server_rc_fetch_add_mr(void); +void run_client_rc_lat(void); +void run_server_rc_lat(void); +void run_client_rc_rdma_read_bw(void); +void run_server_rc_rdma_read_bw(void); +void run_client_rc_rdma_read_lat(void); +void run_server_rc_rdma_read_lat(void); +void run_client_rc_rdma_write_bw(void); +void run_server_rc_rdma_write_bw(void); +void run_client_rc_rdma_write_lat(void); +void run_server_rc_rdma_write_lat(void); +void run_client_rc_rdma_write_poll_lat(void); +void run_server_rc_rdma_write_poll_lat(void); +void run_client_uc_bi_bw(void); +void run_server_uc_bi_bw(void); +void run_client_uc_bw(void); +void run_server_uc_bw(void); +void run_client_uc_lat(void); +void run_server_uc_lat(void); +void run_client_uc_rdma_write_bw(void); +void run_server_uc_rdma_write_bw(void); +void run_client_uc_rdma_write_lat(void); +void run_server_uc_rdma_write_lat(void); +void run_client_uc_rdma_write_poll_lat(void); +void run_server_uc_rdma_write_poll_lat(void); +void run_client_ud_bi_bw(void); +void run_server_ud_bi_bw(void); +void run_client_ud_bw(void); +void run_server_ud_bw(void); +void run_client_ud_lat(void); +void 
run_server_ud_lat(void); +void run_client_ver_rc_compare_swap(void); +void run_server_ver_rc_compare_swap(void); +void run_client_ver_rc_fetch_add(void); +void run_server_ver_rc_fetch_add(void); + + +/* + * Variables. + */ +extern RES Res; +extern REQ Req; +extern STAT LStat; +extern char *Usage[]; +extern char *TestName; +extern char *ServerName; +extern int Successful; +extern volatile int Finished;