commit 529b2a4458161aa125a5c7ea68c0f1c878207e44 Author: Johann George (QLogic) Date: Mon Sep 24 11:11:24 2007 -0700 Initial commit diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d511905 --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. 
You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year>  <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4fe250b --- /dev/null +++ b/Makefile @@ -0,0 +1,18 @@ +CC = gcc +CFLAGS = -Wall -O -DRDMA + +all: qperf + +qperf: qperf.o ip.o ib.o help.o + $(CC) -DRDMA -o $@ $^ -libverbs + +help.c: help.txt + ./mkhelp RDMA + +.PHONY: clean +clean: + rm -f *.o help.c qperf + +.PHONY: install +install: + cp qperf /usr/local/bin diff --git a/configure b/configure new file mode 100755 index 0000000..b983a08 --- /dev/null +++ b/configure @@ -0,0 +1,55 @@ +#!/bin/sh +# +LIBIBV=libibverbs.so.1 + +# Look for InfiniBand verbs library. +# +for DIR in /lib /usr/lib64 /usr/lib /usr/local/lib /usr/local/ibed/lib64 ""; do + [ -e $DIR/$LIBIBV ] && + break +done + + +# Set up parameters. +# +if [ "`uname -m`" = ppc64 ]; then + K_M64=-m64 + echo "Making PowerPC version" +fi +if [ -z "$DIR" ]; then + echo "Could not find $LIBIBV; making non-RDMA version" +else + echo "Found $LIBIBV in $DIR" + K_IB_O="ib.o" + K_DEF_IB="-DRDMA" + if [ "$DIR" = "/usr/lib64" ]; then + K_LIBS="-libverbs" + else + K_LIBS="-lsysfs -Wl,--rpath -Wl,$DIR $DIR/$LIBIBV" + fi + K_IB=RDMA +fi + + +# Produce Makefile. The here-document delimiter is unquoted, so the K_* settings +# above are expanded now while the \$-escaped names are left for make to expand. +cat <<EOF >Makefile +CC = gcc $K_M64 +CFLAGS = -Wall -O $K_DEF_IB + +all: qperf + +qperf: qperf.o ip.o $K_IB_O help.o + \$(CC) $K_DEF_IB -o \$@ \$^ $K_LIBS + +help.c: help.txt + ./mkhelp $K_IB + +.PHONY: clean +clean: + rm -f *.o help.c qperf + +.PHONY: install +install: + cp qperf /usr/local/bin +EOF diff --git a/help.txt b/help.txt new file mode 100644 index 0000000..d253b68 --- /dev/null +++ b/help.txt @@ -0,0 +1,708 @@ +Main + Usage: + qperf [options] ... + Description + One typically runs qperf with no arguments on the server machine. On a + client machine, one specifies the hostname of the server machine + followed by a list of tests that might be run. 
+ More Information + qperf --help examples Some examples of using qperf + qperf --help opts Summary of options + qperf --help options Description of options + qperf --help tests Short summary and description of tests + qperf --help TestName More information on a particular test +Opts + --access_recv Mode (-Ar) Access receive data + --affinity PN (-a) Set processor affinity + --loc_affinity PN (-la) Set local processor affinity + --rem_affinity PN (-ra) Set remote processor affinity + --flip Mode (-f) Flip sender and receiver + --help Topic (-h) Get more information on a topic + --host Node (-H) Identify server node + --id Device:Port (-i) Set IB device and port + --loc_id Device:Port (-li) Set local IB device and port + --rem_id Device:Port (-ri) Set remote IB device and port + --listen_port Port (-lp) Set server listen port + --msg_size Size (-m) Set message size + --mtu_size Size (-M) Set MTU size (IB only) + --no_msgs Count (-n) Send Count messages + --poll Mode (-P) Set polling mode on/off + --loc_poll Mode (-lP) Set local polling mode on/off + --rem_poll Mode (-rP) Set remote polling mode on/off + --port Port (-p) Set TCP port used for tests + --precision Digits (-e) Set precision reported + --rate (-r) Set IB static rate + --loc_rate (-lr) Set local IB static rate + --rem_rate (-rr) Set remote IB static rate + --rd_atomic Count (-R) Set RDMA read/atomic count + --loc_rd_atomic Count (-lR) Set local RDMA read/atomic count + --rem_rd_atomic Count (-rR) Set remote RDMA read/atomic count + --sock_buf_size Size (-S) Set socket buffer size + --loc_sock_buf_size Size (-lS) Set local socket buffer size + --rem_sock_buf_size Size (-rS) Set remote socket buffer size + --time (-t) Set test duration + --timeout Time (-T) Set timeout + --loc_timeout Time (-lT) Set local timeout + --rem_timeout Time (-rT) Set remote timeout + --unify_nodes (-U) Unify nodes + --unify_units (-u) Unify units + --verbose (-v) Verbose; turn on all of -v[cstu] + --verbose_conf (-vc) Show configuration 
information + --verbose_stat (-vs) Show statistical information + --verbose_time (-vt) Show timing information + --verbose_used (-vu) Show information on parameters + --verbose_more (-vv) More verbose; turn on all of -v[CSTU] + --verbose_more_conf (-vC) Show more configuration information + --verbose_more_stat (-vS) Show more statistical information + --verbose_more_time (-vT) Show more timing information + --verbose_more_used (-vU) Show more information on parameters + --version (-V) Print out version + --wait Time (-W) Set time to wait for server +Options + --access_recv Mode (-Ar) + If Mode is non-zero, data is accessed once received. Otherwise, data + is ignored. By default, Mode is 0. + --affinity PN (-a) + Set processor affinity to PN. Processors are numbered sequentially + from 0. If PN is "any", any processor is allowed; otherwise the + processor is limited to the one specified. + --loc_affinity PN (-la) + Set local processor affinity to PN. + --rem_affinity PN (-ra) + Set remote processor affinity to PN. + --flip Mode (-f) + Cause sender and receiver to play opposite roles. + --help Topic (-h) + Print out information about Topic. To see the list of topics, type + qperf --help + --host Host (-H) + Run test between the current node and the qperf running on node Host. + This can also be specified as the first non-option argument. + --id Device:Port (-i) + Use InfiniBand Device and Port. + --loc_id Device:Port (-li) + Use local InfiniBand Device and Port. + --rem_id Device:Port (-ri) + Use remote InfiniBand Device and Port. + --listen_port Port (-lp) + Set the port we listen on to Port. This must be set to the + same port on both the server and client machines. The default value + is 19765. + --msg_size Size (-m) + Set the message size to Size. The default value varies by test. 
It + is assumed that the value is specified in bytes however, a trailing + kib or K, mib or M, or gib or G indicates that the size is being + specified in kibibytes, mebibytes or gibibytes respectively while a + trailing kb or k, mb or m, or gb or g indicates kilobytes, megabytes + or gigabytes respectively. + --mtu_size Size (-M) + Set the MTU size. Only relevant to the RDMA UC/RC tests. Units are + specified in the same manner as the --msg_size option. + --no_msgs N (-n) + Set test duration by number of messages sent instead of time. + --poll Mode (-P) + Turn polling mode on or off. This is only relevant to the RDMA tests + and determines whether they poll or wait. If Mode is 0, they wait; + otherwise they poll. + --loc_poll Mode (-lP) + Locally turn polling mode on or off. + --rem_poll Mode (-rP) + Remotely turn polling mode on or off. + --port Port (-p) + Use Port to run the socket tests. This is different from + --listen_port which is used for synchronization. This is only + relevant for the socket tests and refers to the TCP/UDP/SDP/RDS port + that the test is run on. + --precision Digits (-e) + Set the number of significant digits that are used to report results. + --rate Rate (-r) + Force InfiniBand static rate. Rate can be one of: 2.5, 5, 10, 20, + 30, 40, 60, 80, 120, 1xSDR (2.5 Gbps), 1xDDR (5 Gbps), 1xQDR (10 + Gbps), 4xSDR (2.5 Gbps), 4xDDR (5 Gbps), 4xQDR (10 Gbps), 8xSDR (2.5 + Gbps), 8xDDR (5 Gbps), 8xQDR (10 Gbps). + --loc_rate (-lr) + Force local InfiniBand static rate + --rem_rate (-rr) + Force remote InfiniBand static rate + --rd_atomic Count (-R) + Set the number of in-flight operations that can be handled for a RDMA + read or atomic operation. This is only relevant to the RDMA tests. + --loc_rd_atomic Count (-lR) + Set local read/atomic count. + --rem_rd_atomic Count (-rR) + Set remote read/atomic count. + --sock_buf_size Size (-S) + Set the socket buffer size. This is only relevant to the socket + tests. 
+ --loc_sock_buf_size Size (-lS) + Set local socket buffer size. + --rem_sock_buf_size Size (-rS) + Set remote socket buffer size. + --time Time (-t) + Set test duration to Time. Specified in seconds; however, a trailing + m, h or d indicates that the time is specified in minutes, hours or + days respectively. + --timeout Time (-T) + Set timeout to Time. This is the timeout used for various things + such as exchanging messages. The default is 5 seconds. + --loc_timeout Time (-lT) + Set local timeout to Time. + --rem_timeout Time (-rT) + Set remote timeout to Time. + --unify_nodes (-U) + Unify the nodes. Describe them in terms of local and remote rather + than send and receive. + --unify_units (-u) + Unify the units that results are shown in. Uses the lowest common + denominator. Helpful for scripts. + --verbose (-v) + Provide more detailed output. Turns on -vc, -vs, -vt and -vu. + --verbose_conf (-vc) + Provide information on configuration. + --verbose_stat (-vs) + Provide information on statistics. + --verbose_time (-vt) + Provide information on timing. + --verbose_used (-vu) + Provide information on parameters used. + --verbose_more (-vv) + Provide even more detailed output. Turns on -vC, -vS, -vT and -vU. + --verbose_more_conf (-vC) + Provide more information on configuration. + --verbose_more_stat (-vS) + Provide more information on statistics. + --verbose_more_time (-vT) + Provide more information on timing. + --verbose_more_used (-vU) + Provide more information on parameters used. + --version (-V) + The current version of qperf is printed. + --wait Time (-W) + If the server is not ready, continue to try connecting for Time + seconds before giving up. +Examples + For these examples, we assume that qperf is running on a machine called + myserver in server mode. To run qperf in server mode, run it with no + arguments. In all the subsequent examples, we run qperf on another machine + and connect to the server which we assume has a hostname of myserver. 
+ * To run a TCP bandwidth and latency test: + qperf myserver tcp_bw tcp_lat + * To run an SDP bandwidth test for 10 seconds: + qperf -t 10 myserver sdp_bw + * To run a UDP latency test and then cause the server to terminate: + qperf myserver udp_lat quit + * To measure the RDMA UD latency and bandwidth: + qperf myserver ud_lat ud_bw + * To measure RDMA RC bi-directional bandwidth: + qperf myserver rc_bi_bw +Tests -RDMA + Miscellaneous + conf Show configuration + quit Cause the server to quit + Socket Based + tcp_bw TCP streaming one way bandwidth + tcp_lat TCP one way latency + udp_bw UDP streaming one way bandwidth + udp_lat UDP one way latency + sdp_bw SDP streaming one way bandwidth + sdp_lat SDP one way latency + rds_bw RDS streaming one way bandwidth + rds_lat RDS one way latency +Tests +RDMA + Miscellaneous + conf Show configuration + quit Cause the server to quit + Socket Based + tcp_bw TCP streaming one way bandwidth + tcp_lat TCP one way latency + udp_bw UDP streaming one way bandwidth + udp_lat UDP one way latency + sdp_bw SDP streaming one way bandwidth + sdp_lat SDP one way latency + rds_bw RDS streaming one way bandwidth + rds_lat RDS one way latency + RDMA Send/Receive + ud_bw UD streaming one way bandwidth + ud_bi_bw UD streaming two way bandwidth + ud_lat UD one way latency + rc_bw RC streaming one way bandwidth + rc_bi_bw RC streaming two way bandwidth + rc_lat RC one way latency + uc_bw UC streaming one way bandwidth + uc_bi_bw UC streaming two way bandwidth + uc_lat UC one way latency + RDMA + rc_rdma_read_bw RC RDMA read streaming one way bandwidth + rc_rdma_read_lat RC RDMA read one way latency + rc_rdma_write_bw RC RDMA write streaming one way bandwidth + rc_rdma_write_lat RC RDMA write one way latency + rc_rdma_write_poll_lat RC RDMA write one way polling latency + uc_rdma_write_bw UC RDMA write streaming one way bandwidth + uc_rdma_write_lat UC RDMA write one way latency + uc_rdma_write_poll_lat UC RDMA write one way polling latency + 
InfiniBand Atomics + rc_compare_swap_mr RC compare and swap messaging rate + rc_fetch_add_mr RC fetch and add messaging rate + Verification + ver_rc_compare_swap verify RC compare and swap + ver_rc_fetch_add verify RC fetch and add +conf + Purpose + Show configuration + Common Options + None + Description + Shows the node name, CPUs and OS of both nodes being used. +quit + Purpose + Quit + Common Options + None + Description + Causes the server to quit. +tcp_bw + Purpose + TCP streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. +tcp_lat + Purpose + TCP one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using TCP sockets. +udp_bw + Purpose + UDP streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. 
+udp_lat + Purpose + UDP one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using UDP sockets. +sdp_bw + Purpose + SDP streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. +sdp_lat + Purpose + SDP one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using SDP sockets. +rds_bw + Purpose + RDS streaming one way bandwidth + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly sends messages to the server while the server + notes how many were received. 
+rds_lat + Purpose + RDS one way latency + Common Options + --affinity PN (-a) set processor affinity + --msg_size Size (-m) set message size + --sock_buf_size Size (-S) set socket buffer size + --time (-t) set test duration + Other Options + --listen_port, --port, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using RDS sockets. +ud_bw +RDMA + Purpose + UD streaming one way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client sends messages to the server who notes how many it received. + The UD Send/Receive mechanism is used. +ud_bi_bw +RDMA + Purpose + UD streaming two way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Both the client and server exchange messages with each other using the + UD Send/Receive mechanism and note how many were received. 
+ud_lat +RDMA + Purpose + UD one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using UD Send/Receive. +rc_bw +RDMA + Purpose + RC streaming one way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client sends messages to the server who notes how many it received. + The RC Send/Receive mechanism is used. +rc_bi_bw +RDMA + Purpose + RC streaming two way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Both the client and server exchange messages with each other using the + RC Send/Receive mechanism and note how many were received. 
+rc_lat +RDMA + Purpose + RC one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using RC Send/Receive. +uc_bw +RDMA + Purpose + UC streaming one way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client sends messages to the server who notes how many it received. + The UC Send/Receive mechanism is used. +uc_bi_bw +RDMA + Purpose + UC streaming two way bandwidth + Common Options + --access_recv OnOff (-Ar) Access receive data + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Both the client and server exchange messages with each other using the + UC Send/Receive mechanism and note how many were received. 
+uc_lat +RDMA + Purpose + UC one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + repeatedly using UC Send/Receive. +rc_rdma_read_bw +RDMA + Purpose + RC RDMA read streaming one way bandwidth + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs RC RDMA Read operations and notes how + many of them complete. +rc_rdma_read_lat +RDMA + Purpose + RC RDMA read one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs RC RDMA Read operations waiting for + completion before starting the next one. 
+rc_rdma_write_bw +RDMA + Purpose + RC RDMA write streaming one way bandwidth + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs RC RDMA Write operations and notes how + many of them complete. +rc_rdma_write_lat +RDMA + Purpose + RC RDMA write one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + using RC RDMA write operations. +rc_rdma_write_poll_lat +RDMA + Purpose + RC RDMA write one way polling latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test using RC RDMA Write operations. First the + client performs an RDMA Write while the server stays in a tight loop + waiting for the memory buffer to change. The first and last bytes of + the memory buffer are tested to ensure that the entire message was + received. This is then repeated with both sides playing opposite + roles. Since this always polls, the -P (--poll) flag has no effect. 
+uc_rdma_write_bw +RDMA + Purpose + UC RDMA write streaming one way bandwidth + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs UC RDMA Write operations and notes how + many of them complete. +uc_rdma_write_lat +RDMA + Purpose + UC RDMA write one way latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test where the server and client exchange messages + using UC RDMA write operations. +uc_rdma_write_poll_lat +RDMA + Purpose + UC RDMA write one way polling latency + Common Options + --id Device:Port (-i) Set IB device and port + --msg_size Size (-m) Set message size + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + A ping pong latency test using UC RDMA Write operations. First the + client performs an RDMA Write while the server stays in a tight loop + waiting for the memory buffer to change. The first and last bytes of + the memory buffer are tested to ensure that the entire message was + received. This is then repeated with both sides playing opposite + roles. Since this always polls, the -P (--poll) flag has no effect. 
+rc_compare_swap_mr +RDMA + Purpose + RC compare and swap messaging rate + Common Options + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs the RC Atomic Compare and Swap operation + and determines how many of them complete. +rc_fetch_add_mr +RDMA + Purpose + RC fetch and add messaging rate + Common Options + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + The client repeatedly performs the RC Atomic Fetch and Add operation + and determines how many of them complete. +ver_rc_compare_swap +RDMA + Purpose + Verify RC compare and swap + Common Options + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Tests the RC Compare and Swap Atomic operation. The server's memory + location starts with zero and the client successively exchanges 0 for + 1, 1 for 2, etc. The results are checked for correctness.
+ver_rc_fetch_add +RDMA + Purpose + Verify RC fetch and add + Common Options + --affinity PN (-a) Set processor affinity + --id Device:Port (-i) Set IB device and port + --poll OnOff (-P) Set polling mode on/off + --time (-t) Set test duration + Other Options + --affinity, --listen_port, --mtu_size, --rate, --rd_atomic, --timeout + Display Options + --precision, --unify_nodes, --unify_units, --verbose + Description + Tests the RC Fetch and Add Atomic operation. The server's memory + location starts with zero and the client successively adds one. The + results are checked for correctness. diff --git a/ib.c b/ib.c new file mode 100644 index 0000000..233fbc6 --- /dev/null +++ b/ib.c @@ -0,0 +1,2231 @@ +/* + * qperf - handle RDMA tests. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include "qperf.h" + + +/* + * RDMA parameters. + */ +#define QKEY 0x11111111 /* Q_Key */ +#define NCQE 1024 /* Number of CQ entries; also the size of the work completion arrays polled by the tests */ +#define GRH_SIZE 40 /* IB GRH size */ +#define MTU_SIZE 2048 /* Default MTU Size */ +#define RETRY_CNT 7 /* RC/UC retry count */ +#define RNR_RETRY 7 /* RC/UC RNR retry count */ +#define RNR_TIMER 12 /* RC/UC RNR timeout */ +#define TIMEOUT 14 /* RC/UC timeout */ + + +/* + * Work request IDs. The test loops read these back from the wr_id field + * of each work completion to tell send, receive and RDMA completions apart. + */ +#define WRID_SEND 1 /* Send */ +#define WRID_RECV 2 /* Receive */ +#define WRID_RDMA 3 /* RDMA */ + + +/* + * Size constants. They are passed to ib_params_msgs() as the message + * size argument: K2 by the UD bandwidth tests, K64 by the RC and UC + * bandwidth tests. + */ +#define K2 (2*1024) +#define K64 (64*1024) + + +/* + * For convenience: a short alias for the verbs work request opcode enum. + */ +typedef enum ibv_wr_opcode OPCODE; + + +/* + * Atomics. Selects which atomic operation ib_client_atomic() posts. + */ +typedef enum ATOMIC { + FETCH_ADD, /* Fetch and add */ + COMPARE_SWAP /* Compare and swap */ +} ATOMIC; + + +/* + * IO Mode. Selects whether the ping-pong latency loop posts with + * ib_post_send() (IO_SR) or ib_post_rdma() (IO_RDMA). + */ +typedef enum IOMODE { + IO_SR, /* Send/Receive */ + IO_RDMA /* RDMA */ +} IOMODE; + + +/* + * RDMA connection context. + */ +typedef struct IBCON { + uint32_t lid; /* Local ID */ + uint32_t qpn; /* Queue pair number */ + uint32_t psn; /* Packet sequence number */ + uint32_t rkey; /* Remote key */ + uint64_t vaddr; /* Virtual address */ +} IBCON; + + +/* + * RDMA descriptor.
+ */ +typedef struct IBDEV { + IBCON lcon; /* Local context */ + IBCON rcon; /* Remote context */ + int mtu; /* MTU */ + int port; /* Port */ + int rate; /* Rate */ + int trans; /* QP transport */ + int maxinline; /* Maximum amount of inline data */ + char *buffer; /* Buffer; the receive paths touch it and the atomic verify tests read results from it */ + struct ibv_device **devlist; /* Device list */ + struct ibv_context *context; /* Context */ + struct ibv_comp_channel *channel; /* Channel */ + struct ibv_pd *pd; /* Protection domain */ + struct ibv_mr *mr; /* Memory region */ + struct ibv_cq *cq; /* Completion queue */ + struct ibv_qp *qp; /* QPair */ + struct ibv_ah *ah; /* Address handle */ +} IBDEV; + + +/* + * Names associated with a value: pairs an integer constant with a + * printable name. Element type of the CQErrors and Opcodes tables. + */ +typedef struct NAMES { + int value; /* Value */ + char *name; /* Name */ +} NAMES; + + +/* + * RDMA speeds and names: pairs a rate name with an IBV_RATE_* value. + * Element type of the Rates table. + */ +typedef struct RATES { + const char *name; /* Name */ + uint32_t rate; /* Rate */ +} RATES; + + +/* + * Function prototypes for the file-local (static) functions. The test + * code treats a zero return from the int-returning ib_* helpers as + * failure; ib_poll() is additionally checked for a negative return. + */ +static void cq_error(int status); +static void dec_ibcon(IBCON *host); +static int do_error(int status, uint64_t *errors); +static void enc_ibcon(IBCON *host); +static void ib_bi_bw(int transport); +static void ib_client_atomic(ATOMIC atomic); +static void ib_client_bw(int transport); +static void ib_client_rdma_bw(int transport, OPCODE opcode); +static void ib_client_rdma_read_lat(int transport); +static void ib_close(IBDEV *ibdev); +static void ib_debug_info(IBDEV *ibdev); +static int ib_init(IBDEV *ibdev); +static int ib_mralloc(IBDEV *ibdev, int size); +static int ib_open(IBDEV *ibdev, int trans, int maxSendWR, int maxRecvWR); +static void ib_params_atomics(void); +static void ib_params_msgs(long msgSize, int use_poll_mode); +static int ib_poll(IBDEV *ibdev, struct ibv_wc *wc, int nwc); +static int ib_post_rdma(IBDEV *ibdev, OPCODE opcode, int n); +static int ib_post_compare_swap(IBDEV *ibdev, + int wrid, int offset, uint64_t compare, uint64_t swap); +static int ib_post_fetch_add(IBDEV *ibdev, + int wrid, int offset, uint64_t add);
+static int ib_post_recv(IBDEV *ibdev, int n); +static int ib_post_send(IBDEV *ibdev, int n); +static void ib_pp_lat(int transport, IOMODE iomode); +static void ib_pp_lat_loop(IBDEV *ibdev, IOMODE iomode); +static int ib_prepare(IBDEV *ibdev); +static void ib_rdma_write_poll_lat(int transport); +static void ib_server_def(int transport); +static void ib_server_nop(int transport); +static char *opcode_name(int opcode); + + +/* + * List of errors we can get from a CQE: each IBV_WC_* completion status + * paired with a short description. + */ +NAMES CQErrors[] ={ + { IBV_WC_SUCCESS, "Success" }, + { IBV_WC_LOC_LEN_ERR, "Local length error" }, + { IBV_WC_LOC_QP_OP_ERR, "Local QP operation failure" }, + { IBV_WC_LOC_EEC_OP_ERR, "Local EEC operation failure" }, + { IBV_WC_LOC_PROT_ERR, "Local protection error" }, + { IBV_WC_WR_FLUSH_ERR, "WR flush failure" }, + { IBV_WC_MW_BIND_ERR, "Memory window bind failure" }, + { IBV_WC_BAD_RESP_ERR, "Bad response" }, + { IBV_WC_LOC_ACCESS_ERR, "Local access failure" }, + { IBV_WC_REM_INV_REQ_ERR, "Remote invalid request" }, + { IBV_WC_REM_ACCESS_ERR, "Remote access failure" }, + { IBV_WC_REM_OP_ERR, "Remote operation failure" }, + { IBV_WC_RETRY_EXC_ERR, "Retries exceeded" }, + { IBV_WC_RNR_RETRY_EXC_ERR, "RNR retry exceeded" }, + { IBV_WC_LOC_RDD_VIOL_ERR, "Local RDD violation" }, + { IBV_WC_REM_INV_RD_REQ_ERR, "Remote invalid read request" }, + { IBV_WC_REM_ABORT_ERR, "Remote abort" }, + { IBV_WC_INV_EECN_ERR, "Invalid EECN" }, + { IBV_WC_INV_EEC_STATE_ERR, "Invalid EEC state" }, + { IBV_WC_FATAL_ERR, "Fatal error" }, + { IBV_WC_RESP_TIMEOUT_ERR, "Responder timeout" }, + { IBV_WC_GENERAL_ERR, "General error" }, +}; + + +/* + * Opcodes.
+ */ +NAMES Opcodes[] ={ + { IBV_WR_ATOMIC_CMP_AND_SWP, "compare and swap" }, + { IBV_WR_ATOMIC_FETCH_AND_ADD, "fetch and add" }, + { IBV_WR_RDMA_READ, "rdma read" }, + { IBV_WR_RDMA_WRITE, "rdma write" }, + { IBV_WR_RDMA_WRITE_WITH_IMM, "rdma write with immediate" }, + { IBV_WR_SEND, "send" }, + { IBV_WR_SEND_WITH_IMM, "send with immediate" }, +}; + + +/* + * Opcodes. + */ +RATES Rates[] ={ + { "", IBV_RATE_MAX }, + { "max", IBV_RATE_MAX }, + { "1xSDR", IBV_RATE_2_5_GBPS }, + { "1xDDR", IBV_RATE_5_GBPS }, + { "1xQDR", IBV_RATE_10_GBPS }, + { "4xSDR", IBV_RATE_10_GBPS }, + { "4xDDR", IBV_RATE_20_GBPS }, + { "4xQDR", IBV_RATE_40_GBPS }, + { "8xSDR", IBV_RATE_20_GBPS }, + { "8xDDR", IBV_RATE_40_GBPS }, + { "8xQDR", IBV_RATE_80_GBPS }, + { "2.5", IBV_RATE_2_5_GBPS }, + { "5", IBV_RATE_5_GBPS }, + { "10", IBV_RATE_10_GBPS }, + { "20", IBV_RATE_20_GBPS }, + { "30", IBV_RATE_30_GBPS }, + { "40", IBV_RATE_40_GBPS }, + { "60", IBV_RATE_60_GBPS }, + { "80", IBV_RATE_80_GBPS }, + { "120", IBV_RATE_120_GBPS }, +}; + + +/* + * Experimental (client side). + */ +void +run_client_experimental(void) +{ + IBDEV ibdev; + + ib_params_msgs(K64, 1); + if (!ib_open(&ibdev, IBV_QPT_UC, 1, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1)) + goto err; + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Experimental (server side). 
+ */ +void +run_server_experimental(void) +{ + IBDEV ibdev; + int found = 0; + + if (!ib_open(&ibdev, IBV_QPT_UC, 0, 1)) + return; + if (!ib_init(&ibdev)) + goto err; + if (!ib_post_recv(&ibdev, 1)) + goto err; + if (!synchronize()) + goto err; + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (n < 0) + goto err; + if (n) { + found = 1; + break; + } + } + if (found) + printf("Received immediate data\n"); + else + printf("Failed to received immediate data\n"); + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Measure RC bi-directional bandwidth (client side). + */ +void +run_client_rc_bi_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + ib_params_msgs(K64, 1); + ib_bi_bw(IBV_QPT_RC); + show_results(BANDWIDTH); +} + + +/* + * Measure RC bi-directional bandwidth (server side). + */ +void +run_server_rc_bi_bw(void) +{ + ib_bi_bw(IBV_QPT_RC); +} + + +/* + * Measure RC bandwidth (client side). + */ +void +run_client_rc_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + par_use(L_NO_MSGS); + par_use(R_NO_MSGS); + ib_params_msgs(K64, 1); + ib_client_bw(IBV_QPT_RC); + show_results(BANDWIDTH); +} + + +/* + * Measure RC bandwidth (server side). + */ +void +run_server_rc_bw(void) +{ + ib_server_def(IBV_QPT_RC); +} + + +/* + * Measure RC compare and swap messaging rate (client side). + */ +void +run_client_rc_compare_swap_mr(void) +{ + ib_client_atomic(COMPARE_SWAP); +} + + +/* + * Measure RC compare and swap messaging rate (server side). + */ +void +run_server_rc_compare_swap_mr(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC fetch and add messaging rate (client side). + */ +void +run_client_rc_fetch_add_mr(void) +{ + ib_client_atomic(FETCH_ADD); +} + + +/* + * Measure RC fetch and add messaging rate (server side). + */ +void +run_server_rc_fetch_add_mr(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC latency (client side). 
+ */ +void +run_client_rc_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_RC, IO_SR); +} + + +/* + * Measure RC latency (server side). + */ +void +run_server_rc_lat(void) +{ + ib_pp_lat(IBV_QPT_RC, IO_SR); +} + + +/* + * Measure RC RDMA read bandwidth (client side). + */ +void +run_client_rc_rdma_read_bw(void) +{ + par_use(L_RD_ATOMIC); + par_use(R_RD_ATOMIC); + ib_params_msgs(K64, 1); + ib_client_rdma_bw(IBV_QPT_RC, IBV_WR_RDMA_READ); + show_results(BANDWIDTH); +} + + +/* + * Measure RC RDMA read bandwidth (server side). + */ +void +run_server_rc_rdma_read_bw(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA read latency (client side). + */ +void +run_client_rc_rdma_read_lat(void) +{ + ib_params_msgs(1, 1); + ib_client_rdma_read_lat(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA read latency (server side). + */ +void +run_server_rc_rdma_read_lat(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA write bandwidth (client side). + */ +void +run_client_rc_rdma_write_bw(void) +{ + ib_params_msgs(K64, 1); + ib_client_rdma_bw(IBV_QPT_RC, IBV_WR_RDMA_WRITE_WITH_IMM); + show_results(BANDWIDTH); +} + + +/* + * Measure RC RDMA write bandwidth (server side). + */ +void +run_server_rc_rdma_write_bw(void) +{ + ib_server_def(IBV_QPT_RC); +} + + +/* + * Measure RC RDMA write latency (client side). + */ +void +run_client_rc_rdma_write_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_RC, IO_RDMA); +} + + +/* + * Measure RC RDMA write latency (server side). + */ +void +run_server_rc_rdma_write_lat(void) +{ + ib_pp_lat(IBV_QPT_RC, IO_RDMA); +} + + +/* + * Measure RC RDMA write polling latency (client side). + */ +void +run_client_rc_rdma_write_poll_lat(void) +{ + ib_params_msgs(1, 0); + ib_rdma_write_poll_lat(IBV_QPT_RC); + show_results(LATENCY); +} + + +/* + * Measure RC RDMA write polling latency (server side). 
+ */ +void +run_server_rc_rdma_write_poll_lat(void) +{ + ib_rdma_write_poll_lat(IBV_QPT_RC); +} + + +/* + * Measure UC bi-directional bandwidth (client side). + */ +void +run_client_uc_bi_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + ib_params_msgs(K64, 1); + ib_bi_bw(IBV_QPT_UC); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UC bi-directional bandwidth (server side). + */ +void +run_server_uc_bi_bw(void) +{ + ib_bi_bw(IBV_QPT_UC); +} + + +/* + * Measure UC bandwidth (client side). + */ +void +run_client_uc_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + par_use(L_NO_MSGS); + par_use(R_NO_MSGS); + ib_params_msgs(K64, 1); + ib_client_bw(IBV_QPT_UC); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UC bandwidth (server side). + */ +void +run_server_uc_bw(void) +{ + ib_server_def(IBV_QPT_UC); +} + + +/* + * Measure UC latency (client side). + */ +void +run_client_uc_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_UC, IO_SR); +} + + +/* + * Measure UC latency (server side). + */ +void +run_server_uc_lat(void) +{ + ib_pp_lat(IBV_QPT_UC, IO_SR); +} + + +/* + * Measure UC RDMA write bandwidth (client side). + */ +void +run_client_uc_rdma_write_bw(void) +{ + ib_params_msgs(K64, 1); + ib_client_rdma_bw(IBV_QPT_UC, IBV_WR_RDMA_WRITE_WITH_IMM); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UC RDMA write bandwidth (server side). + */ +void +run_server_uc_rdma_write_bw(void) +{ + ib_server_def(IBV_QPT_UC); +} + + +/* + * Measure UC RDMA write latency (client side). + */ +void +run_client_uc_rdma_write_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_UC, IO_RDMA); +} + + +/* + * Measure UC RDMA write latency (server side). + */ +void +run_server_uc_rdma_write_lat(void) +{ + ib_pp_lat(IBV_QPT_UC, IO_RDMA); +} + + +/* + * Measure UC RDMA write polling latency (client side). 
+ */ +void +run_client_uc_rdma_write_poll_lat(void) +{ + ib_params_msgs(1, 1); + ib_rdma_write_poll_lat(IBV_QPT_UC); + show_results(LATENCY); +} + + +/* + * Measure UC RDMA write polling latency (server side). + */ +void +run_server_uc_rdma_write_poll_lat(void) +{ + ib_rdma_write_poll_lat(IBV_QPT_UC); +} + + +/* + * Measure UD bi-directional bandwidth (client side). + */ +void +run_client_ud_bi_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + ib_params_msgs(K2, 1); + ib_bi_bw(IBV_QPT_UD); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UD bi-directional bandwidth (server side). + */ +void +run_server_ud_bi_bw(void) +{ + ib_bi_bw(IBV_QPT_UD); +} + + +/* + * Measure UD bandwidth (client side). + */ +void +run_client_ud_bw(void) +{ + par_use(L_ACCESS_RECV); + par_use(R_ACCESS_RECV); + par_use(L_NO_MSGS); + par_use(R_NO_MSGS); + ib_params_msgs(K2, 1); + ib_client_bw(IBV_QPT_UD); + show_results(BANDWIDTH_SR); +} + + +/* + * Measure UD bandwidth (server side). + */ +void +run_server_ud_bw(void) +{ + ib_server_def(IBV_QPT_UD); +} + + +/* + * Measure UD latency (client side). + */ +void +run_client_ud_lat(void) +{ + ib_params_msgs(1, 1); + ib_pp_lat(IBV_QPT_UD, IO_SR); +} + + +/* + * Measure UD latency (server side). + */ +void +run_server_ud_lat(void) +{ + ib_pp_lat(IBV_QPT_UD, IO_SR); +} + +/* + * Verify RC compare and swap (client side). 
+ */ +void +run_client_ver_rc_compare_swap(void) +{ + IBDEV ibdev; + uint64_t *result; + uint64_t last = 0; + uint64_t cur = 0; + uint64_t next = 0x0123456789abcdefULL; + int i; + int size; + + ib_params_atomics(); + if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0)) + goto err; + size = Req.rd_atomic * sizeof(uint64_t); + setv_u32(L_MSG_SIZE, size); + setv_u32(R_MSG_SIZE, size); + ib_mralloc(&ibdev, size); + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + for (i = 0; i < Req.rd_atomic; ++i) { + if (!ib_post_compare_swap(&ibdev, i, i*sizeof(uint64_t), cur, next)) + goto err; + cur = next; + next = cur + 1; + } + result = (uint64_t *) ibdev.buffer; + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + uint64_t res; + + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int x = wc[i].wr_id; + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.rem_r.no_bytes += sizeof(uint64_t); + LStat.rem_r.no_msgs++; + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + res = result[x]; + if (last != res) { + error("compare and swap doesn't match (expected %llx vs. %llx)", + (long long)last, (long long)res); + goto err; + } + if (last) + last++; + else + last = 0x0123456789abcdefULL; + next = cur + 1; + if (!ib_post_compare_swap(&ibdev, x, x*sizeof(uint64_t), + cur, next)) + goto err; + cur = next; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(MSG_RATE); +} + + +/* + * Verify RC compare and swap (server side). + */ +void +run_server_ver_rc_compare_swap(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Verify RC fetch and add (client side). 
+ */ +void +run_client_ver_rc_fetch_add(void) +{ + IBDEV ibdev; + uint64_t *result; + uint64_t last = 0; + int i; + int size; + + ib_params_atomics(); + if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0)) + goto err; + size = Req.rd_atomic * sizeof(uint64_t); + setv_u32(L_MSG_SIZE, size); + setv_u32(R_MSG_SIZE, size); + ib_mralloc(&ibdev, size); + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + for (i = 0; i < Req.rd_atomic; ++i) { + if (!ib_post_fetch_add(&ibdev, i, i*sizeof(uint64_t), 1)) + goto err; + } + result = (uint64_t *) ibdev.buffer; + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + uint64_t res; + + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int x = wc[i].wr_id; + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.rem_r.no_bytes += sizeof(uint64_t); + LStat.rem_r.no_msgs++; + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + res = result[x]; + if (last != res) { + error("fetch and add doesn't match (expected %llx vs. %llx)", + (long long)last, (long long)res); + goto err; + } + last++; + if (!ib_post_fetch_add(&ibdev, x, x*sizeof(uint64_t), 1)) + goto err; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(MSG_RATE); +} + + +/* + * Verify RC fetch and add (server side). + */ +void +run_server_ver_rc_fetch_add(void) +{ + ib_server_nop(IBV_QPT_RC); +} + + +/* + * Measure messaging rate for an atomic operation. + */ +static void +ib_client_atomic(ATOMIC atomic) +{ + int i; + int r; + IBDEV ibdev; + + ib_params_atomics(); + if (!ib_open(&ibdev, IBV_QPT_RC, NCQE, 0)) + goto err; + setv_u32(L_MSG_SIZE, sizeof(uint64_t)); + setv_u32(R_MSG_SIZE, sizeof(uint64_t)); + ib_mralloc(&ibdev, sizeof(uint64_t)); + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + for (i = 0; i < Req.rd_atomic; ++i) { + r = (atomic == FETCH_ADD) + ? 
ib_post_fetch_add(&ibdev, 0, 0, 0) + : ib_post_compare_swap(&ibdev, 0, 0, 0, 0); + if (!r) + goto err; + } + while (!Finished) { + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.rem_r.no_bytes += sizeof(uint64_t); + LStat.rem_r.no_msgs++; + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + r = (atomic == FETCH_ADD) + ? ib_post_fetch_add(&ibdev, 0, 0, 0) + : ib_post_compare_swap(&ibdev, 0, 0, 0, 0); + if (!r) + goto err; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(MSG_RATE); +} + + +/* + * Measure IB bandwidth (client side). + */ +static void +ib_client_bw(int transport) +{ + IBDEV ibdev; + + long sent = 0; + if (!ib_open(&ibdev, transport, NCQE, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_send(&ibdev, left_to_send(&sent, NCQE))) + goto err; + sent = NCQE; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (n > LStat.max_cqes) + LStat.max_cqes = n; + if (n < 0) + goto err; + if (Finished) + break; + for (i = 0; i < n; ++i) { + int id = wc[i].wr_id; + int status = wc[i].status; + if (id != WRID_SEND) + debug("bad WR ID %d", id); + else if (status != IBV_WC_SUCCESS) + if (!do_error(status, &LStat.s.no_errs)) + goto err; + } + if (Req.no_msgs) { + if (LStat.s.no_msgs + LStat.s.no_errs >= Req.no_msgs) + break; + n = left_to_send(&sent, n); + } + if (!ib_post_send(&ibdev, n)) + goto err; + sent += n; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Default action for the server is to post receive buffers and whenever it + * gets a completion entry, compute statistics and post more buffers. 
+ */ +static void +ib_server_def(int transport) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, 0, NCQE)) + return; + if (!ib_init(&ibdev)) + goto err; + if (!ib_post_recv(&ibdev, NCQE)) + goto err; + if (!synchronize()) + goto err; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + if (n < 0) + goto err; + for (i = 0; i < n; ++i) { + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + if (Req.access_recv) + touch_data(ibdev.buffer, Req.msg_size); + } else if (!do_error(status, &LStat.r.no_errs)) + goto err; + } + if (Req.no_msgs) + if (LStat.r.no_msgs + LStat.r.no_errs >= Req.no_msgs) + break; + if (!ib_post_recv(&ibdev, n)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Measure bi-directional IB bandwidth. + */ +static void +ib_bi_bw(int transport) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, NCQE, NCQE)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!ib_post_recv(&ibdev, NCQE)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_send(&ibdev, NCQE)) + goto err; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + int noSend = 0; + int noRecv = 0; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + if (n < 0) + goto err; + for (i = 0; i < n; ++i) { + int id = wc[i].wr_id; + int status = wc[i].status; + switch (id) { + case WRID_SEND: + if (status != IBV_WC_SUCCESS) + if (!do_error(status, &LStat.s.no_errs)) + goto err; + ++noSend; + break; + case WRID_RECV: + if (status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + if (Req.access_recv) + touch_data(ibdev.buffer, Req.msg_size); + } else if (!do_error(status, &LStat.r.no_errs)) + goto err; + ++noRecv; + break; + default: + 
debug("bad WR ID %d", id); + } + } + if (noRecv) + if (!ib_post_recv(&ibdev, noRecv)) + goto err; + if (noSend) + if (!ib_post_send(&ibdev, noSend)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Measure ping-pong latency (client and server side). + */ +static void +ib_pp_lat(int transport, IOMODE iomode) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, 1, 1)) + goto err; + if (!ib_init(&ibdev)) + goto err; + ib_pp_lat_loop(&ibdev, iomode); +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + if (is_client()) + show_results(LATENCY); +} + + +/* + * Loop sending packets back and forth to measure ping-pong latency. + */ +static void +ib_pp_lat_loop(IBDEV *ibdev, IOMODE iomode) +{ + int done = 1; + if (!ib_post_recv(ibdev, 1)) + return; + if (!synchronize()) + return; + if (is_client()) { + if (iomode == IO_SR) { + if (!ib_post_send(ibdev, 1)) + return; + } else { + if (!ib_post_rdma(ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1)) + return; + } + done = 0; + } + + while (!Finished) { + int i; + struct ibv_wc wc[2]; + int n = ib_poll(ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n < 0) + return; + for (i = 0; i < n; ++i) { + int id = wc[i].wr_id; + int status = wc[i].status; + switch (id) { + case WRID_SEND: + case WRID_RDMA: + if (status != IBV_WC_SUCCESS) + if (!do_error(status, &LStat.s.no_errs)) + return; + done |= 1; + continue; + case WRID_RECV: + if (status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + if (!ib_post_recv(ibdev, 1)) + return; + } else if (!do_error(status, &LStat.r.no_errs)) + return; + done |= 2; + continue; + default: + debug("bad WR ID %d", id); + continue; + } + break; + } + if (done == 3) { + if (iomode == IO_SR) { + if (!ib_post_send(ibdev, 1)) + return; + } else { + if (!ib_post_rdma(ibdev, IBV_WR_RDMA_WRITE_WITH_IMM, 1)) + return; + } + done = 0; + } + } + Successful = 1; +} + + +/* + * Loop sending packets back and forth 
using RDMA Write and polling to measure
 * latency.  Note that if we increase the number of entries of wc to be NCQE,
 * on the PS HCA, the latency is much longer.
 */
static void
ib_rdma_write_poll_lat(int transport)
{
    IBDEV ibdev;
    volatile char *head;
    volatile char *tail;
    int myTurn = is_client() ? 1 : 0;
    int locID = myTurn;
    int remID = !locID;

    if (!ib_open(&ibdev, transport, NCQE, 0))
        goto err;
    if (!ib_init(&ibdev) || !synchronize())
        goto err;

    /* The first and last bytes of the message carry the sender's ID. */
    head = &ibdev.buffer[0];
    tail = &ibdev.buffer[Req.msg_size-1];

    while (!Finished) {
        *head = locID;
        *tail = locID;

        if (myTurn) {
            struct ibv_wc wc[2];
            int count;
            int k;

            if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_WRITE, 1))
                goto err;
            if (Finished)
                break;
            count = ibv_poll_cq(ibdev.cq, cardof(wc), wc);
            if (count < 0) {
                syserror("CQ poll failed");
                goto err;
            }
            for (k = 0; k < count; ++k) {
                int id = wc[k].wr_id;
                int status = wc[k].status;

                if (id != WRID_RDMA) {
                    debug("bad WR ID %d", id);
                    continue;
                }
                if (status != IBV_WC_SUCCESS &&
                    !do_error(status, &LStat.s.no_errs))
                    goto err;
            }
        }

        /* Spin until both marker bytes hold the remote ID. */
        while (!Finished && !(*head == remID && *tail == remID))
            ;

        LStat.r.no_bytes += Req.msg_size;
        LStat.r.no_msgs++;
        myTurn = 1;
    }
    Successful = 1;
err:
    stop_timing();
    exchange_results();
    ib_close(&ibdev);
}


/*
 * Measure RDMA Read latency (client side).
+ */ +static void +ib_client_rdma_read_lat(int transport) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, 1, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_READ, 1)) + goto err; + while (!Finished) { + struct ibv_wc wc; + int n = ib_poll(&ibdev, &wc, 1); + if (n < 0) + goto err; + if (n == 0) + continue; + if (Finished) + break; + if (wc.wr_id != WRID_RDMA) { + debug("bad WR ID %d", (int)wc.wr_id); + continue; + } + if (wc.status == IBV_WC_SUCCESS) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + LStat.rem_s.no_bytes += Req.msg_size; + LStat.rem_s.no_msgs++; + } else if (!do_error(wc.status, &LStat.s.no_errs)) + goto err; + if (!ib_post_rdma(&ibdev, IBV_WR_RDMA_READ, 1)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); + show_results(LATENCY); +} + + +/* + * Measure RDMA bandwidth (client side). + */ +static void +ib_client_rdma_bw(int transport, OPCODE opcode) +{ + IBDEV ibdev; + + if (!ib_open(&ibdev, transport, NCQE, 0)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + if (!ib_post_rdma(&ibdev, opcode, NCQE)) + goto err; + while (!Finished) { + int i; + struct ibv_wc wc[NCQE]; + int n = ib_poll(&ibdev, wc, cardof(wc)); + if (Finished) + break; + if (n < 0) + goto err; + if (n > LStat.max_cqes) + LStat.max_cqes = n; + for (i = 0; i < n; ++i) { + int status = wc[i].status; + if (status == IBV_WC_SUCCESS) { + if (opcode == IBV_WR_RDMA_READ) { + LStat.r.no_bytes += Req.msg_size; + LStat.r.no_msgs++; + LStat.rem_s.no_bytes += Req.msg_size; + LStat.rem_s.no_msgs++; + } + } else if (!do_error(status, &LStat.s.no_errs)) + goto err; + } + if (!ib_post_rdma(&ibdev, opcode, n)) + goto err; + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Server just waits and lets driver take care of any requests. 
+ */ +static void +ib_server_nop(int transport) +{ + IBDEV ibdev; + + /* workaround: Size of RQ should be 0; bug in Mellanox driver */ + if (!ib_open(&ibdev, transport, 0, 1)) + goto err; + if (!ib_init(&ibdev)) + goto err; + if (!synchronize()) + goto err; + while (!Finished) + pause(); + Successful = 1; +err: + stop_timing(); + exchange_results(); + ib_close(&ibdev); +} + + +/* + * Set default IB parameters for tests that use messages. + */ +static void +ib_params_msgs(long msgSize, int use_poll_mode) +{ + setp_u32(0, L_MSG_SIZE, msgSize); + setp_u32(0, R_MSG_SIZE, msgSize); + setp_u32(0, L_MTU_SIZE, MTU_SIZE); + setp_u32(0, R_MTU_SIZE, MTU_SIZE); + par_use(L_ID); + par_use(R_ID); + par_use(L_MTU_SIZE); + par_use(R_MTU_SIZE); + par_use(L_RATE); + par_use(R_RATE); + if (use_poll_mode) { + par_use(L_POLL_MODE); + par_use(R_POLL_MODE); + } + opt_check(); +} + + +/* + * Set default IB parameters for tests that use atomics. + */ +static void +ib_params_atomics(void) +{ + setp_u32(0, L_MTU_SIZE, MTU_SIZE); + setp_u32(0, R_MTU_SIZE, MTU_SIZE); + par_use(L_ID); + par_use(R_ID); + par_use(L_POLL_MODE); + par_use(R_POLL_MODE); + par_use(L_RATE); + par_use(R_RATE); + par_use(L_RD_ATOMIC); + par_use(R_RD_ATOMIC); + opt_check(); + + setv_u32(L_MSG_SIZE, 0); +} + + +/* + * IB initialization. 
+ */ +static int +ib_init(IBDEV *ibdev) +{ + IBCON ibcon; + + if (is_client()) { + client_send_request(); + enc_init(&ibcon); + enc_ibcon(&ibdev->lcon); + if (!send_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + if (!recv_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + dec_init(&ibcon); + dec_ibcon(&ibdev->rcon); + } else { + if (!recv_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + dec_init(&ibcon); + dec_ibcon(&ibdev->rcon); + enc_init(&ibcon); + enc_ibcon(&ibdev->lcon); + if (!send_mesg(&ibcon, sizeof(ibcon), "IB connection")) + return 0; + } + if (!ib_prepare(ibdev)) + return 0; + ib_debug_info(ibdev); + return 1; +} + + +/* + * Show debugging information. + */ +static void +ib_debug_info(IBDEV *ibdev) +{ + debug("L: lid=%04x qpn=%06x psn=%06x rkey=%08x vaddr=%010x", + ibdev->lcon.lid, ibdev->lcon.qpn, ibdev->lcon.psn, + ibdev->lcon.rkey, ibdev->lcon.vaddr); + debug("R: lid=%04x qpn=%06x psn=%06x rkey=%08x vaddr=%010x", + ibdev->rcon.lid, ibdev->rcon.qpn, ibdev->rcon.psn, + ibdev->rcon.rkey, ibdev->rcon.vaddr); +} + + +/* + * Open a RDMA device. 
+ */ +static int +ib_open(IBDEV *ibdev, int trans, int maxSendWR, int maxRecvWR) +{ + /* Clear structure */ + memset(ibdev, 0, sizeof(*ibdev)); + + /* Check and set MTU */ + { + int mtu = Req.mtu_size; + if (mtu == 256) + ibdev->mtu = IBV_MTU_256; + else if (mtu == 512) + ibdev->mtu = IBV_MTU_512; + else if (mtu == 1024) + ibdev->mtu = IBV_MTU_1024; + else if (mtu == 2048) + ibdev->mtu = IBV_MTU_2048; + else if (mtu == 4096) + ibdev->mtu = IBV_MTU_4096; + else + error_die("Bad MTU: %d; must be 256/512/1K/2K/4K", mtu); + } + + /* Set transport type */ + ibdev->trans = trans; + + /* Set port */ + { + int port = 1; + char *p = index(Req.id, ':'); + if (p) { + *p++ = '\0'; + port = atoi(p); + if (port < 1) + error_die("Bad IB port: %d; must be at least 1", port); + } + ibdev->port = port; + } + + /* Set rate */ + { + RATES *q = Rates; + RATES *r = q + cardof(Rates); + + for (;; ++q) { + if (q >= r) { + syserror("Bad rate: %s", Req.rate); + goto err; + } + if (streq(Req.rate, q->name)) { + ibdev->rate = q->rate; + break; + } + } + } + + /* Determine device and open */ + { + struct ibv_device *device; + char *name = Req.id[0] ? 
Req.id : 0; + + ibdev->devlist = ibv_get_device_list(0); + if (!ibdev->devlist) { + syserror("Failed to find any IB devices"); + goto err; + } + if (!name) + device = *ibdev->devlist; + else { + struct ibv_device **d = ibdev->devlist; + while ((device = *d++)) + if (streq(ibv_get_device_name(device), name)) + break; + } + if (!device) { + syserror("Failed to find IB device"); + goto err; + } + ibdev->context = ibv_open_device(device); + if (!ibdev->context) { + syserror("Failed to open device %s", ibv_get_device_name(device)); + goto err; + } + } + + /* Allocate completion channel */ + ibdev->channel = ibv_create_comp_channel(ibdev->context); + if (!ibdev->channel) { + syserror("Failed to create completion channel"); + goto err; + } + + /* Allocate protection domain */ + ibdev->pd = ibv_alloc_pd(ibdev->context); + if (!ibdev->pd) { + syserror("Failed to allocate protection domain"); + goto err; + } + + /* Allocate message buffer and memory region */ + { + int bufSize = Req.msg_size; + int pageSize = sysconf(_SC_PAGESIZE); + if (trans == IBV_QPT_UD) + bufSize += GRH_SIZE; + if (bufSize == 0) + bufSize = 1; + if (posix_memalign((void **)&ibdev->buffer, pageSize, bufSize) != 0) { + syserror("Failed to allocate memory"); + goto err; + } + memset(ibdev->buffer, 0, bufSize); + int flags = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC; + ibdev->mr = ibv_reg_mr(ibdev->pd, ibdev->buffer, bufSize, flags); + if (!ibdev->mr) { + syserror("Failed to allocate memory region"); + goto err; + } + } + + /* Create completion queue */ + ibdev->cq = ibv_create_cq(ibdev->context, + maxSendWR+maxRecvWR, 0, ibdev->channel, 0); + if (!ibdev->cq) { + syserror("Failed to create completion queue"); + goto err; + } + + /* Create queue pair */ + { + struct ibv_qp_init_attr attr ={ + .send_cq = ibdev->cq, + .recv_cq = ibdev->cq, + .cap ={ + .max_send_wr = maxSendWR, + .max_recv_wr = maxRecvWR, + .max_send_sge = 1, + .max_recv_sge = 1, 
+ .max_inline_data = 0, + }, + .qp_type = ibdev->trans, + }; + ibdev->qp = ibv_create_qp(ibdev->pd, &attr); + if (!ibdev->qp) { + syserror("Failed to create QP"); + goto err; + } + } + + /* Modify queue pair to INIT state */ + { + struct ibv_qp_attr attr ={ + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = ibdev->port + }; + int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT; + + if (ibdev->trans == IBV_QPT_UD) { + flags |= IBV_QP_QKEY; + attr.qkey = QKEY; + } else if (ibdev->trans == IBV_QPT_RC) { + flags |= IBV_QP_ACCESS_FLAGS; + attr.qp_access_flags = + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC; + } else if (ibdev->trans == IBV_QPT_UC) { + flags |= IBV_QP_ACCESS_FLAGS; + attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE; + } + if (ibv_modify_qp(ibdev->qp, &attr, flags) != SUCCESS0) { + syserror("Failed to modify QP to INIT state"); + goto err; + } + } + + /* Get QP attributes */ + { + struct ibv_qp_attr qp_attr; + struct ibv_qp_init_attr qp_init_attr; + + if (ibv_query_qp(ibdev->qp, &qp_attr, 0, &qp_init_attr) != SUCCESS0) { + syserror("Query QP failed"); + goto err; + } + ibdev->maxinline = qp_attr.cap.max_inline_data; + } + + /* Get device properties */ + { + struct ibv_device_attr dev_attr; + + if (ibv_query_device(ibdev->context, &dev_attr) != SUCCESS0) { + syserror("Query device failed"); + goto err; + } + if (Req.rd_atomic == 0) + Req.rd_atomic = dev_attr.max_qp_rd_atom; + else if (Req.rd_atomic > dev_attr.max_qp_rd_atom) + error("Device only supports %d (< %d) RDMA reads or atomic ops", + dev_attr.max_qp_rd_atom, Req.rd_atomic); + } + + /* Set up local context */ + { + struct ibv_port_attr port_attr; + + int stat = ibv_query_port(ibdev->context, ibdev->port, &port_attr); + if (stat != SUCCESS0) { + syserror("Query port failed"); + goto err; + } + srand48(getpid()*time(0)); + + ibdev->lcon.lid = port_attr.lid; + ibdev->lcon.qpn = ibdev->qp->qp_num; + ibdev->lcon.psn = lrand48() & 0xffffff; + 
ibdev->lcon.rkey = 0; + ibdev->lcon.vaddr = 0; + } + + /* Allocate memory region */ + if (!ib_mralloc(ibdev, Req.msg_size)) + goto err; + return 1; + +err: + ib_close(ibdev); + return 0; +} + + +/* + * Allocate a memory region. + */ +static int +ib_mralloc(IBDEV *ibdev, int size) +{ + int pageSize; + + if (size == 0) + return 1; + if (ibdev->trans == IBV_QPT_UD) + size += GRH_SIZE; + pageSize = sysconf(_SC_PAGESIZE); + if (posix_memalign((void **)&ibdev->buffer, pageSize, size) != 0) { + syserror("Failed to allocate memory"); + goto err; + } + memset(ibdev->buffer, 0, size); + int flags = IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_ATOMIC; + ibdev->mr = ibv_reg_mr(ibdev->pd, ibdev->buffer, size, flags); + if (!ibdev->mr) { + syserror("Failed to allocate memory region"); + goto err; + } + + ibdev->lcon.rkey = ibdev->mr->rkey; + ibdev->lcon.vaddr = (unsigned long)ibdev->buffer; + return 1; + +err: + if (ibdev->buffer) { + free(ibdev->buffer); + ibdev->buffer = 0; + } + return 0; +} + + +/* + * Prepare the IB device for receiving and sending. 
+ */ +static int +ib_prepare(IBDEV *ibdev) +{ + int flags; + struct ibv_qp_attr rtr_attr ={ + .qp_state = IBV_QPS_RTR, + .path_mtu = ibdev->mtu, + .dest_qp_num = ibdev->rcon.qpn, + .rq_psn = ibdev->rcon.psn, + .min_rnr_timer = RNR_TIMER, + .max_dest_rd_atomic = Req.rd_atomic, + .ah_attr = { + .dlid = ibdev->rcon.lid, + .port_num = ibdev->port, + .static_rate = ibdev->rate + } + }; + struct ibv_qp_attr rts_attr ={ + .qp_state = IBV_QPS_RTS, + .timeout = TIMEOUT, + .retry_cnt = RETRY_CNT, + .rnr_retry = RNR_RETRY, + .sq_psn = ibdev->lcon.psn, + .max_rd_atomic = Req.rd_atomic + }; + struct ibv_ah_attr ah_attr ={ + .dlid = ibdev->rcon.lid, + .port_num = ibdev->port, + .static_rate = ibdev->rate + }; + + if (ibdev->trans == IBV_QPT_UD) { + /* Modify queue pair to RTR */ + flags = IBV_QP_STATE; + if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTR"); + + /* Modify queue pair to RTS */ + flags = IBV_QP_STATE | IBV_QP_SQ_PSN; + if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTS"); + + /* Create address handle */ + ibdev->ah = ibv_create_ah(ibdev->pd, &ah_attr); + if (!ibdev->ah) + return syserror("Failed to create address handle"); + } else if (ibdev->trans == IBV_QPT_RC) { + /* Modify queue pair to RTR */ + flags = IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER; + if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTR"); + + /* Modify queue pair to RTS */ + flags = IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC; + if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTS"); + } else if (ibdev->trans == IBV_QPT_UC) { + /* Modify queue pair to RTR */ + flags = IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU 
| + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN; + if (ibv_modify_qp(ibdev->qp, &rtr_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTR"); + + /* Modify queue pair to RTS */ + flags = IBV_QP_STATE | + IBV_QP_SQ_PSN; + if (ibv_modify_qp(ibdev->qp, &rts_attr, flags) != SUCCESS0) + return syserror("Failed to modify QP to RTS"); + } + if (!Req.poll_mode) { + if (ibv_req_notify_cq(ibdev->cq, 0) != SUCCESS0) + return syserror("Failed to request CQ notification"); + } + return 1; +} + + +/* + * Close a RDMA device. We ust destroy the CQ before the QP otherwise the + * ibv_destroy_qp call might hang. + */ +static void +ib_close(IBDEV *ibdev) +{ + if (ibdev->ah) + ibv_destroy_ah(ibdev->ah); + if (ibdev->cq) + ibv_destroy_cq(ibdev->cq); + if (ibdev->qp) + ibv_destroy_qp(ibdev->qp); + if (ibdev->mr) + ibv_dereg_mr(ibdev->mr); + if (ibdev->pd) + ibv_dealloc_pd(ibdev->pd); + if (ibdev->channel) + ibv_destroy_comp_channel(ibdev->channel); + if (ibdev->context) + ibv_close_device(ibdev->context); + if (ibdev->buffer) + free(ibdev->buffer); + if (ibdev->devlist) + free(ibdev->devlist); + memset(ibdev, 0, sizeof(*ibdev)); +} + + +/* + * Post a compare and swap request. 
+ */ +static int +ib_post_compare_swap(IBDEV *ibdev, + int wrid, int offset, uint64_t compare, uint64_t swap) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t)ibdev->buffer + offset, + .length = sizeof(uint64_t), + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = wrid, + .sg_list = &sge, + .num_sge = 1, + .opcode = IBV_WR_ATOMIC_CMP_AND_SWP, + .send_flags = IBV_SEND_SIGNALED, + .wr = { + .atomic = { + .remote_addr = ibdev->rcon.vaddr, + .rkey = ibdev->rcon.rkey, + .compare_add = compare, + .swap = swap + } + } + }; + struct ibv_send_wr *badWR; + + errno = 0; + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post compare and swap"); + } + + LStat.s.no_bytes += sizeof(uint64_t); + LStat.s.no_msgs++; + return 1; +} + + +/* + * Post a fetch and add request. + */ +static int +ib_post_fetch_add(IBDEV *ibdev, int wrid, int offset, uint64_t add) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer + offset, + .length = sizeof(uint64_t), + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = wrid, + .sg_list = &sge, + .num_sge = 1, + .opcode = IBV_WR_ATOMIC_FETCH_AND_ADD, + .send_flags = IBV_SEND_SIGNALED, + .wr = { + .atomic = { + .remote_addr = ibdev->rcon.vaddr, + .rkey = ibdev->rcon.rkey, + .compare_add = add + } + } + }; + struct ibv_send_wr *badWR; + + errno = 0; + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post fetch and add"); + } + + LStat.s.no_bytes += sizeof(uint64_t); + LStat.s.no_msgs++; + return 1; +} + + +/* + * Post n sends. 
+ */ +static int +ib_post_send(IBDEV *ibdev, int n) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer, + .length = Req.msg_size, + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = WRID_SEND, + .sg_list = &sge, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, + }; + struct ibv_send_wr *badWR; + + if (ibdev->trans == IBV_QPT_UD) { + wr.wr.ud.ah = ibdev->ah; + wr.wr.ud.remote_qpn = ibdev->rcon.qpn; + wr.wr.ud.remote_qkey = QKEY; + } + if (Req.msg_size <= ibdev->maxinline) + wr.send_flags |= IBV_SEND_INLINE; + errno = 0; + while (n-- > 0) { + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post send"); + } + LStat.s.no_bytes += Req.msg_size; + LStat.s.no_msgs++; + } + + return 1; +} + + +/* + * Post n receives. + */ +static int +ib_post_recv(IBDEV *ibdev, int n) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer, + .length = Req.msg_size, + .lkey = ibdev->mr->lkey + }; + struct ibv_recv_wr wr ={ + .wr_id = WRID_RECV, + .sg_list = &sge, + .num_sge = 1, + }; + struct ibv_recv_wr *badWR; + + if (ibdev->trans == IBV_QPT_UD) + sge.length += GRH_SIZE; + + errno = 0; + while (n-- > 0) { + if (ibv_post_recv(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post receive"); + } + } + return 1; +} + + +/* + * Post n RDMA requests. 
+ */ +static int +ib_post_rdma(IBDEV *ibdev, OPCODE opcode, int n) +{ + struct ibv_sge sge ={ + .addr = (uintptr_t) ibdev->buffer, + .length = Req.msg_size, + .lkey = ibdev->mr->lkey + }; + struct ibv_send_wr wr ={ + .wr_id = WRID_RDMA, + .sg_list = &sge, + .num_sge = 1, + .opcode = opcode, + .send_flags = IBV_SEND_SIGNALED, + .wr = { + .rdma = { + .remote_addr = ibdev->rcon.vaddr, + .rkey = ibdev->rcon.rkey + } + } + }; + struct ibv_send_wr *badWR; + + if (opcode != IBV_WR_RDMA_READ && Req.msg_size <= ibdev->maxinline) + wr.send_flags |= IBV_SEND_INLINE; + errno = 0; + while (n--) { + if (ibv_post_send(ibdev->qp, &wr, &badWR) != SUCCESS0) { + if (Finished && errno == EINTR) + return 1; + return syserror("Failed to post %s", opcode_name(wr.opcode)); + } + if (opcode != IBV_WR_RDMA_READ) { + LStat.s.no_bytes += Req.msg_size; + LStat.s.no_msgs++; + } + } + return 1; +} + + +/* + * Poll the completion queue. + */ +static int +ib_poll(IBDEV *ibdev, struct ibv_wc *wc, int nwc) +{ + int n; + char *msg; + + if (!Req.poll_mode && !Finished) { + void *ectx; + struct ibv_cq *ecq; + + if (ibv_get_cq_event(ibdev->channel, &ecq, &ectx) != SUCCESS0) + {msg = "failed to get CQ event"; goto err;} + if (ecq != ibdev->cq) + {msg = "CQ event for unknown CQ"; goto err;} + if (ibv_req_notify_cq(ibdev->cq, 0) != SUCCESS0) + {msg = "failed to request CQ notification"; goto err;} + } + n = ibv_poll_cq(ibdev->cq, nwc, wc); + if (n < 0) + {msg = "CQ poll failed"; goto err;} + return n; + +err: + if (Finished && errno == EINTR) + return 0; + syserror(msg); + return -1; +} + + +/* + * Encode a IBCON structure into a data stream. + */ +static void +enc_ibcon(IBCON *host) +{ + enc_int(host->lid, sizeof(host->lid)); + enc_int(host->qpn, sizeof(host->qpn)); + enc_int(host->psn, sizeof(host->psn)); + enc_int(host->rkey, sizeof(host->rkey)); + enc_int(host->vaddr, sizeof(host->vaddr)); +} + + +/* + * Decode a IBCON structure from a data stream. 
+ */ +static void +dec_ibcon(IBCON *host) +{ + host->lid = dec_int(sizeof(host->lid)); + host->qpn = dec_int(sizeof(host->qpn)); + host->psn = dec_int(sizeof(host->psn)); + host->rkey = dec_int(sizeof(host->rkey)); + host->vaddr = dec_int(sizeof(host->vaddr)); +} + + +/* + * Handle a CQ error and return true if it is recoverable. + */ +static int +do_error(int status, uint64_t *errors) +{ + ++*errors; + cq_error(status); + return 0; +} + + +/* + * Print out a CQ error given a status. + */ +static void +cq_error(int status) +{ + int i; + + for (i = 0; i < cardof(CQErrors); ++i) { + if (CQErrors[i].value == status) { + error("%s failed: %s", TestName, CQErrors[i].name); + return; + } + } + error("%s failed: CQ error %d", TestName, status); +} + + +/* + * Return the name of an opcode. + */ +static char * +opcode_name(int opcode) +{ + int i; + + for (i = 0; i < cardof(Opcodes); ++i) + if (Opcodes[i].value == opcode) + return Opcodes[i].name; + return "unknown operation"; +} diff --git a/ip.c b/ip.c new file mode 100644 index 0000000..8b048de --- /dev/null +++ b/ip.c @@ -0,0 +1,837 @@ +/* + * qperf - handle socket tests. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include "qperf.h" + + +/* + * Parameters. + */ +#define AF_INET_SDP 27 /* Family for SDP */ +#define AF_INET_RDS 30 /* Family for RDS */ + + +/* + * Function prototypes. + */ +static void datagram_client_bw(int domain); +static void datagram_client_init(int *fd, int domain, + struct sockaddr_in *addr); +static void datagram_client_lat(int domain); +static void datagram_server_bw(int domain); +static int datagram_server_init(int *fd, int domain); +static void datagram_server_lat(int domain); +static uint32_t decode_port(uint32_t *p); +static void encode_port(uint32_t *p, uint32_t port); +static void ip_parameters(long msgSize); +static int recv_full(int fd, void *ptr, int len); +static int send_full(int fd, void *ptr, int len); +static int set_socket_buffer_size(int fd); +static void socket_client_bw(int domain); +static void socket_client_init(int *fd, int domain); +static void socket_client_lat(int domain); +static void socket_server_bw(int domain); +static int socket_server_init(int *fd, int domain); +static void socket_server_lat(int domain); + + +/* + * Measure RDS bandwidth (client side). 
+ */ +void +run_client_rds_bw(void) +{ + ip_parameters(8*1024); + datagram_client_bw(AF_INET_RDS); +} + + +/* + * Measure RDS bandwidth (server side). + */ +void +run_server_rds_bw(void) +{ + datagram_server_bw(AF_INET_RDS); +} + + +/* + * Measure RDS latency (client side). + */ +void +run_client_rds_lat(void) +{ + ip_parameters(1); + datagram_client_lat(AF_INET_RDS); +} + + +/* + * Measure RDS latency (server side). + */ +void +run_server_rds_lat(void) +{ + datagram_server_lat(AF_INET_RDS); +} + + +/* + * Measure UDP bandwidth (client side). + */ +void +run_client_udp_bw(void) +{ + ip_parameters(32*1024); + datagram_client_bw(AF_INET); +} + + +/* + * Measure UDP bandwidth (server side). + */ +void +run_server_udp_bw(void) +{ + datagram_server_bw(AF_INET); +} + + +/* + * Measure UDP latency (client side). + */ +void +run_client_udp_lat(void) +{ + ip_parameters(1); + datagram_client_lat(AF_INET); +} + + +/* + * Measure UDP latency (server side). + */ +void +run_server_udp_lat(void) +{ + datagram_server_lat(AF_INET); +} + + +/* + * Measure SDP bandwidth (client side). + */ +void +run_client_sdp_bw(void) +{ + ip_parameters(64*1024); + socket_client_bw(AF_INET_SDP); +} + + +/* + * Measure SDP bandwidth (server side). + */ +void +run_server_sdp_bw(void) +{ + socket_server_bw(AF_INET_SDP); +} + + +/* + * Measure SDP latency (client side). + */ +void +run_client_sdp_lat(void) +{ + ip_parameters(1); + socket_client_lat(AF_INET_SDP); +} + + +/* + * Measure SDP latency (server side). + */ +void +run_server_sdp_lat(void) +{ + socket_server_lat(AF_INET_SDP); +} + + +/* + * Measure TCP bandwidth (client side). + */ +void +run_client_tcp_bw(void) +{ + ip_parameters(64*1024); + socket_client_bw(AF_INET); +} + + +/* + * Measure TCP bandwidth (server side). + */ +void +run_server_tcp_bw(void) +{ + socket_server_bw(AF_INET); +} + + +/* + * Measure TCP latency (client side). 
+ */ +void +run_client_tcp_lat(void) +{ + ip_parameters(1); + socket_client_lat(AF_INET); +} + + +/* + * Measure TCP latency (server side). + */ +void +run_server_tcp_lat(void) +{ + socket_server_lat(AF_INET); +} + + +/* + * Measure socket bandwidth (client side). + */ +static void +socket_client_bw(int domain) +{ + char *buf; + int sockFD; + + socket_client_init(&sockFD, domain); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = send_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(BANDWIDTH); +} + + +/* + * Measure socket bandwidth (server side). + */ +static void +socket_server_bw(int domain) +{ + int sockFD; + char *buf = 0; + + if (!socket_server_init(&sockFD, domain)) + return; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + int n = recv_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +/* + * Measure socket latency (client side). 
+ */ +static void +socket_client_lat(int domain) +{ + char *buf; + int sockFD; + + socket_client_init(&sockFD, domain); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = send_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + + n = recv_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(LATENCY); +} + + +/* + * Measure socket latency (server side). + */ +static void +socket_server_lat(int domain) +{ + int sockFD; + char *buf = 0; + + if (!socket_server_init(&sockFD, domain)) + return; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + int n = recv_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + + n = send_full(sockFD, buf, Req.msg_size); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +/* + * Socket client initialization. 
+ */ +static void +socket_client_init(int *fd, int domain) +{ + uint32_t port; + struct hostent *host; + struct sockaddr_in clientAddr; + struct sockaddr_in serverAddr; + socklen_t clientLen = sizeof(clientAddr); + + client_send_request(); + *fd = socket(domain, SOCK_STREAM, 0); + if (*fd < 0) + syserror_die("socket failed"); + clientAddr.sin_family = AF_INET; + clientAddr.sin_addr.s_addr = htonl(INADDR_ANY); + clientAddr.sin_port = htons(0); + if (bind(*fd, (struct sockaddr *)&clientAddr, clientLen) < 0) + syserror_die("bind failed"); + if (getsockname(*fd, (struct sockaddr *)&clientAddr, &clientLen) < 0) + syserror_die("getsockname failed"); + if (!set_socket_buffer_size(*fd)) + die(); + + host = gethostbyname(ServerName); + if (!host) + error_die("cannot find machine %s", ServerName); + serverAddr.sin_family = AF_INET; + if (host->h_length > sizeof(serverAddr.sin_addr)) + error_die("address too large to handle"); + memcpy(&serverAddr.sin_addr.s_addr, host->h_addr, host->h_length); + if (!recv_mesg(&port, sizeof(port), "port")) + die(); + port = decode_port(&port); + debug("sending from %s port %d to %d", + domain == AF_INET_SDP ? "SDP" : "TCP", + ntohs(clientAddr.sin_port), port); + serverAddr.sin_port = htons(port); + if (connect(*fd, &serverAddr, sizeof(serverAddr)) < 0) + syserror_die("connect failed"); +} + + +/* + * Socket server initialization. 
+ */ +static int +socket_server_init(int *fd, int domain) +{ + uint32_t port; + int listenFD; + struct sockaddr_in addr; + socklen_t len = sizeof(addr); + int stat = 0; + int one = 1; + + listenFD = socket(domain, SOCK_STREAM, 0); + if (listenFD < 0) + return syserror("socket failed"); + if (setsockopt(listenFD, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) + return syserror("failed to reuse address on socket"); + memset(&addr, 0, len); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(Req.port); + if (bind(listenFD, (struct sockaddr *)&addr, len) < 0) { + syserror("bind failed"); + goto err; + } + if (getsockname(listenFD, (struct sockaddr *)&addr, &len) < 0) { + syserror("getsockname failed"); + goto err; + } + port = ntohs(addr.sin_port); + if (listen(listenFD, 1) < 0) { + syserror("listen failed"); + goto err; + } + encode_port(&port, port); + if (!send_mesg(&port, sizeof(port), "port")) + goto err; + len = sizeof(addr); + *fd = accept(listenFD, (struct sockaddr *)&addr, &len); + if (*fd < 0) { + syserror("accept failed"); + goto err; + } + debug("accepted connection"); + if (!set_socket_buffer_size(*fd)) { + close(*fd); + goto err; + } + stat = 1; +err: + close(listenFD); + return stat; +} + + +/* + * Set both the send and receive socket buffer sizes. 
+ */ +static int +set_socket_buffer_size(int fd) +{ + int size = Req.sock_buf_size; + + if (!size) + return 1; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)) < 0) + return syserror("failed to set send buffer size on socket"); + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)) < 0) + return syserror("failed to set receive buffer size on socket"); + return 1; +} + + +static void +datagram_client_bw(int domain) +{ + char *buf; + int sockFD; + struct sockaddr_in serverAddr; + + datagram_client_init(&sockFD, domain, &serverAddr); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = sendto(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&serverAddr, sizeof(serverAddr)); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(BANDWIDTH_SR); +} + + +static void +datagram_server_bw(int domain) +{ + int sockFD; + char *buf = 0; + + if (!datagram_server_init(&sockFD, domain)) + return; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + int n = recv(sockFD, buf, Req.msg_size, 0); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +static void +datagram_client_lat(int domain) +{ + char *buf; + int sockFD; + struct sockaddr_in addr; + + datagram_client_init(&sockFD, domain, &addr); + buf = qmalloc(Req.msg_size); + if (!synchronize()) + goto err; + while (!Finished) { + int n = sendto(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&addr, sizeof(addr)); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + + n = 
recv(sockFD, buf, Req.msg_size, 0); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); + show_results(LATENCY); +} + + +/* + * Set default IP parameters and ensure that any that are set are being used. + */ +static void +ip_parameters(long msgSize) +{ + setp_u32(0, L_MSG_SIZE, msgSize); + setp_u32(0, R_MSG_SIZE, msgSize); + par_use(L_PORT); + par_use(R_PORT); + par_use(L_SOCK_BUF_SIZE); + par_use(R_SOCK_BUF_SIZE); + opt_check(); +} + + +static void +datagram_server_lat(int domain) +{ + int sockFD; + char *buf = 0; + + if (!datagram_server_init(&sockFD, domain)) + goto err; + if (!synchronize()) + goto err; + buf = qmalloc(Req.msg_size); + while (!Finished) { + struct sockaddr_in clientAddr; + socklen_t clientLen = sizeof(clientAddr); + int n = recvfrom(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&clientAddr, &clientLen); + if (Finished) + break; + if (n < 0) { + LStat.r.no_errs++; + continue; + } else { + LStat.r.no_bytes += n; + LStat.r.no_msgs++; + } + + n = sendto(sockFD, buf, Req.msg_size, 0, + (struct sockaddr *)&clientAddr, clientLen); + if (Finished) + break; + if (n < 0) { + LStat.s.no_errs++; + continue; + } else { + LStat.s.no_bytes += n; + LStat.s.no_msgs++; + } + } + Successful = 1; +err: + stop_timing(); + exchange_results(); + free(buf); + close(sockFD); +} + + +/* + * Datagram client initialization. 
+ */ +static void +datagram_client_init(int *fd, int domain, struct sockaddr_in *serverAddr) +{ + uint32_t port; + struct hostent *host; + struct sockaddr_in clientAddr; + socklen_t clientLen = sizeof(clientAddr); + + client_send_request(); + *fd = socket(domain, SOCK_DGRAM, 0); + if (*fd < 0) + syserror_die("socket failed"); + clientAddr.sin_family = AF_INET; + clientAddr.sin_addr.s_addr = htonl(INADDR_ANY); + clientAddr.sin_port = htons(0); + if (bind(*fd, (struct sockaddr *)&clientAddr, clientLen) < 0) + syserror_die("bind failed"); + if (getsockname(*fd, (struct sockaddr *)&clientAddr, &clientLen) < 0) + syserror_die("getsockname failed"); + if (!set_socket_buffer_size(*fd)) + die(); + + host = gethostbyname(ServerName); + if (!host) + error_die("cannot find machine %s", ServerName); + serverAddr->sin_family = AF_INET; + if (host->h_length > sizeof(serverAddr->sin_addr)) + error_die("address too large to handle"); + memcpy(&serverAddr->sin_addr.s_addr, host->h_addr, host->h_length); + if (!recv_mesg(&port, sizeof(port), "port")) + die(); + port = decode_port(&port); + debug("sending from %s port %d to %d", + domain == AF_INET ? "UDP" : "RDS", ntohs(clientAddr.sin_port), port); + serverAddr->sin_port = htons(port); +} + + +/* + * Datagram server initialization. 
+ */ +static int +datagram_server_init(int *fd, int domain) +{ + uint32_t port; + struct sockaddr_in addr; + socklen_t len = sizeof(addr); + + *fd = socket(domain, SOCK_DGRAM, 0); + if (*fd < 0) + return syserror("socket failed"); + memset(&addr, 0, len); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(Req.port); + if (bind(*fd, (struct sockaddr *)&addr, len) < 0) { + syserror("bind failed"); + goto err; + } + if (getsockname(*fd, (struct sockaddr *)&addr, &len) < 0) { + syserror("getsockname failed"); + goto err; + } + if (!set_socket_buffer_size(*fd)) + goto err; + encode_port(&port, ntohs(addr.sin_port)); + if (!send_mesg(&port, sizeof(port), "port")) + goto err; + return 1; + +err: + close(*fd); + return 0; +} + + +/* + * Send a complete message to a socket. A zero byte write indicates an end of + * file which suggests that we are finished. + */ +static int +send_full(int fd, void *ptr, int len) +{ + int n = len; + while (!Finished && n) { + int i = write(fd, ptr, n); + if (i < 0) + return i; + ptr += i; + n -= i; + if (i == 0) + set_finished(); + } + return len-n; +} + + +/* + * Receive a complete message from a socket. A zero byte read indicates an end + * of file which suggests that we are finished. + */ +static int +recv_full(int fd, void *ptr, int len) +{ + int n = len; + while (!Finished && n) { + int i = read(fd, ptr, n); + if (i < 0) + return i; + ptr += i; + n -= i; + if (i == 0) + set_finished(); + } + return len-n; +} + + +/* + * Encode a port which is stored as a 32 bit unsigned. + */ +static void +encode_port(uint32_t *p, uint32_t port) +{ + enc_init(p); + enc_int(port, sizeof(port)); +} + + +/* + * Decode a port which is stored as a 32 bit unsigned. 
+ */ +static uint32_t +decode_port(uint32_t *p) +{ + dec_init(p); + return dec_int(sizeof(uint32_t)); +} diff --git a/mkhelp b/mkhelp new file mode 100755 index 0000000..2e7c8dc --- /dev/null +++ b/mkhelp @@ -0,0 +1,113 @@ +#!/usr/bin/env perl +# +use strict; +use warnings; +use diagnostics; + +my $help_txt = "help.txt"; +my $help_c = "help.c"; +my $top = " +/* + * This was generated from $help_txt. Do not modify directly. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +char *Usage[] ={ +"; +my $end = " + 0, +}; +"; + +sub panic { + print STDERR @_, "\n"; + exit 1; +} + +sub main() { + my %defs; + $defs{$_} = 1 for (@ARGV); + my $iFile; + open($iFile, "<", $help_txt) or + panic("cannot find $help_txt"); + my $str = ""; + my $keep = 1; + while (<$iFile>) { + chomp; + s/\s+$//; + if (/^ /) { + if ($keep) { + s///; + s/(["\\])/\\$1/g; + s/$/\\n/; + if (/^(.{68}(?>[^\\]?))(..*)/) { + $str .= " "x8 . "\"$1\"\n"; + $str .= " "x12 . "\"$2\"\n"; + } else { + $str .= " "x8 . "\"$_\"\n"; + } + } + } else { + my @args = split; + my $arg0 = lc(shift @args); + $keep = 1; + for (@args) { + if (/^\+(.*)/) { + $keep = 0 unless ($defs{$1}); + } elsif (/^-(.*)/) { + $keep = 0 if ($defs{$1}); + } + } + if ($keep) { + if ($str) { + chop $str; + $str .= ",\n"; + } + $str .= " "x4 . "\"$arg0\",\n"; + } + } + } + close $iFile; + if ($str) { + chop $str; + $str .= ",\n"; + } + $top =~ s/^\n//; + $end =~ s/^\n//; + my $oFile; + open($oFile, ">", $help_c) or + panic("cannot create $help_c"); + print $oFile $top, $str, $end; + close $oFile; +} + +main(); diff --git a/qperf.c b/qperf.c new file mode 100644 index 0000000..43f4873 --- /dev/null +++ b/qperf.c @@ -0,0 +1,2913 @@ +/* + * qperf - main. + * Run performance tests over TCP/IP and RDMA. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "qperf.h" + + +/* + * Configurable parameters. If your change makes this version of qperf + * incompatible with previous versions (usually a change to the Req structure), + * increment VER_MIN and set VER_INC to 0. Otherwise, just increment VER_INC. + * VER_MAJ is reserved for major changes. 
+ */ +#define VER_MAJ 0 /* Major version */ +#define VER_MIN 2 /* Minor version */ +#define VER_INC 0 /* Incremental version */ +#define LISTENQ 5 /* Size of listen queue */ +#define BUFSIZE 1024 /* Size of buffers */ +#define SYNCMESG "SyN" /* Synchronize message */ +#define SYNCSIZE sizeof(SYNCMESG) /* Size of synchronize message */ + + +/* + * For convenience. + */ +#define with(c) |(c<<8) + + +/* + * Option list. + */ +typedef struct OPTION { + char *name; /* Name of option */ + short server_valid; /* Option valid on server */ + void (*func)(); /* Function to call */ + int arg1; /* First argument */ + int arg2; /* Second argument */ +} OPTION; + + +/* + * Parameter information. + */ +typedef struct PAR_INFO { + PAR_INDEX index; /* Index into parameter table */ + int type; /* Type */ + void *ptr; /* Pointer to value */ + char *name; /* Option name */ + int set; /* Parameter has been set */ + int used; /* Parameter has been used */ + int inuse; /* Parameter is in use */ +} PAR_INFO; + + +/* + * Parameter name association. + */ +typedef struct PAR_NAME { + char *name; /* Name */ + PAR_INDEX loc_i; /* Local index */ + PAR_INDEX rem_i; /* Remote index */ +} PAR_NAME; + + +/* + * Test prototype. + */ +typedef struct TEST { + char *name; /* Test name */ + void (*client)(void); /* Client function */ + void (*server)(void); /* Server function */ +} TEST; + + +/* + * Used to save output data for formatting. + */ +typedef struct SHOW { + char *pref; /* Name prefix */ + char *name; /* Name */ + char *data; /* Data */ + char *unit; /* Unit */ + char *altn; /* Alternative value */ +} SHOW; + + +/* + * Configuration information. + */ +typedef struct CONF { + char node[STRSIZE]; /* Node */ + char cpu[STRSIZE]; /* CPU */ + char os[STRSIZE]; /* Operating System */ + char qperf[STRSIZE]; /* Qperf version */ +} CONF; + + +/* + * Function prototypes. 
+ */ +static void add_ustat(USTAT *l, USTAT *r); +static long arg_long(char ***argvp); +static long arg_size(char ***argvp); +static char *arg_strn(char ***argvp); +static long arg_time(char ***argvp); +static void bug_die(char *fmt, ...); +static void calc_node(RESN *resn, STAT *stat); +static void calc_results(void); +static void client(TEST *test); +static int cmpsub(char *s2, char *s1); +static char *commify(char *data); +static void dec_req(REQ *host); +static void dec_stat(STAT *host); +static void dec_ustat(USTAT *host); +static void do_args(char *args[]); +static void enc_req(REQ *host); +static void enc_stat(STAT *host); +static void enc_ustat(USTAT *host); +static TEST *find_test(char *name); +static OPTION *find_option(char *name); +static void get_conf(CONF *conf); +static void get_cpu(CONF *conf); +static double get_seconds(void); +static void get_times(CLOCK timex[T_N]); +static void initialize(void); +static void init_lstat(void); +static void init_vars(void); +static int nice_1024(char *pref, char *name, long long value); +static void opt_help(OPTION *option, char ***argvp); +static void opt_misc(OPTION *option, char ***argvp); +static void opt_strn(OPTION *option, char ***argvp); +static void opt_long(OPTION *option, char ***argvp); +static void opt_size(OPTION *option, char ***argvp); +static void opt_time(OPTION *option, char ***argvp); +static void opt_vers(OPTION *option, char ***argvp); +static PAR_INFO *par_info(PAR_INDEX index); +static PAR_INFO *par_set(char *name, PAR_INDEX index); +static int par_isset(PAR_INDEX index); +static void place_any(char *pref, char *name, char *unit, char *data, + char *altn); +static void place_show(void); +static void place_val(char *pref, char *name, char *unit, double value); +static char *qasprintf(char *fmt, ...); +static int recv_sync(void); +static void run_client_conf(void); +static void run_client_quit(void); +static void run_server_conf(void); +static void run_server_quit(void); +static int 
send_recv_mesg(int sr, char *item, int fd, char *buf, int len); +static int send_sync(void); +static void server(void); +static void server_listen(void); +static int server_recv_request(void); +static void set_affinity(void); +static int set_nonblock(int fd); +static void set_signals(void); +static void show_debug(void); +static void show_info(MEASURE measure); +static void show_rest(void); +static void show_used(void); +static void sig_alrm(int signo, siginfo_t *siginfo, void *ucontext); +static char *skip_colon(char *s); +static void start_timing(int seconds); +static void strncopy(char *d, char *s, int n); +static int verbose(int type, double value); +static void view_band(int type, char *pref, char *name, double value); +static void view_cost(int type, char *pref, char *name, double value); +static void view_cpus(int type, char *pref, char *name, double value); +static void view_rate(int type, char *pref, char *name, double value); +static void view_long(int type, char *pref, char *name, long long value); +static void view_size(int type, char *pref, char *name, long long value); +static void view_strn(int type, char *pref, char *name, char *value); +static void view_time(int type, char *pref, char *name, double value); + + +/* + * Configurable variables. + */ +static int ListenPort = 19765; +static int Precision = 3; +static int ServerTimeout = 5; + + +/* + * Static variables. + */ +static REQ RReq; +static int Debug; +static uint8_t *DecodePtr; +static int ExitStatus; +static uint8_t *EncodePtr; +static STAT IStat; +static int ListenFD; +static int ProcStatFD; +static int RemoteFD; +static STAT RStat; +static int ShowIndex; +static SHOW ShowTable[256]; +static int UnifyUnits; +static int UnifyNodes; +static int VerboseConf; +static int VerboseStat; +static int VerboseTime; +static int VerboseUsed; +static int Wait; + + +/* + * Global variables. 
+ */ +RES Res; +REQ Req; +STAT LStat; +char *TestName; +char *ServerName; +int Successful; +volatile int Finished; + + +/* + * Parameter names. This is used to print out the names of the parameters that + * have been set. + */ +PAR_NAME ParName[] ={ + { "access_recv", L_ACCESS_RECV, R_ACCESS_RECV }, + { "affinity", L_AFFINITY, R_AFFINITY }, + { "flip", L_FLIP, R_FLIP }, + { "id", L_ID, R_ID }, + { "msg_size", L_MSG_SIZE, R_MSG_SIZE }, + { "mtu_size", L_MTU_SIZE, R_MTU_SIZE }, + { "no_msgs", L_NO_MSGS, R_NO_MSGS }, + { "poll_mode", L_POLL_MODE, R_POLL_MODE }, + { "port", L_PORT, R_PORT }, + { "rd_atomic", L_RD_ATOMIC, R_RD_ATOMIC }, + { "sock_buf_size", L_SOCK_BUF_SIZE, R_SOCK_BUF_SIZE }, + { "time", L_TIME, R_TIME }, + { "timeout", L_TIMEOUT, R_TIMEOUT }, +}; + + +/* + * Parameters. These must be listed in the same order as the indices are + * defined. + */ +PAR_INFO ParInfo[P_N] ={ + { P_NULL, }, + { L_ACCESS_RECV, 'l', &Req.access_recv }, + { R_ACCESS_RECV, 'l', &RReq.access_recv }, + { L_AFFINITY, 'l', &Req.affinity }, + { R_AFFINITY, 'l', &RReq.affinity }, + { L_FLIP, 'l', &Req.flip }, + { R_FLIP, 'l', &RReq.flip }, + { L_ID, 'p', &Req.id }, + { R_ID, 'p', &RReq.id }, + { L_MSG_SIZE, 's', &Req.msg_size }, + { R_MSG_SIZE, 's', &RReq.msg_size }, + { L_MTU_SIZE, 's', &Req.mtu_size }, + { R_MTU_SIZE, 's', &RReq.mtu_size }, + { L_NO_MSGS, 'l', &Req.no_msgs }, + { R_NO_MSGS, 'l', &RReq.no_msgs }, + { L_POLL_MODE, 'l', &Req.poll_mode }, + { R_POLL_MODE, 'l', &RReq.poll_mode }, + { L_PORT, 'l', &Req.port }, + { R_PORT, 'l', &RReq.port }, + { L_RATE, 'p', &Req.rate }, + { R_RATE, 'p', &RReq.rate }, + { L_RD_ATOMIC, 'l', &Req.rd_atomic }, + { R_RD_ATOMIC, 'l', &RReq.rd_atomic }, + { L_SOCK_BUF_SIZE, 's', &Req.sock_buf_size }, + { R_SOCK_BUF_SIZE, 's', &RReq.sock_buf_size }, + { L_TIME, 't', &Req.time }, + { R_TIME, 't', &RReq.time }, + { L_TIMEOUT, 't', &Req.timeout }, + { R_TIMEOUT, 't', &RReq.timeout }, +}; + + +/* + * Options. 
+ */ +OPTION Options[] ={ + { "--access_recv", 0, &opt_long, L_ACCESS_RECV, R_ACCESS_RECV }, + { "-Ar", 0, &opt_long, L_ACCESS_RECV, R_ACCESS_RECV }, + { "--affinity", 0, &opt_long, L_AFFINITY, R_AFFINITY }, + { "-a", 0, &opt_long, L_AFFINITY, R_AFFINITY }, + { "--loc_affinity", 0, &opt_long, L_AFFINITY, }, + { "-la", 0, &opt_long, L_AFFINITY, }, + { "--rem_affinity", 0, &opt_long, R_AFFINITY }, + { "-ra", 0, &opt_long, R_AFFINITY }, + { "--debug", 1, &opt_misc, 'D', }, + { "-D", 1, &opt_misc, 'D', }, + { "--flip", 0, &opt_long, L_FLIP, R_FLIP }, + { "-f", 0, &opt_long, L_FLIP, R_FLIP }, + { "--help", 0, &opt_help }, + { "-h", 0, &opt_help }, + { "--host", 0, &opt_misc, 'H', }, + { "-H", 0, &opt_misc, 'H', }, + { "--id", 0, &opt_strn, L_ID, R_ID }, + { "-i", 0, &opt_strn, L_ID, R_ID }, + { "--loc_id", 0, &opt_strn, L_ID, }, + { "-li", 0, &opt_strn, L_ID, }, + { "--rem_id", 0, &opt_strn, R_ID }, + { "-ri", 0, &opt_strn, R_ID }, + { "--listen_port", 1, &opt_misc, 'l','p' }, + { "-lp", 1, &opt_misc, 'l','p' }, + { "--msg_size", 0, &opt_size, L_MSG_SIZE, R_MSG_SIZE }, + { "-m", 0, &opt_size, L_MSG_SIZE, R_MSG_SIZE }, + { "--mtu_size", 0, &opt_size, L_MTU_SIZE, R_MTU_SIZE }, + { "-M", 0, &opt_size, L_MTU_SIZE, R_MTU_SIZE }, + { "--no_msgs", 0, &opt_long, L_NO_MSGS, R_NO_MSGS }, + { "-n", 0, &opt_long, L_NO_MSGS, R_NO_MSGS }, + { "--poll", 0, &opt_long, L_POLL_MODE, R_POLL_MODE }, + { "-P", 0, &opt_long, L_POLL_MODE, R_POLL_MODE }, + { "--loc_poll", 0, &opt_long, L_POLL_MODE, }, + { "-lP", 0, &opt_long, L_POLL_MODE, }, + { "--rem_poll", 0, &opt_long, R_POLL_MODE }, + { "-rP", 0, &opt_long, R_POLL_MODE }, + { "--port", 0, &opt_long, L_PORT, R_PORT }, + { "-p", 0, &opt_long, L_PORT, R_PORT }, + { "--precision", 0, &opt_misc, 'e', }, + { "-e", 0, &opt_misc, 'e', }, + { "--rate", 0, &opt_strn, L_RATE, R_RATE }, + { "-r", 0, &opt_strn, L_RATE, R_RATE }, + { "--loc_rate", 0, &opt_strn, L_RATE }, + { "-lr", 0, &opt_strn, L_RATE }, + { "--rem_rate", 0, &opt_strn, R_RATE }, + { 
"-rr", 0, &opt_strn, R_RATE }, + { "-rd_atomic", 0, &opt_long, L_RD_ATOMIC, R_RD_ATOMIC }, + { "-R", 0, &opt_long, L_RD_ATOMIC, R_RD_ATOMIC }, + { "--loc_rd_atomic", 0, &opt_long, L_RD_ATOMIC, }, + { "-lR", 0, &opt_long, L_RD_ATOMIC, }, + { "--rem_rd_atomic", 0, &opt_long, R_RD_ATOMIC }, + { "-rR", 0, &opt_long, R_RD_ATOMIC }, + { "--sock_buf_size", 0, &opt_size, L_SOCK_BUF_SIZE, R_SOCK_BUF_SIZE }, + { "-S", 0, &opt_size, L_SOCK_BUF_SIZE, R_SOCK_BUF_SIZE }, + { "--loc_sock_buf_size", 0, &opt_size, L_SOCK_BUF_SIZE }, + { "-lS", 0, &opt_size, L_SOCK_BUF_SIZE }, + { "--rem_sock_buf_size", 0, &opt_size, R_SOCK_BUF_SIZE }, + { "-rS", 0, &opt_size, R_SOCK_BUF_SIZE }, + { "--time", 0, &opt_time, L_TIME, R_TIME }, + { "-t", 0, &opt_time, L_TIME, R_TIME }, + { "--timeout", 0, &opt_time, L_TIMEOUT, R_TIMEOUT }, + { "-T", 0, &opt_time, L_TIMEOUT, R_TIMEOUT }, + { "--loc_timeout", 0, &opt_time, L_TIMEOUT }, + { "-lT", 0, &opt_time, L_TIMEOUT }, + { "--rem_timeout", 0, &opt_time, R_TIMEOUT }, + { "-rT", 0, &opt_time, R_TIMEOUT }, + { "--server_timeout", 0, &opt_misc, 's', 't' }, + { "-st", 0, &opt_misc, 's', 't' }, + { "--unify_nodes", 0, &opt_misc, 'U' }, + { "-U", 0, &opt_misc, 'U' }, + { "--unify_units", 0, &opt_misc, 'u' }, + { "-u", 0, &opt_misc, 'u' }, + { "--verbose", 0, &opt_misc, 'v' }, + { "-v", 0, &opt_misc, 'v' }, + { "--verbose_conf", 0, &opt_misc, 'v', 'c' }, + { "-vc", 0, &opt_misc, 'v', 'c' }, + { "--verbose_stat", 0, &opt_misc, 'v', 's' }, + { "-vs", 0, &opt_misc, 'v', 's' }, + { "--verbose_time", 0, &opt_misc, 'v', 't' }, + { "-vt", 0, &opt_misc, 'v', 't' }, + { "--verbose_used", 0, &opt_misc, 'v', 'u' }, + { "-vu", 0, &opt_misc, 'v', 'u' }, + { "--verbose_more", 0, &opt_misc, 'v', 'v' }, + { "-vv", 0, &opt_misc, 'v', 'v' }, + { "--verbose_more_conf", 0, &opt_misc, 'v', 'c' }, + { "-vC", 0, &opt_misc, 'v', 'C' }, + { "--verbose_more_stat", 0, &opt_misc, 'v', 's' }, + { "-vS", 0, &opt_misc, 'v', 'S' }, + { "--verbose_more_time", 0, &opt_misc, 'v', 't' }, + { 
"-vT", 0, &opt_misc, 'v', 'T' }, + { "--verbose_more_used", 0, &opt_misc, 'v', 'u' }, + { "-vU", 0, &opt_misc, 'v', 'U' }, + { "--version", 0, &opt_vers, }, + { "-V", 0, &opt_vers, }, + { "--wait", 0, &opt_misc, 'W', }, + { "-W", 0, &opt_misc, 'W', }, +}; + + +/* + * Tests. + */ +#define test(n) { #n, run_client_##n, run_server_##n } +TEST Tests[] ={ + test(conf), + test(quit), + test(rds_bw), + test(rds_lat), + test(sdp_bw), + test(sdp_lat), + test(tcp_bw), + test(tcp_lat), + test(udp_bw), + test(udp_lat), +#ifdef RDMA + test(rc_bi_bw), + test(rc_bw), + test(rc_compare_swap_mr), + test(rc_fetch_add_mr), + test(rc_lat), + test(rc_rdma_read_bw), + test(rc_rdma_read_lat), + test(rc_rdma_write_bw), + test(rc_rdma_write_lat), + test(rc_rdma_write_poll_lat), + test(uc_bi_bw), + test(uc_bw), + test(uc_lat), + test(uc_rdma_write_bw), + test(uc_rdma_write_lat), + test(uc_rdma_write_poll_lat), + test(ud_bi_bw), + test(ud_bw), + test(ud_lat), + test(ver_rc_compare_swap), + test(ver_rc_fetch_add), +#endif +}; + + +int +main(int argc, char *argv[]) +{ + initialize(); + set_signals(); + do_args(&argv[1]); + return ExitStatus; +} + + +/* + * Initialize. + */ +static void +initialize(void) +{ + init_vars(); +} + + +/* + * Initialize variables. + */ +static void +init_vars(void) +{ + int i; + + for (i = 0; i < P_N; ++i) + if (ParInfo[i].index != i) + bug_die("initialize: ParInfo: out of order: %d", i); + ProcStatFD = open("/proc/stat", 0); + if (ProcStatFD < 0) + syserror_die("Cannot open /proc/stat"); + IStat.no_cpus = sysconf(_SC_NPROCESSORS_ONLN); + IStat.no_ticks = sysconf(_SC_CLK_TCK); +} + + +/* + * Look for a colon and skip past it and any spaces. + */ +static char * +skip_colon(char *s) +{ + for (;;) { + int c = *s++; + if (c == ':') + break; + if (c == '\0') + return 0; + } + while (*s == ' ') + s++; + return s; +} + + +/* + * A case insensitive string compare. s2 must at least contain all of s1 but + * can be longer. 
+ */ +static int +cmpsub(char *s2, char *s1) +{ + for (;;) { + int c1 = *s1++; + int c2 = *s2++; + if (c1 == '\0') + return 1; + if (c2 == '\0') + return 0; + if (tolower(c1) != tolower(c2)) + return 0; + } +} + + +/* + * Set up signal handlers. + */ +static void +set_signals(void) +{ + struct sigaction alrm ={ .sa_sigaction = sig_alrm }; + sigaction(SIGALRM, &alrm, 0); + sigaction(SIGPIPE, &alrm, 0); +} + + +/* + * Note that time is up. + */ +static void +sig_alrm(int signo, siginfo_t *siginfo, void *ucontext) +{ + set_finished(); +} + + +/* + * Parse arguments. + */ +static void +do_args(char *args[]) +{ + int isClient = 0; + int testSpecified = 0; + + while (*args) { + char *arg = *args; + if (arg[0] == '-') { + OPTION *option = find_option(arg); + if (!option) + error_die("%s: bad option; try qperf --help", arg); + if (!option->server_valid) + isClient = 1; + option->func(option, &args); + } else { + isClient = 1; + if (!ServerName) + ServerName = arg; + else { + TEST *p = find_test(arg); + if (!p) + error_die("%s: bad test; try qperf --help", arg); + client(p); + testSpecified = 1; + } + ++args; + } + } + if (!isClient) + server(); + else if (!testSpecified) { + if (!ServerName) + error_die("You used a client only option but did not specify the " + "server name.\nDo you want to be a client or server?"); + if (find_test(ServerName)) + error_die("Must specify host name first; try qperf --help"); + error_die("Must specify a test type; try qperf --help"); + } +} + + +/* + * Given the name of an option, find it. + */ +static OPTION * +find_option(char *name) +{ + int n = cardof(Options); + OPTION *p = Options; + for (; n--; ++p) + if (streq(name, p->name)) + return p; + return 0; +} + + +/* + * Given the name of a test, find it. + */ +static TEST * +find_test(char *name) +{ + int n = cardof(Tests); + TEST *p = Tests; + for (; n--; ++p) + if (streq(name, p->name)) + return p; + return 0; +} + + +/* + * Print out a help message. 
+ */ +static void +opt_help(OPTION *option, char ***argvp) +{ + char **usage; + char *category = (*argvp)[1]; + + if (!category) + category = "main"; + for (usage = Usage; *usage; usage += 2) + if (streq(*usage, category)) + break; + if (!*usage) + error_die("Cannot find help category %s; try: qperf --help"); + printf("%s", usage[1]); + exit(0); +} + + +/* + * Handle options requiring a long argument. + */ +static void +opt_long(OPTION *option, char ***argvp) +{ + long l = arg_long(argvp); + setp_u32(option->name, option->arg1, l); + setp_u32(option->name, option->arg2, l); +} + + +/* + * Handle miscellaneous options. + */ +static void +opt_misc(OPTION *option, char ***argvp) +{ + switch (option->arg1 with (option->arg2)) { + case 'e': + Precision = arg_long(argvp); + return; + case 'u': + UnifyUnits = 1; + break; + case 'v': + VerboseConf = 1; + VerboseStat = 1; + VerboseTime = 1; + VerboseUsed = 1; + break; + case 'D': + Debug = 1; + break; + case 'H': + ServerName = arg_strn(argvp); + return; + case 'U': + UnifyNodes = 1; + break; + case 'W': + Wait = arg_time(argvp); + return; + case ('l') with ('p'): + ListenPort = arg_long(argvp); + return; + case ('s') with ('t'): + ServerTimeout = arg_time(argvp); + return; + case ('v') with ('c'): + VerboseConf = 1; + break; + case ('v') with ('s'): + VerboseStat = 1; + break; + case ('v') with ('t'): + VerboseTime = 1; + break; + case ('v') with ('u'): + VerboseUsed = 1; + break; + case ('v') with ('v'): + VerboseConf = 2; + VerboseStat = 2; + VerboseTime = 2; + VerboseUsed = 2; + break; + case ('v') with ('C'): + VerboseConf = 2; + break; + case ('v') with ('S'): + VerboseStat = 2; + break; + case ('v') with ('T'): + VerboseTime = 2; + break; + case ('v') with ('U'): + VerboseUsed = 2; + break; + default: + bug_die("opt_misc: unknown argument: %s", option->name); + } + *argvp += 1; +} + + +/* + * Handle options requiring a size argument. 
+ */ +static void +opt_size(OPTION *option, char ***argvp) +{ + long l = arg_size(argvp); + setp_u32(option->name, option->arg1, l); + setp_u32(option->name, option->arg2, l); +} + + +/* + * Handle options requiring a string argument. + */ +static void +opt_strn(OPTION *option, char ***argvp) +{ + char *s = arg_strn(argvp); + setp_str(option->name, option->arg1, s); + setp_str(option->name, option->arg2, s); +} + + +/* + * Handle options requiring a time argument. + */ +static void +opt_time(OPTION *option, char ***argvp) +{ + long l = arg_time(argvp); + setp_u32(option->name, option->arg1, l); + setp_u32(option->name, option->arg2, l); +} + + +/* + * Print out our current version. + */ +static void +opt_vers(OPTION *option, char ***argvp) +{ + printf("qperf %d.%d.%d\n", VER_MAJ, VER_MIN, VER_INC); + exit(0); +} + + +/* + * If any options were set but were not used, print out a warning message for + * the user. + */ +void +opt_check(void) +{ + PAR_INFO *p; + PAR_INFO *q; + PAR_INFO *r = endof(ParInfo); + + for (p = ParInfo; p < r; ++p) { + if (p->used || !p->set) + continue; + error("warning: %s set but not used in test %s", p->name, TestName); + for (q = p+1; q < r; ++q) + if (q->set && q->name == p->name) + q->set = 0; + } +} + + +/* + * Return the value of a long argument. It must be non-negative. + */ +static long +arg_long(char ***argvp) +{ + char **argv = *argvp; + char *p; + long l; + + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + l = strtol(argv[1], &p, 10); + if (p[0] != '\0') + error_die("Bad argument: %s", argv[1]); + if (l < 0) + error_die("%s requires a non-negative number", argv[0]); + *argvp += 2; + return l; +} + + +/* + * Return the value of a size argument. 
+ */ +static long +arg_size(char ***argvp) +{ + char *p; + long double d; + long l = 0; + char **argv = *argvp; + + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + d = strtold(argv[1], &p); + if (d < 0) + error_die("%s requires a non-negative number", argv[0]); + + if (p[0] == '\0') + l = d; + else { + if (streq(p, "kb") || streq(p, "k")) + l = (long)(d * (1000)); + else if (streq(p, "mb") || streq(p, "m")) + l = (long)(d * (1000 * 1000)); + else if (streq(p, "gb") || streq(p, "g")) + l = (long)(d * (1000 * 1000 * 1000)); + else if (streq(p, "kib") || streq(p, "K")) + l = (long)(d * (1024)); + else if (streq(p, "mib") || streq(p, "M")) + l = (long)(d * (1024 * 1024)); + else if (streq(p, "gib") || streq(p, "G")) + l = (long)(d * (1024 * 1024 * 1024)); + else + error_die("Bad argument: %s", argv[1]); + } + *argvp += 2; + return l; +} + + +/* + * Return the value of a string argument. + */ +static char * +arg_strn(char ***argvp) +{ + char **argv = *argvp; + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + *argvp += 2; + return argv[1]; +} + + +/* + * Return the value of a size argument. + */ +static long +arg_time(char ***argvp) +{ + char *p; + long double d; + + long l = 0; + char **argv = *argvp; + if (!argv[1]) + error_die("Missing argument to %s", argv[0]); + d = strtold(argv[1], &p); + if (d < 0) + error_die("%s requires a non-negative number", argv[0]); + + if (p[0] == '\0') + l = (long)d; + else { + int u = *p; + if (p[1] != '\0') + error_die("Bad argument: %s", argv[1]); + if (u == 's' || u == 'S') + l = (long)d; + else if (u == 'm' || u == 'M') + l = (long)(d * (60)); + else if (u == 'h' || u == 'H') + l = (long)(d * (60 * 60)); + else if (u == 'd' || u == 'D') + l = (long)(d * (60 * 60 * 24)); + else + error_die("Bad argument: %s", argv[1]); + } + *argvp += 2; + return l; +} + + +/* + * Set a value stored in a 32 bit value without letting anyone know we set it. 
+ */ +void +setv_u32(PAR_INDEX index, uint32_t l) +{ + PAR_INFO *p = par_info(index); + *((uint32_t *)p->ptr) = l; +} + + +/* + * Set an option stored in a 32 bit value. + */ +void +setp_u32(char *name, PAR_INDEX index, uint32_t l) +{ + PAR_INFO *p = par_set(name, index); + if (!p) + return; + *((uint32_t *)p->ptr) = l; +} + + +/* + * Set an option stored in a string vector. + */ +void +setp_str(char *name, PAR_INDEX index, char *s) +{ + PAR_INFO *p = par_set(name, index); + if (!p) + return; + if (strlen(s) >= STRSIZE) + error_die("%s: too long", s); + strcpy(p->ptr, s); +} + + +/* + * Note a parameter as being used. + */ +void +par_use(PAR_INDEX index) +{ + PAR_INFO *p = par_info(index); + p->used = 1; + p->inuse = 1; +} + + +/* + * Set the PAR_INFO.name value. + */ +static PAR_INFO * +par_set(char *name, PAR_INDEX index) +{ + PAR_INFO *p = par_info(index); + if (index == P_NULL) + return 0; + if (name) { + p->name = name; + p->set = 1; + } else { + p->used = 1; + p->inuse = 1; + if (p->name) + return 0; + } + return p; +} + + +/* + * Determine if a parameter is set. + */ +static int +par_isset(PAR_INDEX index) +{ + return par_info(index)->name != 0; +} + + +/* + * Index the ParInfo table. + */ +static PAR_INFO * +par_info(PAR_INDEX index) +{ + PAR_INFO *p = &ParInfo[index]; + + if (index != p->index) + bug_die("par_info: table out of order: %d != %d", index, p-index); + return p; +} + + +/* + * Server. 
+ */ +static void +server(void) +{ + pid_t pid; + + server_listen(); + for (;;) { + TEST *test; + + debug("waiting for request"); + if (!server_recv_request()) + continue; + if (Req.ver_maj != VER_MAJ || Req.ver_min != VER_MIN) { + int h_maj = Req.ver_maj; + int h_min = Req.ver_min; + int h_inc = Req.ver_inc; + int l_maj = VER_MAJ; + int l_min = VER_MIN; + int l_inc = VER_INC; + char *msg = "upgrade %s from %d.%d.%d to %d.%d.%d"; + char *low = "client"; + + if (l_maj > h_maj || (l_maj == h_maj && l_min > h_min)) { + h_maj = VER_MAJ; + h_min = VER_MIN; + h_inc = VER_INC; + l_maj = Req.ver_maj; + l_min = Req.ver_min; + l_inc = Req.ver_inc; + low = "server"; + } + error(msg, low, l_maj, l_min, l_inc, h_maj, h_min, h_inc); + continue; + } + if (Req.req_index >= cardof(Tests)) { + error("server: bad request index: %d", Req.req_index); + continue; + } + test = &Tests[Req.req_index]; + TestName = test->name; + debug("request is %s", TestName); + pid = fork(); + if (pid == 0) { + init_lstat(); + Finished = 0; + Successful = 0; + set_affinity(); + (test->server)(); + stop_timing(); + exit(0); + } else + waitpid(pid, 0, 0); + close(RemoteFD); + } + close(ListenFD); +} + + +/* + * Listen for any requests. 
+ */ +static void +server_listen(void) +{ + int stat; + char *service; + struct addrinfo *r; + struct addrinfo *res; + struct addrinfo hints ={ + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + + service = qasprintf("%d", ListenPort); + stat = getaddrinfo(0, service, &hints, &res); + if (stat != SUCCESS0) + error_die("getaddrinfo failed: %s", gai_strerror(stat)); + free(service); + + ListenFD = -1; + for (r = res; r; r = r->ai_next) { + ListenFD = socket(r->ai_family, r->ai_socktype, r->ai_protocol); + if (ListenFD >= 0) { + int one = 1; + stat = setsockopt(ListenFD, SOL_SOCKET, SO_REUSEADDR, + &one, sizeof(one)); + if (stat < 0) + syserror_die("setsockopt failed"); + if (bind(ListenFD, r->ai_addr, r->ai_addrlen) == SUCCESS0) + break; + close(ListenFD); + ListenFD = -1; + } + } + freeaddrinfo(res); + if (ListenFD < 0) + error_die("Unable to bind to listen port"); + + Req.timeout = ServerTimeout; + if (listen(ListenFD, LISTENQ) < 0) + syserror_die("listen failed"); +} + + +/* + * Accept a request from a client. + */ +static int +server_recv_request(void) +{ + REQ req; + socklen_t clientLen; + struct sockaddr_in clientAddr; + + clientLen = sizeof(clientAddr); + RemoteFD = accept(ListenFD, (struct sockaddr *)&clientAddr, &clientLen); + if (RemoteFD < 0) + return syserror("accept failed"); + if (!set_nonblock(RemoteFD)) + goto err; + if (!recv_mesg(&req, sizeof(req), "request data")) + goto err; + dec_init(&req); + dec_req(&Req); + return 1; + +err: + close(RemoteFD); + return 0; +} + + +/* + * Client. 
+ */ +static void +client(TEST *test) +{ + int i; + + for (i = 0; i < P_N; ++i) + ParInfo[i].inuse = 0; + if (!par_isset(L_NO_MSGS)) + setp_u32(0, L_TIME, 2); + if (!par_isset(R_NO_MSGS)) + setp_u32(0, R_TIME, 2); + setp_u32(0, L_TIMEOUT, 5); + setp_u32(0, R_TIMEOUT, 5); + par_use(L_AFFINITY); + par_use(R_AFFINITY); + par_use(L_TIME); + par_use(R_TIME); + + set_affinity(); + RReq.ver_maj = VER_MAJ; + RReq.ver_min = VER_MIN; + RReq.ver_inc = VER_INC; + RReq.req_index = test - Tests; + TestName = test->name; + debug("sending request %s", TestName); + init_lstat(); + printf("%s:\n", TestName); + Finished = 0; + Successful = 0; + (*test->client)(); + close(RemoteFD); + if (!Successful) + ExitStatus = 1; + place_show(); +} + + +/* + * Send a request to the server. + */ +void +client_send_request(void) +{ + REQ req; + int stat; + char *service; + struct addrinfo *r; + struct addrinfo *res; + struct addrinfo hints ={ + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + + service = qasprintf("%d", ListenPort); + stat = getaddrinfo(ServerName, service, &hints, &res); + if (stat != SUCCESS0) + error_die("getaddrinfo failed: %s", gai_strerror(stat)); + free(service); + + RemoteFD = -1; + if (Wait) + start_timing(Wait); + for (;;) { + for (r = res; r; r = r->ai_next) { + RemoteFD = socket(r->ai_family, r->ai_socktype, r->ai_protocol); + if (RemoteFD >= 0) { + if (connect(RemoteFD, r->ai_addr, r->ai_addrlen) == SUCCESS0) + break; + close(RemoteFD); + RemoteFD = -1; + } + } + if (RemoteFD >= 0 || !Wait || Finished) + break; + sleep(1); + } + if (Wait) + stop_timing(); + freeaddrinfo(res); + if (RemoteFD < 0) + error_die("Failed to connect"); + if (!set_nonblock(RemoteFD)) + die(); + enc_init(&req); + enc_req(&RReq); + if (!send_mesg(&req, sizeof(req), "request data")) + die(); +} + + +/* + * Set a file descriptor to non-blocking. 
+ */ +static int +set_nonblock(int fd) +{ + int one = 1; + if (ioctl(fd, FIONBIO, &one) < 0) + return syserror("failed to set to non-blocking"); + return 1; +} + + +/* + * Synchronize the client and server. + */ +int +synchronize(void) +{ + if (is_client()) { + if (!send_sync()) + return 0; + if (!recv_sync()) + return 0; + } else { + if (!recv_sync()) + return 0; + if (!send_sync()) + return 0; + } + debug("sync completed"); + start_timing(Req.time); + return 1; +} + + +/* + * Exchange results. We sync up only to ensure that the client is out of its + * loop so we can close our socket or whatever communication medium we are + * using. + */ +void +exchange_results(void) +{ + STAT stat; + + if (!Successful) + return; + Successful = 0; + if (is_client()) { + if (!recv_mesg(&stat, sizeof(stat), "results")) + return; + dec_init(&stat); + dec_stat(&RStat); + if (!send_sync()) + return; + } else { + enc_init(&stat); + enc_stat(&LStat); + if (!send_mesg(&stat, sizeof(stat), "results")) + return; + if (!recv_sync()) + return; + } + Successful = 1; +} + + +/* + * Send a synchronize message. + */ +static int +send_sync(void) +{ + return send_mesg(SYNCMESG, SYNCSIZE, "sync"); +} + + +/* + * Receive a synchronize message. + */ +static int +recv_sync(void) +{ + char data[SYNCSIZE]; + + if (!recv_mesg(data, sizeof(data), "sync")) + return 0; + if (memcmp(data, SYNCMESG, SYNCSIZE) != SUCCESS0) + return error("sync failure: data does not match"); + return 1; +} + + +/* + * Send a message to the client. + */ +int +send_mesg(void *ptr, int len, char *item) +{ + debug("sending %s", item); + return send_recv_mesg('s', item, RemoteFD, ptr, len); +} + + +/* + * Receive a response from the server. + */ +int +recv_mesg(void *ptr, int len, char *item) +{ + debug("waiting for %s", item); + return send_recv_mesg('r', item, RemoteFD, ptr, len); +} + + +/* + * Send or receive a message to a file descriptor timing out after a certain + * amount of time. 
+ */ +static int +send_recv_mesg(int sr, char *item, int fd, char *buf, int len) +{ + typedef ssize_t (IO)(int fd, void *buf, size_t count); + double etime; + fd_set *fdset; + fd_set rfdset; + fd_set wfdset; + char *action; + IO *func; + + if (sr == 'r') { + func = (IO *)read; + fdset = &rfdset; + action = "receive"; + } else { + func = (IO *)write; + fdset = &wfdset; + action = "send"; + } + + etime = get_seconds() + Req.timeout; + while (len) { + int n; + double time; + struct timeval timeval; + + errno = 0; + time = etime - get_seconds(); + if (time <= 0) + return error("failed to %s %s: timed out", action, item); + n = time += 1.0 / (1000*1000); + timeval.tv_sec = n; + timeval.tv_usec = (time-n) * 1000*1000; + + FD_ZERO(&rfdset); + FD_ZERO(&wfdset); + FD_SET(fd, fdset); + if (select(fd+1, &rfdset, &wfdset, 0, &timeval) < 0) + return syserror("failed to %s %s: select failed", action, item); + if (!FD_ISSET(fd, fdset)) + continue; + n = func(fd, buf, len); + if (n < 0) + return syserror("failed to %s %s", action, item); + if (n == 0) { + char *side = is_client() ? "server" : "client"; + return syserror("failed to %s %s: %s not responding", + action, item, side); + } + len -= n; + } + return 1; +} + + +/* + * Initialize local status information. + */ +static void +init_lstat(void) +{ + memcpy(&LStat, &IStat, sizeof(LStat)); +} + + +/* + * Show configuration (client side). + */ +static void +run_client_conf(void) +{ + CONF lconf; + CONF rconf; + + client_send_request(); + if (!recv_mesg(&rconf, sizeof(rconf), "configuration")) + return; + get_conf(&lconf); + view_strn('a', "", "loc_node", lconf.node); + view_strn('a', "", "loc_cpu", lconf.cpu); + view_strn('a', "", "loc_os", lconf.os); + view_strn('a', "", "loc_qperf", lconf.qperf); + view_strn('a', "", "rem_node", rconf.node); + view_strn('a', "", "rem_cpu", rconf.cpu); + view_strn('a', "", "rem_os", rconf.os); + view_strn('a', "", "rem_qperf", rconf.qperf); +} + + +/* + * Show configuration (server side). 
+ */ +static void +run_server_conf(void) +{ + CONF conf; + get_conf(&conf); + send_mesg(&conf, sizeof(conf), "configuration"); +} + + +/* + * Get configuration. + */ +static void +get_conf(CONF *conf) +{ + struct utsname utsname; + + uname(&utsname); + strncopy(conf->node, utsname.nodename, sizeof(conf->node)); + snprintf(conf->os, sizeof(conf->os), "%s %s", utsname.sysname, + utsname.release); + get_cpu(conf); + snprintf(conf->qperf, sizeof(conf->qperf), "%d.%d.%d", + VER_MAJ, VER_MIN, VER_INC); +} + + +/* + * Get CPU information. + */ +static void +get_cpu(CONF *conf) +{ + char count[STRSIZE]; + char speed[STRSIZE]; + char buf[BUFSIZE]; + char cpu[BUFSIZE]; + char mhz[BUFSIZE]; + + int cpus = 0; + int mixed = 0; + FILE *fp = fopen("/proc/cpuinfo", "r"); + if (!fp) + error_die("Cannot open /proc/cpuinfo"); + cpu[0] = '\0'; + mhz[0] = '\0'; + while (fgets(buf, sizeof(buf), fp)) { + int n = strlen(buf); + if (cmpsub(buf, "model name")) { + ++cpus; + if (!mixed) { + if (cpu[0] == '\0') + strncopy(cpu, buf, sizeof(cpu)); + else if (!streq(buf, cpu)) + mixed = 1; + } + } else if (cmpsub(buf, "cpu MHz")) { + if (!mixed) { + if (mhz[0] == '\0') + strncopy(mhz, buf, sizeof(mhz)); + else if (!streq(buf, mhz)) + mixed = 1; + } + } + while (n && buf[n-1] != '\n') { + if (!fgets(buf, sizeof(buf), fp)) + break; + n = strlen(buf); + } + } + fclose(fp); + + /* CPU name */ + if (mixed) + strncopy(cpu, "Mixed CPUs", sizeof(cpu)); + else { + char *p = cpu; + char *q = skip_colon(cpu); + if (!q) + return; + for (;;) { + if (*q == '(' && cmpsub(q, "(r)")) + q += 3; + else if (*q == '(' && cmpsub(q, "(tm)")) + q += 4; + if (tolower(*q) == 'c' && cmpsub(q, "cpu ")) + q += 4; + if (tolower(*q) == 'p' && cmpsub(q, "processor ")) + q += 10; + else if (q[0] == ' ' && q[1] == ' ') + q += 1; + else if (q[0] == '\n') + q += 1; + else if (!(*p++ = *q++)) + break; + } + } + + /* CPU speed */ + speed[0] = '\0'; + if (!mixed) { + int n = strlen(cpu); + if (n < 3 || cpu[n-2] != 'H' || cpu[n-1] != 
'z') { + char *q = skip_colon(mhz); + if (q) { + int freq = atoi(q); + if (freq < 1000) + snprintf(speed, sizeof(speed), " %dMHz", freq); + else + snprintf(speed, sizeof(speed), " %.1fGHz", freq/1000.0); + } + } + } + + /* Number of CPUs */ + if (cpus == 1) + count[0] = '\0'; + else if (cpus == 2) + snprintf(count, sizeof(count), "Dual-Core "); + else if (cpus == 4) + snprintf(count, sizeof(count), "Quad-Core "); + else + snprintf(count, sizeof(count), "%d-Core ", cpus); + + snprintf(conf->cpu, sizeof(conf->cpu), "%s%s%s", count, cpu, speed); +} + + +/* + * Quit (client side). + */ +static void +run_client_quit(void) +{ + opt_check(); + client_send_request(); + synchronize(); + exit(0); +} + + +/* + * Quit (server side). The read is to ensure that the client first quits to + * ensure that everything closes down cleanly. + */ +static void +run_server_quit(void) +{ + char buf[1]; + + synchronize(); + read(RemoteFD, buf, sizeof(buf)); + exit(0); +} + + +/* + * Start timing. + */ +static void +start_timing(int seconds) +{ + struct itimerval itimerval = {{0}}; + + get_times(LStat.time_s); + setitimer(ITIMER_REAL, &itimerval, 0); + if (!seconds) + return; + + debug("starting timer"); + itimerval.it_value.tv_sec = seconds; + itimerval.it_interval.tv_usec = 1; + setitimer(ITIMER_REAL, &itimerval, 0); +} + + +/* + * Stop timing. Note that the end time is obtained by the first call to + * set_finished. In the tests, usually, when SIGALRM goes off, it is executing + * a read or write system call which gets interrupted. If SIGALRM goes off + * after Finished is checked but before the system call is performed, the + * system call will be executed and it will take the second SIGALRM call + * generated by the interval timer to wake it up. Hence, we save the end times + * in sig_alrm. Note that if Finished is set, we reject any packets that are + * sent or arrive in order not to cheat. 
+ */ +void +stop_timing(void) +{ + struct itimerval itimerval = {{0}}; + + set_finished(); + setitimer(ITIMER_REAL, &itimerval, 0); + debug("stopping timer"); +} + + +/* + * Establish the current test as finished. + */ +void +set_finished(void) +{ + if (Finished++ == 0) + get_times(LStat.time_e); +} + + +/* + * Show results. + */ +void +show_results(MEASURE measure) +{ + calc_results(); + show_info(measure); +} + + +/* + * Calculate results. + */ +static void +calc_results(void) +{ + double no_msgs; + double locTime; + double remTime; + double midTime; + double gB = 1000 * 1000 * 1000; + + if (!Successful) + return; + + add_ustat(&LStat.s, &RStat.rem_s); + add_ustat(&LStat.r, &RStat.rem_r); + add_ustat(&RStat.s, &LStat.rem_s); + add_ustat(&RStat.r, &LStat.rem_r); + + memset(&Res, 0, sizeof(Res)); + calc_node(&Res.l, &LStat); + calc_node(&Res.r, &RStat); + no_msgs = LStat.r.no_msgs + RStat.r.no_msgs; + if (no_msgs) + Res.latency = Res.l.time_real / no_msgs; + + locTime = Res.l.time_real; + remTime = Res.r.time_real; + midTime = (locTime + remTime) / 2; + + if (locTime == 0 || remTime == 0) + return; + + /* Calculate messaging rate */ + if (!RStat.r.no_msgs) + Res.msg_rate = LStat.r.no_msgs / remTime; + else if (!LStat.r.no_msgs) + Res.msg_rate = RStat.r.no_msgs / locTime; + else + Res.msg_rate = (LStat.r.no_msgs + RStat.r.no_msgs) / midTime; + + /* Calculate send bandwidth */ + if (!RStat.s.no_bytes) + Res.send_bw = LStat.s.no_bytes / locTime; + else if (!LStat.s.no_bytes) + Res.send_bw = RStat.s.no_bytes / remTime; + else + Res.send_bw = (LStat.s.no_bytes + RStat.s.no_bytes) / midTime; + + /* Calculate receive bandwidth. 
*/ + if (!RStat.r.no_bytes) + Res.recv_bw = LStat.r.no_bytes / locTime; + else if (!LStat.r.no_bytes) + Res.recv_bw = RStat.r.no_bytes / remTime; + else + Res.recv_bw = (LStat.r.no_bytes + RStat.r.no_bytes) / midTime; + + /* Calculate costs */ + if (LStat.s.no_bytes && !LStat.r.no_bytes && !RStat.s.no_bytes) + Res.send_cost = Res.l.time_cpu*gB / LStat.s.no_bytes; + else if (RStat.s.no_bytes && !RStat.r.no_bytes && !LStat.s.no_bytes) + Res.send_cost = Res.r.time_cpu*gB / RStat.s.no_bytes; + if (RStat.r.no_bytes && !RStat.s.no_bytes && !LStat.r.no_bytes) + Res.recv_cost = Res.r.time_cpu*gB / RStat.r.no_bytes; + else if (LStat.r.no_bytes && !LStat.s.no_bytes && !RStat.r.no_bytes) + Res.recv_cost = Res.l.time_cpu*gB / LStat.r.no_bytes; +} + + +/* + * Determine the number of packets left to send. + */ +int +left_to_send(long *sentp, int room) +{ + int n; + + if (!Req.no_msgs) + return room; + n = Req.no_msgs - *sentp; + if (n <= 0) + return 0; + if (n > room) + return room; + return n; +} + + +/* + * Touch data. + */ +void +touch_data(void *p, int n) +{ + uint64_t a; + volatile uint64_t *p64 = p; + + while (n >= sizeof(*p64)) { + a = *p64++; + n -= sizeof(*p64); + } + if (n) { + volatile uint8_t *p8 = (uint8_t *)p64; + while (n >= sizeof(*p8)) { + a = *p8++; + n -= sizeof(*p8); + } + } +} + + +/* + * Combine statistics that the remote node kept track of with those that the + * local node kept. + */ +static void +add_ustat(USTAT *l, USTAT *r) +{ + l->no_bytes += r->no_bytes; + l->no_msgs += r->no_msgs; + l->no_errs += r->no_errs; +} + + +/* + * Calculate time values for a node. 
+ */ +static void +calc_node(RESN *resn, STAT *stat) +{ + int i; + CLOCK cpu; + double s = stat->time_e[T_REAL] - stat->time_s[T_REAL]; + + memset(resn, 0, sizeof(*resn)); + if (s == 0) + return; + if (stat->no_ticks == 0) + return; + + resn->time_real = s / stat->no_ticks; + + cpu = 0; + for (i = 0; i < T_N; ++i) + if (i != T_REAL && i != T_IDLE) + cpu += stat->time_e[i] - stat->time_s[i]; + resn->time_cpu = (float) cpu / stat->no_ticks; + + resn->cpu_user = (stat->time_e[T_USER] - stat->time_s[T_USER] + + stat->time_e[T_NICE] - stat->time_s[T_NICE]) / s; + + resn->cpu_intr = (stat->time_e[T_IRQ] - stat->time_s[T_IRQ] + + stat->time_e[T_SOFTIRQ] - stat->time_s[T_SOFTIRQ]) / s; + + resn->cpu_idle = (stat->time_e[T_IDLE] - stat->time_s[T_IDLE]) / s; + + resn->cpu_kernel = (stat->time_e[T_KERNEL] - stat->time_s[T_KERNEL] + + stat->time_e[T_STEAL] - stat->time_s[T_STEAL]) / s; + + resn->cpu_io_wait = (stat->time_e[T_IOWAIT] - stat->time_s[T_IOWAIT]) / s; + + resn->cpu_total = resn->cpu_user + resn->cpu_intr + + resn->cpu_kernel + resn->cpu_io_wait; +} + + +/* + * Show relevant values. + */ +static void +show_info(MEASURE measure) +{ + if (!Successful) + return; + if (measure == LATENCY) { + view_time('a', "", "latency", Res.latency); + view_rate('s', "", "msg_rate", Res.msg_rate); + } else if (measure == MSG_RATE) { + view_rate('a', "", "msg_rate", Res.msg_rate); + } else if (measure == BANDWIDTH) { + view_band('a', "", "bw", Res.recv_bw); + view_rate('s', "", "msg_rate", Res.msg_rate); + } else if (measure == BANDWIDTH_SR) { + view_band('a', "", "send_bw", Res.send_bw); + view_band('a', "", "recv_bw", Res.recv_bw); + view_rate('s', "", "msg_rate", Res.msg_rate); + } + show_used(); + view_cost('t', "", "send_cost", Res.send_cost); + view_cost('t', "", "recv_cost", Res.recv_cost); + show_rest(); + if (Debug) + show_debug(); +} + + +/* + * Show parameters the user set. 
+ */ +static void +show_used(void) +{ + PAR_NAME *p; + PAR_NAME *q = endof(ParName); + + if (!VerboseUsed) + return; + for (p = ParName; p < q; ++p) { + PAR_INFO *l = par_info(p->loc_i); + PAR_INFO *r = par_info(p->rem_i); + + if (!l->inuse && !r->inuse) + continue; + if (VerboseUsed < 2 && !l->set & !r->set) + continue; + if (l->type == 'l') { + uint32_t lv = *(uint32_t *)l->ptr; + uint32_t rv = *(uint32_t *)r->ptr; + if (lv == rv) + view_long('u', "", p->name, lv); + else { + view_long('u', "loc_", p->name, lv); + view_long('u', "rem_", p->name, rv); + } + } else if (l->type == 'p') { + if (streq(l->ptr, r->ptr)) + view_strn('u', "", p->name, l->ptr); + else { + view_strn('u', "loc_", p->name, l->ptr); + view_strn('u', "rem_", p->name, r->ptr); + } + } else if (l->type == 's') { + uint32_t lv = *(uint32_t *)l->ptr; + uint32_t rv = *(uint32_t *)r->ptr; + if (lv == rv) + view_size('u', "", p->name, lv); + else { + view_size('u', "loc_", p->name, lv); + view_size('u', "rem_", p->name, rv); + } + } else if (l->type == 't') { + uint32_t lv = *(uint32_t *)l->ptr; + uint32_t rv = *(uint32_t *)r->ptr; + if (lv == rv) + view_time('u', "", p->name, lv); + else { + view_time('u', "loc_", p->name, lv); + view_time('u', "rem_", p->name, rv); + } + } + } +} + + +/* + * Show the remaining parameters. 
+ */ +static void +show_rest(void) +{ + RESN *resnS; + RESN *resnR; + STAT *statS; + STAT *statR; + int srmode = 0; + + if (!UnifyNodes) { + uint64_t ls = LStat.s.no_bytes; + uint64_t lr = LStat.r.no_bytes; + uint64_t rs = RStat.s.no_bytes; + uint64_t rr = RStat.r.no_bytes; + + if (ls && !rs && rr && !lr) { + srmode = 1; + resnS = &Res.l; + resnR = &Res.r; + statS = &LStat; + statR = &RStat; + } else if (rs && !ls && lr && !rr) { + srmode = 1; + resnS = &Res.r; + resnR = &Res.l; + statS = &RStat; + statR = &LStat; + } + } + + if (srmode) { + view_cpus('t', "", "send_cpus_used", resnS->cpu_total); + view_cpus('T', "", "send_cpus_user", resnS->cpu_user); + view_cpus('T', "", "send_cpus_intr", resnS->cpu_intr); + view_cpus('T', "", "send_cpus_kernel", resnS->cpu_kernel); + view_cpus('T', "", "send_cpus_iowait", resnS->cpu_io_wait); + view_time('T', "", "send_real_time", resnS->time_real); + view_time('T', "", "send_cpu_time", resnS->time_cpu); + view_long('S', "", "send_errors", statS->s.no_errs); + view_size('S', "", "send_bytes", statS->s.no_bytes); + view_long('S', "", "send_msgs", statS->s.no_msgs); + view_long('S', "", "send_max_cqe", statS->max_cqes); + + view_cpus('t', "", "recv_cpus_used", resnR->cpu_total); + view_cpus('T', "", "recv_cpus_user", resnR->cpu_user); + view_cpus('T', "", "recv_cpus_intr", resnR->cpu_intr); + view_cpus('T', "", "recv_cpus_kernel", resnR->cpu_kernel); + view_cpus('T', "", "recv_cpus_iowait", resnR->cpu_io_wait); + view_time('T', "", "recv_real_time", resnR->time_real); + view_time('T', "", "recv_cpu_time", resnR->time_cpu); + view_long('S', "", "recv_errors", statR->r.no_errs); + view_size('S', "", "recv_bytes", statR->r.no_bytes); + view_long('S', "", "recv_msgs", statR->r.no_msgs); + view_long('S', "", "recv_max_cqe", statR->max_cqes); + } else { + view_cpus('t', "", "loc_cpus_used", Res.l.cpu_total); + view_cpus('T', "", "loc_cpus_user", Res.l.cpu_user); + view_cpus('T', "", "loc_cpus_intr", Res.l.cpu_intr); + view_cpus('T', "", 
"loc_cpus_kernel", Res.l.cpu_kernel); + view_cpus('T', "", "loc_cpus_iowait", Res.l.cpu_io_wait); + view_time('T', "", "loc_real_time", Res.l.time_real); + view_time('T', "", "loc_cpu_time", Res.l.time_cpu); + view_long('S', "", "loc_send_errors", LStat.s.no_errs); + view_long('S', "", "loc_recv_errors", LStat.r.no_errs); + view_size('S', "", "loc_send_bytes", LStat.s.no_bytes); + view_size('S', "", "loc_recv_bytes", LStat.r.no_bytes); + view_long('S', "", "loc_send_msgs", LStat.s.no_msgs); + view_long('S', "", "loc_recv_msgs", LStat.r.no_msgs); + view_long('S', "", "loc_max_cqe", LStat.max_cqes); + + view_cpus('t', "", "rem_cpus_used", Res.r.cpu_total); + view_cpus('T', "", "rem_cpus_user", Res.r.cpu_user); + view_cpus('T', "", "rem_cpus_intr", Res.r.cpu_intr); + view_cpus('T', "", "rem_cpus_kernel", Res.r.cpu_kernel); + view_cpus('T', "", "rem_cpus_iowait", Res.r.cpu_io_wait); + view_time('T', "", "rem_real_time", Res.r.time_real); + view_time('T', "", "rem_cpu_time", Res.r.time_cpu); + view_long('S', "", "rem_send_errors", RStat.s.no_errs); + view_long('S', "", "rem_recv_errors", RStat.r.no_errs); + view_size('S', "", "rem_send_bytes", RStat.s.no_bytes); + view_size('S', "", "rem_recv_bytes", RStat.r.no_bytes); + view_long('S', "", "rem_send_msgs", RStat.s.no_msgs); + view_long('S', "", "rem_recv_msgs", RStat.r.no_msgs); + view_long('S', "", "rem_max_cqe", RStat.max_cqes); + } +} + + +/* + * Show all values. 
+ */ +static void +show_debug(void) +{ + /* Local node */ + view_long('d', "", "l_no_cpus", LStat.no_cpus); + view_long('d', "", "l_no_ticks", LStat.no_ticks); + view_long('d', "", "l_max_cqes", LStat.max_cqes); + + if (LStat.no_ticks) { + double t = LStat.no_ticks; + CLOCK *s = LStat.time_s; + CLOCK *e = LStat.time_e; + double real = (e[T_REAL] - s[T_REAL]) / t; + double user = (e[T_USER] - s[T_USER]) / t; + double nice = (e[T_NICE] - s[T_NICE]) / t; + double system = (e[T_KERNEL] - s[T_KERNEL]) / t; + double idle = (e[T_IDLE] - s[T_IDLE]) / t; + double iowait = (e[T_IOWAIT] - s[T_IOWAIT]) / t; + double irq = (e[T_IRQ] - s[T_IRQ]) / t; + double softirq = (e[T_SOFTIRQ] - s[T_SOFTIRQ]) / t; + double steal = (e[T_STEAL] - s[T_STEAL]) / t; + + view_time('d', "", "l_timer_real", real); + view_time('d', "", "l_timer_user", user); + view_time('d', "", "l_timer_nice", nice); + view_time('d', "", "l_timer_system", system); + view_time('d', "", "l_timer_idle", idle); + view_time('d', "", "l_timer_iowait", iowait); + view_time('d', "", "l_timer_irq", irq); + view_time('d', "", "l_timer_softirq", softirq); + view_time('d', "", "l_timer_steal", steal); + } + + view_size('d', "", "l_s_no_bytes", LStat.s.no_bytes); + view_long('d', "", "l_s_no_msgs", LStat.s.no_msgs); + view_long('d', "", "l_s_no_errs", LStat.s.no_errs); + + view_size('d', "", "l_r_no_bytes", LStat.r.no_bytes); + view_long('d', "", "l_r_no_msgs", LStat.r.no_msgs); + view_long('d', "", "l_r_no_errs", LStat.r.no_errs); + + view_size('d', "", "l_rem_s_no_bytes", LStat.rem_s.no_bytes); + view_long('d', "", "l_rem_s_no_msgs", LStat.rem_s.no_msgs); + view_long('d', "", "l_rem_s_no_errs", LStat.rem_s.no_errs); + + view_size('d', "", "l_rem_r_no_bytes", LStat.rem_r.no_bytes); + view_long('d', "", "l_rem_r_no_msgs", LStat.rem_r.no_msgs); + view_long('d', "", "l_rem_r_no_errs", LStat.rem_r.no_errs); + + /* Remote node */ + view_long('d', "", "r_no_cpus", RStat.no_cpus); + view_long('d', "", "r_no_ticks", RStat.no_ticks); 
+ view_long('d', "", "r_max_cqes", RStat.max_cqes); + + if (RStat.no_ticks) { + double t = RStat.no_ticks; + CLOCK *s = RStat.time_s; + CLOCK *e = RStat.time_e; + + double real = (e[T_REAL] - s[T_REAL]) / t; + double user = (e[T_USER] - s[T_USER]) / t; + double nice = (e[T_NICE] - s[T_NICE]) / t; + double system = (e[T_KERNEL] - s[T_KERNEL]) / t; + double idle = (e[T_IDLE] - s[T_IDLE]) / t; + double iowait = (e[T_IOWAIT] - s[T_IOWAIT]) / t; + double irq = (e[T_IRQ] - s[T_IRQ]) / t; + double softirq = (e[T_SOFTIRQ] - s[T_SOFTIRQ]) / t; + double steal = (e[T_STEAL] - s[T_STEAL]) / t; + + view_time('d', "", "r_timer_real", real); + view_time('d', "", "r_timer_user", user); + view_time('d', "", "r_timer_nice", nice); + view_time('d', "", "r_timer_system", system); + view_time('d', "", "r_timer_idle", idle); + view_time('d', "", "r_timer_iowait", iowait); + view_time('d', "", "r_timer_irq", irq); + view_time('d', "", "r_timer_softirq", softirq); + view_time('d', "", "r_timer_steal", steal); + } + + view_size('d', "", "r_s_no_bytes", RStat.s.no_bytes); + view_long('d', "", "r_s_no_msgs", RStat.s.no_msgs); + view_long('d', "", "r_s_no_errs", RStat.s.no_errs); + + view_size('d', "", "r_r_no_bytes", RStat.r.no_bytes); + view_long('d', "", "r_r_no_msgs", RStat.r.no_msgs); + view_long('d', "", "r_r_no_errs", RStat.r.no_errs); + + view_size('d', "", "r_rem_s_no_bytes", RStat.rem_s.no_bytes); + view_long('d', "", "r_rem_s_no_msgs", RStat.rem_s.no_msgs); + view_long('d', "", "r_rem_s_no_errs", RStat.rem_s.no_errs); + + view_size('d', "", "r_rem_r_no_bytes", RStat.rem_r.no_bytes); + view_long('d', "", "r_rem_r_no_msgs", RStat.rem_r.no_msgs); + view_long('d', "", "r_rem_r_no_errs", RStat.rem_r.no_errs); +} + + +/* + * Show a cost in terms of seconds per gigabyte. 
+ */ +static void +view_cost(int type, char *pref, char *name, double value) +{ + int n = 0; + static char *tab[] ={ "ns/GB", "us/GB", "ms/GB", "sec/GB" }; + + value *= 1E9; + if (!verbose(type, value)) + return; + if (!UnifyUnits) { + while (value >= 1000 && n < (int)cardof(tab)-1) { + value /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], value); +} + + +/* + * Show the number of cpus. + */ +static void +view_cpus(int type, char *pref, char *name, double value) +{ + value *= 100; + if (!verbose(type, value)) + return; + place_val(pref, name, "% cpus", value); +} + + +/* + * Show a messaging rate. + */ +static void +view_rate(int type, char *pref, char *name, double value) +{ + int n = 0; + static char *tab[] ={ "/sec", "K/sec", "M/sec", "G/sec", "T/sec" }; + + if (!verbose(type, value)) + return; + if (!UnifyUnits) { + while (value >= 1000 && n < (int)cardof(tab)-1) { + value /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], value); +} + + +/* + * Show a number. + */ +static void +view_long(int type, char *pref, char *name, long long value) +{ + int n = 0; + double val = value; + static char *tab[] ={ "", "thousand", "million", "billion", "trillion" }; + + if (!verbose(type, val)) + return; + if (!UnifyUnits && val >= 1000*1000) { + while (val >= 1000 && n < (int)cardof(tab)-1) { + val /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], val); +} + + +/* + * Show a bandwidth value. + */ +static void +view_band(int type, char *pref, char *name, double value) +{ + int n = 0; + static char *tab[] ={ + "bytes/sec", "KB/sec", "MB/sec", "GB/sec", "TB/sec" + }; + + if (!verbose(type, value)) + return; + if (!UnifyUnits) { + while (value >= 1000 && n < (int)cardof(tab)-1) { + value /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], value); +} + + +/* + * Show a size. 
+ */ +static void +view_size(int type, char *pref, char *name, long long value) +{ + int n = 0; + double val = value; + static char *tab[] ={ "bytes", "KB", "MB", "GB", "TB" }; + + if (!verbose(type, val)) + return; + if (!UnifyUnits) { + if (nice_1024(pref, name, value)) + return; + while (val >= 1000 && n < (int)cardof(tab)-1) { + val /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], val); +} + + +/* + * Show a number if it can be expressed as a nice multiple of a power of 1024. + */ +static int +nice_1024(char *pref, char *name, long long value) +{ + char *data; + char *altn; + int n = 0; + long long val = value; + static char *tab[] ={ "KiB", "MiB", "GiB", "TiB" }; + + if (val < 1024 || val % 1024) + return 0; + val /= 1024; + while (val >= 1024 && n < (int)cardof(tab)-1) { + if (val % 1024) + return 0; + val /= 1024; + ++n; + } + data = qasprintf("%lld", val); + altn = qasprintf("%lld", value); + place_any(pref, name, tab[n], commify(data), commify(altn)); + return 1; +} + + +/* + * Show a string. + */ +static void +view_strn(int type, char *pref, char *name, char *value) +{ + if (!verbose(type, value[0] != '\0')) + return; + place_any(pref, name, 0, strdup(value), 0); +} + + +/* + * Show a time. + */ +static void +view_time(int type, char *pref, char *name, double value) +{ + int n = 0; + static char *tab[] ={ "ns", "us", "ms", "sec" }; + + value *= 1E9; + if (!verbose(type, value)) + return; + if (!UnifyUnits) { + while (value >= 1000 && n < (int)cardof(tab)-1) { + value /= 1000; + ++n; + } + } + place_val(pref, name, tab[n], value); +} + + +/* + * Determine if we are verbose enough to show a value. 
+ */ +static int +verbose(int type, double value) +{ + if (type == 'a') + return 1; + if (value <= 0) + return 0; + switch (type) { + case 'd': return Debug; + case 'c': return VerboseConf >= 1; + case 's': return VerboseStat >= 1; + case 't': return VerboseTime >= 1; + case 'u': return VerboseUsed >= 1; + case 'C': return VerboseConf >= 2; + case 'S': return VerboseStat >= 2; + case 'T': return VerboseTime >= 2; + case 'U': return VerboseUsed >= 2; + default: bug_die("verbose: bad type: %c (%o)", type, type); + } + return 0; +} + + +/* + * Place a value to be shown later. + */ +static void +place_val(char *pref, char *name, char *unit, double value) +{ + char *data = qasprintf("%.0f", value); + char *p = data; + int n = Precision; + + if (*p == '-') + ++p; + while (isdigit(*p++)) + --n; + if (n > 0) { + free(data); + data = qasprintf("%.*f", n, value); + p = &data[strlen(data)]; + while (p > data && *--p == '0') + ; + if (p > data && *p == '.') + --p; + p[1] = '\0'; + } + place_any(pref, name, unit, commify(data), 0); +} + + +/* + * Place an entry in our show table. + */ +static void +place_any(char *pref, char *name, char *unit, char *data, char *altn) +{ + SHOW *show = &ShowTable[ShowIndex++]; + if (ShowIndex > cardof(ShowTable)) + bug_die("Need to increase size of ShowTable"); + show->pref = pref; + show->name = name; + show->unit = unit; + show->data = data; + show->altn = altn; +} + + +/* + * Show all saved values. + */ +static void +place_show(void) +{ + int i; + int nameLen = 0; + int dataLen = 0; + int unitLen = 0; + + /* First compute formating sizes */ + for (i = 0; i < ShowIndex; ++i) { + int n; + SHOW *show = &ShowTable[i]; + n = (show->pref ? 
strlen(show->pref) : 0) + strlen(show->name); + if (n > nameLen) + nameLen = n; + n = strlen(show->data); + if (show->unit) { + if (n > dataLen) + dataLen = n; + n = strlen(show->unit); + if (n > unitLen) + unitLen = n; + } + } + + /* Then display results */ + for (i = 0; i < ShowIndex; ++i) { + int n = 0; + SHOW *show = &ShowTable[i]; + + printf(" "); + if (show->pref) { + n = strlen(show->pref); + printf("%s", show->pref); + } + printf("%-*s", nameLen-n, show->name); + if (show->unit) { + printf(" = %*s", dataLen, show->data); + printf(" %s", show->unit); + } else + printf(" = %s", show->data); + if (show->altn) + printf(" (%s)", show->altn); + printf("\n"); + free(show->data); + free(show->altn); + } + ShowIndex = 0; +} + + +/* + * Set the processor affinity. + */ +static void +set_affinity(void) +{ + cpu_set_t set; + int a = Req.affinity; + + if (!a) + return; + CPU_ZERO(&set); + CPU_SET(a-1, &set); + if (sched_setaffinity(0, sizeof(set), &set) < 0) + syserror_die("Cannot set processor affinity (cpu %d)", a-1); +} + + +/* + * Encode a REQ structure into a data stream. 
+ */ +static void +enc_req(REQ *host) +{ + enc_int(host->ver_maj, sizeof(host->ver_maj)); + enc_int(host->ver_min, sizeof(host->ver_min)); + enc_int(host->ver_inc, sizeof(host->ver_inc)); + enc_int(host->req_index, sizeof(host->req_index)); + enc_int(host->flip, sizeof(host->flip)); + enc_int(host->access_recv, sizeof(host->access_recv)); + enc_int(host->affinity, sizeof(host->affinity)); + enc_int(host->poll_mode, sizeof(host->poll_mode)); + enc_int(host->port, sizeof(host->port)); + enc_int(host->rd_atomic, sizeof(host->rd_atomic)); + enc_int(host->timeout, sizeof(host->timeout)); + enc_int(host->msg_size, sizeof(host->msg_size)); + enc_int(host->mtu_size, sizeof(host->mtu_size)); + enc_int(host->no_msgs, sizeof(host->no_msgs)); + enc_int(host->sock_buf_size, sizeof(host->sock_buf_size)); + enc_int(host->time, sizeof(host->time)); + enc_str(host->id, sizeof(host->id)); +} + + +/* + * Decode a REQ structure from a data stream. + */ +static void +dec_req(REQ *host) +{ + host->ver_maj = dec_int(sizeof(host->ver_maj)); + host->ver_min = dec_int(sizeof(host->ver_min)); + host->ver_inc = dec_int(sizeof(host->ver_inc)); + host->req_index = dec_int(sizeof(host->req_index)); + host->flip = dec_int(sizeof(host->flip)); + host->access_recv = dec_int(sizeof(host->access_recv)); + host->affinity = dec_int(sizeof(host->affinity)); + host->poll_mode = dec_int(sizeof(host->poll_mode)); + host->port = dec_int(sizeof(host->port)); + host->rd_atomic = dec_int(sizeof(host->rd_atomic)); + host->timeout = dec_int(sizeof(host->timeout)); + host->msg_size = dec_int(sizeof(host->msg_size)); + host->mtu_size = dec_int(sizeof(host->mtu_size)); + host->no_msgs = dec_int(sizeof(host->no_msgs)); + host->sock_buf_size = dec_int(sizeof(host->sock_buf_size)); + host->time = dec_int(sizeof(host->time)); + dec_str(host->id, sizeof(host->id)); +} + + +/* + * Encode a STAT structure into a data stream. 
+ */ +static void +enc_stat(STAT *host) +{ + int i; + + enc_int(host->no_cpus, sizeof(host->no_cpus)); + enc_int(host->no_ticks, sizeof(host->no_ticks)); + enc_int(host->max_cqes, sizeof(host->max_cqes)); + for (i = 0; i < T_N; ++i) + enc_int(host->time_s[i], sizeof(host->time_s[i])); + for (i = 0; i < T_N; ++i) + enc_int(host->time_e[i], sizeof(host->time_e[i])); + enc_ustat(&host->s); + enc_ustat(&host->r); + enc_ustat(&host->rem_s); + enc_ustat(&host->rem_r); +} + + +/* + * Decode a STAT structure from a data stream. + */ +static void +dec_stat(STAT *host) +{ + int i; + + host->no_cpus = dec_int(sizeof(host->no_cpus)); + host->no_ticks = dec_int(sizeof(host->no_ticks)); + host->max_cqes = dec_int(sizeof(host->max_cqes)); + for (i = 0; i < T_N; ++i) + host->time_s[i] = dec_int(sizeof(host->time_s[i])); + for (i = 0; i < T_N; ++i) + host->time_e[i] = dec_int(sizeof(host->time_e[i])); + dec_ustat(&host->s); + dec_ustat(&host->r); + dec_ustat(&host->rem_s); + dec_ustat(&host->rem_r); +} + + +/* + * Encode a USTAT structure into a data stream. + */ +static void +enc_ustat(USTAT *host) +{ + enc_int(host->no_bytes, sizeof(host->no_bytes)); + enc_int(host->no_msgs, sizeof(host->no_msgs)); + enc_int(host->no_errs, sizeof(host->no_errs)); +} + + +/* + * Decode a USTAT structure from a data stream. + */ +static void +dec_ustat(USTAT *host) +{ + host->no_bytes = dec_int(sizeof(host->no_bytes)); + host->no_msgs = dec_int(sizeof(host->no_msgs)); + host->no_errs = dec_int(sizeof(host->no_errs)); +} + + +/* + * Initialize encode pointer. + */ +void +enc_init(void *p) +{ + EncodePtr = p; +} + + +/* + * Initialize decode pointer. + */ +void +dec_init(void *p) +{ + DecodePtr = p; +} + + +/* + * Encode a string. + */ +void +enc_str(char *s, int n) +{ + memcpy(EncodePtr, s, n); + EncodePtr += n; +} + + +/* + * Decode a string. + */ +void +dec_str(char *s, int n) +{ + memcpy(s, DecodePtr, n); + DecodePtr += n; +} + + +/* + * Encode an integer. 
+ */ +void +enc_int(int64_t l, int n) +{ + while (n--) { + *EncodePtr++ = l; + l >>= 8; + } +} + + +/* + * Decode an integer. + */ +int64_t +dec_int(int n) +{ + uint64_t l = 0; + uint8_t *p = (DecodePtr += n); + while (n--) + l = (l << 8) | (*--p & 0xFF); + return l; +} + + +/* + * Get various temporal parameters. + */ +static void +get_times(CLOCK timex[T_N]) +{ + int n; + char *p; + char buf[BUFSIZE]; + struct tms tms; + + timex[0] = times(&tms); + if (lseek(ProcStatFD, 0, 0) < 0) + syserror_die("Failed to seek /proc/stat"); + n = read(ProcStatFD, buf, sizeof(buf)-1); + buf[n] = '\0'; + if (strncmp(buf, "cpu ", 4)) + error_die("/proc/stat does not start with 'cpu '"); + p = &buf[3]; + for (n = 1; n < T_N; ++n) { + while (*p == ' ') + ++p; + if (!isdigit(*p)) { + if (*p != '\n' || n < T_N-1) + error_die("/proc/stat has bad format"); + break; + } + timex[n] = strtoll(p, 0, 10); + while (*p != ' ' && *p != '\n' && *p != '\0') + ++p; + } + while (n < T_N) + timex[n++] = 0; +} + + +/* + * Get the time of day in seconds as a floating point number. + */ +static double +get_seconds(void) +{ + struct timeval timeval; + + if (gettimeofday(&timeval, 0) < 0) + syserror_die("gettimeofday failed"); + return timeval.tv_sec + timeval.tv_usec/(1000.0*1000.0); +} + + +/* + * Insert commas within a number for readability. 
+ */ +static char * +commify(char *data) +{ + int s; + int d; + int seqS; + int seqE; + int dataLen; + int noCommas; + + if (!data) + return data; + if (UnifyUnits) + return data; + dataLen = strlen(data); + seqS = seqE = dataLen; + while (--seqS >= 0) + if (!isdigit(data[seqS])) + break; + if (seqS >= 0 && data[seqS] == '.') { + seqE = seqS; + while (--seqS >= 0) + if (!isdigit(data[seqS])) + break; + } + noCommas = (--seqE - ++seqS) / 3; + if (noCommas == 0) + return data; + data = realloc(data, dataLen+noCommas+1); + if (!data) + error_die("Out of space"); + s = dataLen; + d = dataLen + noCommas; + for (;;) { + int n; + data[d--] = data[s--]; + n = seqE - s; + if (n > 0 && n%3 == 0) { + data[d--] = ','; + if (--noCommas == 0) + break; + } + } + return data; +} + + +/* + * Like strncpy but ensures the destination is null terminated. + */ +static void +strncopy(char *d, char *s, int n) +{ + strncpy(d, s, n); + d[n-1] = '\0'; +} + + +/* + * Call malloc and panic on error. + */ +void * +qmalloc(long n) +{ + void *p = malloc(n); + if (!p) + error_die("Out of space"); + return p; +} + + +/* + * Print out an error message and exit. + */ +static char * +qasprintf(char *fmt, ...) +{ + int stat; + char *str; + va_list alist; + + va_start(alist, fmt); + stat = vasprintf(&str, fmt, alist); + va_end(alist); + if (stat < 0) + error_die("Out of space"); + return str; +} + + +/* + * Print out a debug message. + */ +void +debug(char *fmt, ...) +{ + va_list alist; + + if (!Debug) + return; + va_start(alist, fmt); + vfprintf(stderr, fmt, alist); + va_end(alist); + fprintf(stderr, "\n"); +} + + +/* + * Print out an error message. + */ +int +error(char *fmt, ...) +{ + va_list alist; + + va_start(alist, fmt); + vfprintf(stderr, fmt, alist); + va_end(alist); + fprintf(stderr, "\n"); + return 0; +} + + +/* + * Print out an error message and exit. + */ +void +error_die(char *fmt, ...) 
+{ + va_list alist; + + va_start(alist, fmt); + vfprintf(stderr, fmt, alist); + va_end(alist); + fprintf(stderr, "\n"); + die(); +} + + +/* + * Print out a system error message. + */ +int +syserror(char *fmt, ...) +{ + va_list alist; + + va_start(alist, fmt); + vfprintf(stderr, fmt, alist); + va_end(alist); + if (errno) + fprintf(stderr, ": %s", strerror(errno)); + fprintf(stderr, "\n"); + return 0; +} + +/* + * Print out a system error message and exit. + */ +void +syserror_die(char *fmt, ...) +{ + va_list alist; + + va_start(alist, fmt); + vfprintf(stderr, fmt, alist); + va_end(alist); + if (errno) + fprintf(stderr, ": %s", strerror(errno)); + fprintf(stderr, "\n"); + die(); +} + + +/* + * Print out an internal error and exit. + */ +static void +bug_die(char *fmt, ...) +{ + va_list alist; + + fprintf(stderr, "internal error: "); + va_start(alist, fmt); + vfprintf(stderr, fmt, alist); + va_end(alist); + fprintf(stderr, "\n"); + die(); +} + + +/* + * Exit unsuccessfully. + */ +void +die(void) +{ + exit(1); +} diff --git a/qperf.h b/qperf.h new file mode 100644 index 0000000..0bf5774 --- /dev/null +++ b/qperf.h @@ -0,0 +1,316 @@ +/* + * qperf - general header file. + * + * Copyright (c) 2002-2007 Johann George. All rights reserved. + * Copyright (c) 2006-2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/* + * Parameters. + */ +#define STRSIZE 64 + + +/* + * For convenience and readability. + */ +#define SUCCESS0 0 +#define cardof(a) (sizeof(a)/sizeof(*a)) +#define endof(a) (&a[cardof(a)]) +#define streq(a, b) (strcmp(a, b) == 0) +#define is_client() (ServerName) + +typedef uint64_t CLOCK; + + +/* + * Time indices. + */ +typedef enum { + T_REAL, + T_USER, + T_NICE, + T_KERNEL, + T_IDLE, + T_IOWAIT, + T_IRQ, + T_SOFTIRQ, + T_STEAL, + T_N +} TIME_INDEX; + + +/* + * Parameter indices. P_NULL must be 0. + */ +typedef enum { + P_NULL, + L_ACCESS_RECV, + R_ACCESS_RECV, + L_AFFINITY, + R_AFFINITY, + L_FLIP, + R_FLIP, + L_ID, + R_ID, + L_MSG_SIZE, + R_MSG_SIZE, + L_MTU_SIZE, + R_MTU_SIZE, + L_NO_MSGS, + R_NO_MSGS, + L_POLL_MODE, + R_POLL_MODE, + L_PORT, + R_PORT, + L_RATE, + R_RATE, + L_RD_ATOMIC, + R_RD_ATOMIC, + L_SOCK_BUF_SIZE, + R_SOCK_BUF_SIZE, + L_TIME, + R_TIME, + L_TIMEOUT, + R_TIMEOUT, + P_N +} PAR_INDEX; + + +/* + * What we are measuring. + */ +typedef enum { + LATENCY, + MSG_RATE, + BANDWIDTH, + BANDWIDTH_SR +} MEASURE; + + +/* + * Request to the server. Note that most of these must be of type uint32_t + * because of the way options are set. The minor version must be changed if + * there is change to this data structure. 
+ */ +typedef struct REQ { + uint16_t ver_maj; /* Major version */ + uint16_t ver_min; /* Minor version */ + uint16_t ver_inc; /* Incremental version */ + uint16_t req_index; /* Request index (into Tests) */ + uint32_t flip; /* Flip local/remote node functions */ + uint32_t access_recv; /* Access data after receiving */ + uint32_t affinity; /* Processor affinity */ + uint32_t poll_mode; /* Poll mode */ + uint32_t port; /* Port number requested */ + uint32_t rd_atomic; /* Number of pending RDMA or atomics */ + uint32_t timeout; /* Timeout for messages */ + uint32_t msg_size; /* Message Size */ + uint32_t mtu_size; /* MTU Size */ + uint32_t no_msgs; /* Number of messages */ + uint32_t sock_buf_size; /* Socket buffer size */ + uint32_t time; /* Duration in seconds */ + char id[STRSIZE]; /* Identifier */ + char rate[STRSIZE]; /* Rate */ +} REQ; + + +/* + * Transfer statistics. + */ +typedef struct USTAT { + uint64_t no_bytes; /* Number of bytes transfered */ + uint64_t no_msgs; /* Number of messages */ + uint64_t no_errs; /* Number of errors */ +} USTAT; + + +/* + * Statistics. + */ +typedef struct STAT { + uint32_t no_cpus; /* Number of processors */ + uint32_t no_ticks; /* Ticks per second */ + uint32_t max_cqes; /* Maximum CQ entries */ + CLOCK time_s[T_N]; /* Start times */ + CLOCK time_e[T_N]; /* End times */ + USTAT s; /* Send statistics */ + USTAT r; /* Receive statistics */ + USTAT rem_s; /* Remote send statistics */ + USTAT rem_r; /* Remote receive statistics */ +} STAT; + + +/* + * Results per node. 
+ */ +typedef struct RESN { + double time_real; /* Real (elapsed) time in seconds */ + double time_cpu; /* Cpu time in seconds */ + double cpu_total; /* Cpu time (as a fraction of a cpu) */ + double cpu_user; /* User time (fraction of cpu) */ + double cpu_intr; /* Interrupt time (fraction of cpu) */ + double cpu_idle; /* Idle time (fraction of cpu) */ + double cpu_kernel; /* Kernel time (fraction of cpu) */ + double cpu_io_wait; /* IO wait time (fraction of cpu) */ +} RESN; + + +/* + * Results. + */ +typedef struct RES { + RESN l; /* Local information */ + RESN r; /* Remote information */ + double send_bw; /* Send bandwidth */ + double recv_bw; /* Receive bandwidth */ + double msg_rate; /* Messaging rate */ + double send_cost; /* Send cost */ + double recv_cost; /* Receive cost */ + double latency; /* Latency */ +} RES; + + +/* + * Functions prototypes. + */ +void client_send_request(void); +void debug(char *fmt, ...); +void dec_init(void *p); +int64_t dec_int(int n); +void dec_str(char *s, int n); +void die(void); +void enc_init(void *p); +void enc_int(int64_t l, int n); +void enc_str(char *s, int n); +int error(char *fmt, ...); +void error_die(char *fmt, ...); +void exchange_results(void); +int left_to_send(long *sentp, int room); +void opt_check(void); +void *qmalloc(long n); +int recv_mesg(void *ptr, int len, char *item); +int send_mesg(void *ptr, int len, char *item); +void set_finished(void); +void setp_u32(char *name, PAR_INDEX index, uint32_t l); +void setp_str(char *name, PAR_INDEX index, char *s); +void setv_u32(PAR_INDEX index, uint32_t l); +void show_results(MEASURE measure); +void stop_timing(void); +int synchronize(void); +int syserror(char *fmt, ...); +void syserror_die(char *fmt, ...); +void touch_data(void *p, int n); +void par_use(PAR_INDEX index); + + +/* + * Socket tests (ip.c). 
+ */ +void run_client_rds_bw(void); +void run_server_rds_bw(void); +void run_client_rds_lat(void); +void run_server_rds_lat(void); +void run_client_sdp_bw(void); +void run_server_sdp_bw(void); +void run_client_sdp_lat(void); +void run_server_sdp_lat(void); +void run_client_tcp_bw(void); +void run_server_tcp_bw(void); +void run_client_tcp_lat(void); +void run_server_tcp_lat(void); +void run_client_udp_bw(void); +void run_server_udp_bw(void); +void run_client_udp_lat(void); +void run_server_udp_lat(void); + + +/* + * InfiniBand tests (ib.c). + */ +void run_client_bug(void); +void run_server_bug(void); +void run_client_rc_bi_bw(void); +void run_server_rc_bi_bw(void); +void run_client_rc_bw(void); +void run_server_rc_bw(void); +void run_client_rc_compare_swap_mr(void); +void run_server_rc_compare_swap_mr(void); +void run_client_rc_fetch_add_mr(void); +void run_server_rc_fetch_add_mr(void); +void run_client_rc_lat(void); +void run_server_rc_lat(void); +void run_client_rc_rdma_read_bw(void); +void run_server_rc_rdma_read_bw(void); +void run_client_rc_rdma_read_lat(void); +void run_server_rc_rdma_read_lat(void); +void run_client_rc_rdma_write_bw(void); +void run_server_rc_rdma_write_bw(void); +void run_client_rc_rdma_write_lat(void); +void run_server_rc_rdma_write_lat(void); +void run_client_rc_rdma_write_poll_lat(void); +void run_server_rc_rdma_write_poll_lat(void); +void run_client_uc_bi_bw(void); +void run_server_uc_bi_bw(void); +void run_client_uc_bw(void); +void run_server_uc_bw(void); +void run_client_uc_lat(void); +void run_server_uc_lat(void); +void run_client_uc_rdma_write_bw(void); +void run_server_uc_rdma_write_bw(void); +void run_client_uc_rdma_write_lat(void); +void run_server_uc_rdma_write_lat(void); +void run_client_uc_rdma_write_poll_lat(void); +void run_server_uc_rdma_write_poll_lat(void); +void run_client_ud_bi_bw(void); +void run_server_ud_bi_bw(void); +void run_client_ud_bw(void); +void run_server_ud_bw(void); +void run_client_ud_lat(void); +void 
run_server_ud_lat(void); +void run_client_ver_rc_compare_swap(void); +void run_server_ver_rc_compare_swap(void); +void run_client_ver_rc_fetch_add(void); +void run_server_ver_rc_fetch_add(void); + + +/* + * Variables. + */ +extern RES Res; +extern REQ Req; +extern STAT LStat; +extern char *Usage[]; +extern char *TestName; +extern char *ServerName; +extern int Successful; +extern volatile int Finished;