-
Notifications
You must be signed in to change notification settings - Fork 1
/
ib_bench.c
2197 lines (1854 loc) · 72.2 KB
/
ib_bench.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* ib_bench.c */
#define IB_BENCH_VERSION_STRING "0.3 July 28, 2011"
/* Copyright 2011, System Fabric Works, Inc., All Rights Reserved
Compile and link under Linux and OFED using:
cc -g -O0 -o ib_bench ib_bench.c -lrdmacm -libverbs -lpthread -lrt
cc -Wall -g -O2 -o ib_bench ib_bench.c -lrdmacm -libverbs -lpthread -lrt
This program is loosely based along the lines of the OFED perftest
programs ib_{send,read,write}_{bw,lat} but is intended to handle the
following:
1) Use rdma_cm for all connection management, capable of
running under all OFED supported RDMA technologies.
2) Allow the use of two HCAs simultaneously for high speeds.
3) Allow an arbitrary number of queue pairs to be used.
4) Allow a bandwidth limitation on the queue pairs
The program is invoked with optional arguments (below) and then either
nothing (indicating passive side server) or one or two IP addresses to
connect to (indicating active side client).
Optional arguments and their meaning:
-p --port N The rdma_cm port number, Default is 18516
The port number is used by the passive side to set up
a listening mechanism, and by the active side to make
the connection to the passive. In Infiniband rdma_cm,
the port space is disjoint between RC and UD, and is
also disjoint from TCP/IP or UDP/IP.
--bind0 IP A local IP address to bind to for first HCA
The passive side can listen to all interfaces, which is
the default, or it can be bound to listen to specific
interfaces. The active side can just connect to the
target address, or it can request rdma_cm to make the
connection through a specific interface as defined by
this argument.
--bind1 IP A local IP address to bind to for second HCA
* -m --mtu N The MTU to use, Default is 2048
This parameter is informational, to tell ib_bench what
the MTU is. Connection MTU cannot be known until the
connection is established. It is possible to modify
the MTU value for Infiniband, but this is not done.
-c --connection RC/UD The connection type, Default RC
These are the only two connection types supported by
rdma-cm. The UD connections only support send, and do
not support RDMA read or write. Performance measurement
is different for UD. The sender can only measure how
many packets have been sent, while the receiver can only
measure how many actually were received. It is a good
idea to capture reports from both sides and compare.
* -o --operation SEND/WRITE/READ The type of test to run, Default SEND
The operation is always with respect to the client or
active side, connecting to the server or passive side.
Data moves from client to server for send and write,
and from server to client for read. One other choice
is READWRITE (or WRITEREAD) which will alternate
connections between doing a read and write to use
bidirectional bandwidth.
* -l --latency Show performance as latency instead of bandwidth
Latency measurements are round trip, from entry into
send queue until exit from completion queue. The
transmit queue depth impacts the apparent latency.
* -s --size N The message size to use, Default is 65536
All transfers use the same size. The size is for the
actual useful data payload, and does not include any
required overhead on the transport. The size for UD
connections may not be larger than the MTU. For RC
the messages will be segmented as required.
* -a --all Use all available message sizes in powers of 2
This option will automatically change the message size
every time a report is printed. If no reports are done,
the message size will not change. Note that the change
in size only applies to new messages. The ones that
are already in the transmit queue remain at whatever
size they were when they were queued. It is difficult
to get useful performance information on any particular
size message when this is used.
* -t --tx-depth N The depth of the transmit queue, Default 250
This can have a significant impact on the ability of
an interface to keep the pipeline full for a single
connection. Also, since latency is measured from the
time the request is queued until the request completes,
a depth larger than 1 will show longer latency than the
latency of a single request.
* -r --rx-depth N The depth of the receive queue, Default 500
This number should be kept large enough for the server
to always have a receive posted on every connection, or
the performance will be depressed by receiver delay.
* -u --qp-timeout N The timeout for the queue pairs
Carried over from other tests, currently unused.
* -n --number N The number of queue pairs to test, Default 1
There appears to be a limit of just over 28,000 rdma-cm
connections from one system. The default value is
actually 2 if there are two servers specified by the
client, with at least one created for each server.
-d --duration N The time to run the tests in seconds, Default forever
The test will be terminated after this amount of time.
The countdown does not start until all connections have
been established. If the duration is a multiple of the
report time, the report will print before termination.
--report N The time between reports, Default 6 seconds
Performance data is continuously tracked once all of
the connections have been established. An asynchronous
thread prints out this information at the specified
interval. The data shows the performance since the
last report to the left of a // mark, and the total
performance data since the beginning to the right.
Due to the asynchronous nature, the device and queue
pair performance information may not exactly match, as
operations may complete as the report is being printed.
The report shows an average except for where the labels
"min" and "max" appear. These are the single transfer
minimum and maximum values. This measures completions
that have happened in the reporting period.
--cqperqp Create one CQ for each QP instead of one per device
There is a choice of a single completion queue for each
interface, resulting in one or two queues, or else a
completion queue for each connection. The default is
one per device.
* --max_rd_atomic N Responder resource control, default 4
The value set in the initiator_depth field for
rdma_accept, or the responder_resources field for
rdma_connect.
* --max_dest_rd_atomic N Responder resource control, default 4
The value set in the responder_resources field for
rdma_accept, or the initiator_depth field for
rdma_connect.
* --min_rnr_timer N Minimum RNR NAK timer
Carried over from other tests, currently unused.
* --timeout N Local ack timeout
Carried over from other tests, currently unused.
* --retry_cnt N Retry count, default value 6
The value set in the retry_count field for
rdma_connect or rdma_accept
* --rnr_retry N RNR retry, default value 6
The value set in the rnr_retry_count field for
rdma_connect or rdma_accept
-b --bandwidth F The bandwidth limit for each connection in MB/second
-h --help Display the usage help text
Note: The --connection type and the --port number must match on both
sides to make the connection.
The optional --bind0 and --bind1 are local to each side.
The --duration and --report values are local to each side so they
can differ. Note that --report 0 means do not report.
The --cqperqp option is local to each side so they can differ.
The rest of the parameters, marked with an *, are ignored on the
passive side, as they are overwritten by the active's incoming
connection. If they are given, the message "Arguments given to
passive side that will be overwritten by active connection." is
written to standard output as a reminder, to support running
both sides with (mostly) the same arguments.
The passive side will exit when all connections have been shut
down for RC connections. For UD connections this cannot be
detected and the passive side will have to be stopped with some
kind of signal, like that generated by control-C. If no
connections are ever seen in RC mode, then the passive side will
never exit, since it will never see the connections shut down.
The performance report includes totals which are calculated as
lists are processed. The device report happens first, and then
the set of connections are processed. Data continues to flow
during this processing, so the total from the queue pair report
may be higher than the device report, especially when there are
a large number of connections. This difference is very small,
and is left intact because it helps assure that connections are
progressing as expected.
The performance report shows the rate for the last reporting
period, followed by //, followed by the cumulative rate since
all of the connections were established. Each line has a unique
string to facilitate awk/grep/etc extraction of information.
In latency mode, the performance report also shows the all-time
single-message minimum and maximum latency for each device. The
latency is measured from the time the message is posted to the
send queue until the time the message is acknowledged as seen
by the appearance of the work completion queue entry. This is
full end-to-end round trip latency, continuously measured for
all messages.
*/
#include <getopt.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netdb.h>
#include <byteswap.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <pthread.h>
#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <signal.h>
#include <sys/statfs.h>
#include <linux/fs.h>
#include <sched.h>
#ifdef WITH_XSP
#include "libxsp_client.h"
#endif
/* Compile-time defaults and limits used by the option variables below */
#define DEFAULT_PORT 18516 /* Default port number used for rdma_connect */
#define DEFAULT_MTU 2048 /* Default MTU to use */
#define DEFAULT_SIZE 65536 /* Default message size to use */
#define MAX_LISTEN_QUEUE 256 /* Number of incoming connects to queue */
#define MAX_CONNECT_QUEUE 192 /* Number of outgoing connects in progress */
/* Option values, see head of this file for the meaning of each one. */
/* Each variable is initialized to its default here.  On the passive */
/* side, incoming_connect() overwrites the options that travel in */
/* ib_bench_conn_data with the values sent by the first connection. */
/* -x --xsp_hop host/port The address of the XSP daemon */
char *xsp_hop = NULL;
/* -p --port N The rdma_cm port number to listen on or connect to */
int arg_port = DEFAULT_PORT;
/* --bind0 IP A local IP address to bind to */
/* --bind1 IP A local IP address to bind to */
char *arg_bind0 = 0; /* Text version of bind0 local IP address */
char *arg_bind1 = 0; /* Text version of bind1 local IP address */
struct sockaddr_in sa_bind0; /* Binary version of bind0 local IP address */
struct sockaddr_in sa_bind1; /* Binary version of bind1 local IP address */
/* -m --mtu N The MTU to use */
int arg_mtu = DEFAULT_MTU;
/* -c --connection RC/UD The connection type */
#define BENCH_CONN_RC 1
#define BENCH_CONN_UD 2
int arg_connection = BENCH_CONN_RC;
/* -o --operation SEND/WRITE/READ The type of test to run */
/* The values are distinct single bits (1, 2, 4, 8) */
#define BENCH_SEND 1 /* Doing send tests */
#define BENCH_WRITE 2 /* Doing rdma write tests */
#define BENCH_READ 4 /* Doing rdma read tests */
#define BENCH_RDWR 8 /* Doing rdma read and write tests */
int arg_operation = BENCH_SEND;
/* -l --latency Show performance as latency instead of bandwidth */
int arg_latency = 0;
/* -s --size N The message size to use */
#define BENCH_MIN_MSG 1 /* Minimum message size */
#define BENCH_MAX_MSG (2<<29) /* Maximum message size, 2^30 bytes (1 GiB) */
int arg_size = DEFAULT_SIZE; /* Test message size */
/* -a --all Use all available message sizes in powers of 2 */
int arg_all = 0;
/* -t --tx-depth N The depth of the transmit queue on sender queue pair */
int arg_tx_depth = 250;
/* -r --rx-depth N The depth of the receive queue on receiver queue pair */
int arg_rx_depth = 500;
/* -u --qp-timeout N The timeout for the queue pairs */
int arg_qp_timeout = 0;
/* -n --number N The number of queue pairs to test */
/* Starts at 0; the head of this file documents an effective default of 1 */
/* (2 with two servers) - presumably applied after option parsing, which */
/* is not visible here.  TODO confirm. */
int arg_number = 0;
/* -d --duration N The amount of time to run the tests in seconds */
int arg_duration = 0;
/* --report N The amount of time between performance reports (sec) */
int arg_report = 6;
/* --cqperqp Create one CQ for each QP instead of one per device */
int arg_cqperqp = 0;
/* --max_rd_atomic N Responder resource control */
int arg_max_rd_atomic = 4;
/* --max_dest_rd_atomic N Responder resource control */
int arg_max_dest_rd_atomic = 4;
/* --min_rnr_timer N Minimum RNR NAK timer */
int arg_min_rnr_timer = 0;
/* --timeout N Local ack timeout */
int arg_timeout = 0;
/* --retry_cnt N Retry count */
int arg_retry_cnt = 6;
/* --rnr_retry N RNR retry */
int arg_rnr_retry = 6;
/* -b --bandwidth F The bandwidth limit for each connection in MB/second */
float arg_bandwidth = 0.0;
/* This global variable allows a test to see if an option was actually given
on the command, or if the value it has is from the defaults.  Each ARG_*
mask below is a distinct power of two, one per command line option. */
int options_present = 0; /* Bit set from masks below when option is given */
#define ARG_port 1
#define ARG_bind0 2
#define ARG_bind1 4
#define ARG_mtu 8
#define ARG_connection 16
#define ARG_operation 32
#define ARG_latency 64
#define ARG_size 128
#define ARG_all 256
#define ARG_tx_depth 512
#define ARG_rx_depth 1024
#define ARG_qp_timeout 2048
#define ARG_number 4096
#define ARG_duration 8192
/* NOTE(review): "adomic" looks like a typo for "atomic" (--max_rd_atomic).
The spelling is kept because code outside this view may use this name. */
#define ARG_max_rd_adomic 16384
#define ARG_max_dest_rd_atomic 32768
#define ARG_min_rnr_timer 65536
#define ARG_timeout 131072
#define ARG_retry_cnt 262144
#define ARG_rnr_retry 524288
#define ARG_bandwidth 1048576
#define ARG_report 2097152
#define ARG_cqperqp 4194304
/* Structure used during connection to communicate test parameters from */
/* the initiation (client) side to the responder (server) side. */
/* The maximum size of the structure is 56 bytes for Infiniband. */
/* Currently this structure is 52 bytes for x86_64 architectures: */
/* 4 x uint8_t + 9 x uint16_t = 22 bytes, 2 bytes of padding so the */
/* uint32_t fields are 4-byte aligned, 6 x uint32_t = 24, float = 4. */
/* incoming_connect() reads every field as-is, with no byte order */
/* conversion, fills in the "returned during accept" fields, and hands */
/* the same buffer back as the private data of rdma_accept(). */
/* Do not reorder or resize fields without changing IB_BENCH_VERSION. */
typedef struct {
uint8_t version; /* Identify version of structure in first byte */
#define IB_BENCH_VERSION 1 /* Change when the structure changes */
uint8_t arg_all; /* Send all arguments to the other side */
uint8_t arg_latency;
uint8_t arg_operation;
uint16_t arg_max_dest_rd_atomic;
uint16_t arg_max_rd_atomic;
uint16_t arg_min_rnr_timer;
uint16_t arg_mtu;
uint16_t arg_retry_cnt;
uint16_t arg_rnr_retry;
uint16_t arg_rx_depth;
uint16_t arg_timeout;
uint16_t arg_tx_depth;
uint32_t arg_number;
uint32_t arg_qp_timeout;
uint32_t arg_size;
uint32_t rkey; /* These 3 returned during accept */
uint32_t remote_addr_low; /* Low 32 bits of the passive side buffer address */
uint32_t remote_addr_high; /* High 32 bits of the passive side buffer address */
#define ud_send_qkey rkey /* For UD SEND, reuse fields */
#define ud_send_qpn remote_addr_low
float arg_bandwidth;
} ib_bench_conn_data;
/* Global Variables */
char *prog_name; /* Holds a copy of argv[0] */
/* Cleared by err_exit() and perror_exit(); event_thread() tests it as */
/* part of its loop condition. */
int not_exiting = 1; /* True until we are exiting */
/* get_vpd_pd_cq_mr() treats a non-NULL connect_to_0 as "this process is */
/* the active (client) side". */
char *connect_to_0 = 0; /* Text version of destination IP address */
char *connect_to_1 = 0; /* Text version of second destination IP address */
struct sockaddr_in sa_connect_to_0; /* Binary of connect_to_0 IP address */
struct sockaddr_in sa_connect_to_1; /* Binary of connect_to_1 IP address */
/* Per-device state shared by every connection that uses the same RDMA
device.  Entries form a singly linked list headed by verbs_per_devs;
get_vpd_pd_cq_mr() allocates each entry zero-filled with calloc(),
sets min_latency to 0x7FFFFFFFFFFFFFFF and inserts it at the head. */
struct verbs_per_dev {
/* Elements created for all connections on a
single rdma device */
struct verbs_per_dev *next; /* Pointer to the next device's info */
struct ibv_context *verbs_dev; /* Identifies the unique device */
char *dev_name; /* Holds the name of the device */
struct ibv_pd *pd; /* Protection domain for this device */
struct ibv_mr *mr; /* Memory region for all possible areas */
struct ibv_cq *cq; /* Completion queue for device if not cqperqp */
struct ibv_device_attr da; /* ibv_query_device attributes */
uint64_t min_latency; /* Minimum single message latency in us */
uint64_t max_latency; /* Maximum single message latency in us */
uint64_t total_latency; /* Total latency of all messages in us */
uint64_t total_bytes; /* Total number of bytes transferred */
uint64_t total_messages; /* Total number of messages transferred */
uint64_t lastd_latency; /* Message latency at last perf display */
uint64_t lastd_bytes; /* Number of bytes at last perf display */
uint64_t lastd_messages; /* Number of messages at last perf display */
} *verbs_per_devs; /* Pointer to the first device's information */
/* Per-connection state.  incoming_connect() allocates one of these with
calloc() for every accepted connection, links it at the head of the
connections list while holding con_mutex, and stores its address in the
rdma_cm id context, the QP context and the wr_id of each posted receive. */
struct connection { /* Information on the rdma_cm connections */
struct connection *next; /* Pointer to the next connection */
struct connection *type_next; /* Next connection of the same type */
int state; /* Current state of the connection */
#define CONN_DISCONNECTED 0 /* Not currently connected */
#define CONN_CONNECTING 1 /* Connection in progress */
#define CONN_RUNNING 2 /* Connection completed */
int operation; /* Holds BENCH_SEND, BENCH_WRITE, BENCH_READ, */
/* or BENCH_RDWR for this connection */
struct sockaddr_in sin; /* Resolved address and socket number */
struct rdma_cm_id *cm_id; /* The rdma_cm ID for this connection */
struct timespec *sentts; /* Array of time values for sends done */
uint32_t *sentsz; /* Array of sizes for sends done */
int next_send_idx; /* Post send index into above arrays */
int next_wc_idx; /* Completion index into above arrays */
struct verbs_per_dev *vpd; /* The pd/cq/mr for this connection */
struct ibv_cq *cq; /* Completion queue for this QP if cqperqp */
struct ibv_ah *other_ah; /* UD target AH */
uint32_t other_qp_num; /* UD target QP number */
uint32_t other_qkey; /* UD target qkey */
uint32_t rkey; /* RDMA rkey for accessing passive side */
uint64_t remote_addr; /* RDMA passive side virtual address */
uint32_t send_queued; /* Total number of outstanding send WRs */
uint32_t bytes_queued; /* Total bytes in outstanding send WRs */
uint64_t min_latency; /* Minimum single message latency in us */
uint64_t max_latency; /* Maximum single message latency in us */
uint64_t total_latency; /* Total latency of all messages in us */
uint64_t total_bytes; /* Total number of bytes transferred */
uint64_t total_messages; /* Total number of messages transferred */
uint64_t lastd_latency; /* Message latency at last perf display */
uint64_t lastd_bytes; /* Number of bytes at last perf display */
uint64_t lastd_messages; /* Number of messages at last perf display */
} *connections; /* Pointer to first entry for all IDs */
struct connection listen0, listen1; /* Dummys for incoming connects */
/* The three counters below are printed by err_exit().  incoming_connect() */
/* changes them only while holding con_mutex. */
int conn_disconnected = 0; /* Count of QPs in CONN_DISCONNECTED state */
int conn_connecting = 0; /* Count of QPs in CONN_CONNECTING state */
int conn_running = 0; /* Count of QPs in CONN_RUNNING state */
ib_bench_conn_data conn_data; /* Same connection data for every connect */
pthread_mutex_t con_mutex = /* Lock multithread access to connection info */
PTHREAD_MUTEX_INITIALIZER;
pthread_t event_thread_id; /* Holds the thread ID for the event thread */
pthread_t display_thread_id; /* Holds the thread ID for display thread */
struct rdma_event_channel *event_chan; /* Single event channel for all */
int pagesize; /* Holds the page size for this system */
struct timespec completion_time; /* Time of day for completion loop start */
/* get_vpd_pd_cq_mr() registers [datamembase, datamembase + datamemsize) */
/* as one memory region per device; incoming_connect() posts that same */
/* area for every receive. */
void *datamembase; /* Points to allocated memory area */
off_t datamemsize; /* Size of allocated memory area */
struct timespec startup; /* The time when transfers started */
int first_incoming_connection = 1; /* The first connection is accepted */
/* with its parameters, the rest are verified to be */
/* using the same parameters. */
/* Function to print an error message and exit */
void err_exit(const char *text) {
not_exiting = 0;
fprintf(stderr,
"\nConnections: Disconnected %d Connecting %d Running %d"
" Total %d\n",
conn_disconnected, conn_connecting, conn_running,
conn_disconnected + conn_connecting + conn_running);
fprintf(stderr, "%s side: %s\n", connect_to_1 ? "Client" : "Server",
text);
/* This is a good place for a breakpoint if you are trying to track
down the reason for an error exit */
exit(1);
}
/* SIGINT handler: leave through the common error exit path so the
   connection counters are reported before the process ends. */
void catch_sigint(int signo) {
	(void)signo;	/* The signal number itself is not used */
	err_exit("Interrupted...");
}
/* Write the command line help to stderr: a usage line built from
   prog_name, the option summary, and the program version string. */
void usage() {
	/* Option summary, kept separate from the format string so that the
	   fprintf() call below stays readable. */
	static const char option_help[] =
		"Options:\n"
		"-p --port N The rdma_cm port number, Default is 18516\n"
		" --bind0 IP A local IP address to bind to for first HCA\n"
		" --bind1 IP A local IP address to bind to for second HCA\n"
		"-m --mtu N The MTU to use, Default is 2048\n"
		"-c --connection RC/UD The connection type, Default RC\n"
		"-o --operation SEND/WRITE/READ The type of test to run, Default SEND\n"
		"-l --latency Show performance as latency instead of bandwidth\n"
		"-s --size N The message size to use, Default is 65536\n"
		"-a --all Use all available message sizes in powers of 2\n"
		"-t --tx-depth N The depth of the transmit queue, Default 250\n"
		"-r --rx-depth N The depth of the receive queue, Default 500\n"
		"-u --qp-timeout N The timeout for the queue pairs\n"
		"-n --number N The number of queue pairs to test, Default 1\n"
		"-d --duration N The time to run the tests in seconds, Default forever\n"
		" --report N The time between performance reports, Default 6 seconds\n"
		" --cqperqp Create one CQ for each QP instead of one per device\n"
		" --max_rd_atomic N Responder resource control, Default 4\n"
		" --max_dest_rd_atomic N Responder resource control, Default 4\n"
		" --min_rnr_timer N Minimum RNR NAK timer\n"
		" --timeout N Local ack timeout\n"
		" --retry_cnt N Retry count, Default 6\n"
		" --rnr_retry N RNR retry, Default 6\n"
		"-b --bandwidth F The bandwidth limit for each connection in MB/second\n"
		"-x --xsp_hop host/port The address of the XSP daemon\n"
		"-h --help Display the usage help text\n\n"
		"Note: The --connection type must match on active and passive sides.\n"
		" The optional --bind0 and --bind1 are local to each side.\n"
		" All other parameters are ignored on the passive side, as they are\n"
		" overwritten by the active's incoming connection.\n\n";

	fprintf(stderr,
		"\nUsage: %s [options] [serverIP1 [serverIP2]]\n%sVersion %s\n\n",
		prog_name, option_help, IB_BENCH_VERSION_STRING);
}
/* Reject the command line: show the usage text, then terminate through
   err_exit().  Does not return. */
void usage_exit() {
	static const char complaint[] = "Invalid command line option/value\n";

	usage();
	err_exit(complaint);
}
/* Report a failure together with the current errno and terminate.

   text   Prefix handed to perror(), which appends the errno description.

   The exiting flag is cleared first, then the message is printed and the
   process ends through err_exit().  Does not return. */
void perror_exit(const char *text) {
	not_exiting = 0;	/* Stop loops that test this flag */
	perror(text);
	err_exit("Giving up...");
}
/* Print one line on stdout made of the local date and time
   ("YYYY-MM-DD HH:MM:SS"), a space, and the given text, then flush
   stdout so the line appears immediately. */
void timestamp_msg(char *text) {
	time_t now = time(NULL);
	char stamp[20];		/* 19 characters of date/time plus the NUL */

	strftime(stamp, sizeof(stamp), "%Y-%m-%d %H:%M:%S", localtime(&now));
	printf("%s %s\n", stamp, text);
	fflush(stdout);
}
/* Clip a requested completion queue size to what the device supports.
   The request is carried as long long so the caller can multiply the
   per-QP depth by the number of QPs without overflowing an int.
   A negative request is returned as -1; any value that reaches the last
   return is between 0 and max_cqe, so it fits an int. */
static int clip_cq_size(long long wanted, int max_cqe) {
	if (wanted > max_cqe)
		return max_cqe;
	if (wanted < 0)
		return -1;	/* NOTE(review): assumes ibv_create_cq rejects this */
	return (int)wanted;
}

/* This function will find or create the struct verbs_per_dev needed for the
   device that a connection is using.  If it does not exist, it will be
   created, along with its protection domain and a memory region covering
   the whole data area.  Depending on cqperqp, there will be one CQ created
   for the device (on first use), or a CQ will be created for the connection.

   cn   The connection; cn->cm_id must already be set.  On return cn->vpd
        points at the device entry and, with cqperqp, cn->cq is the new CQ.

   Any failure terminates the program through err_exit(). */
void get_vpd_pd_cq_mr(struct connection *cn) {
	struct verbs_per_dev *vpd;
	long long wanted;	/* CQ entries wanted, before clipping */

	/* Look for an entry already created for this connection's device */
	for (vpd = verbs_per_devs; vpd; vpd = vpd->next)
		if (cn->cm_id->verbs == vpd->verbs_dev) break;

	if (vpd == NULL) {
		/* First connection on this device: build its shared state */
		vpd = calloc(1, sizeof(*vpd));
		if (vpd == NULL)
			err_exit("get_vpd_pd_cq_mr() cannot allocate memory");
		vpd->verbs_dev = cn->cm_id->verbs;
		if (ibv_query_device(vpd->verbs_dev, &(vpd->da)))
			err_exit("get_vpd_pd_cq_mr() cannot ibv_query_device");
		vpd->dev_name =
			strdup(ibv_get_device_name(vpd->verbs_dev->device));
		if (vpd->dev_name == NULL)
			err_exit("get_vpd_pd_cq_mr() cannot allocate memory");
		vpd->pd = ibv_alloc_pd(vpd->verbs_dev);
		if (vpd->pd == NULL)
			err_exit("get_vpd_pd_cq_mr() cannot allocate PD");
		vpd->mr = ibv_reg_mr(vpd->pd, datamembase, datamemsize,
				     IBV_ACCESS_REMOTE_WRITE
				     | IBV_ACCESS_LOCAL_WRITE
				     | IBV_ACCESS_REMOTE_READ);
		if (vpd->mr == NULL)
			err_exit("get_vpd_pd_cq_mr() cannot create MR");
		/* Start at the largest value so the first sample replaces it */
		vpd->min_latency = 0x7FFFFFFFFFFFFFFF;
		vpd->next = verbs_per_devs;
		verbs_per_devs = vpd;
	}

	if (connect_to_0)
		wanted = arg_tx_depth;	/* active, sending side */
	else
		wanted = arg_rx_depth;	/* passive, receiving side */
	wanted = wanted + 4;		/* Allow management packets */

	if (arg_cqperqp == 0) {		/* One CQ for all QPs on this device */
		if (vpd->cq == NULL) {
			/* Times number QPs.  Done in 64 bits: on the passive
			   side both factors come from the peer, and an int
			   multiply could overflow before the clip. */
			vpd->cq = ibv_create_cq(vpd->verbs_dev,
					clip_cq_size(wanted * arg_number,
						     vpd->da.max_cqe),
					NULL, NULL, 0);
			if (vpd->cq == NULL)
				err_exit("get_vpd_pd_cq_mr() cannot create CQ");
		}
	}
	else {				/* A private CQ for this connection */
		cn->cq = ibv_create_cq(vpd->verbs_dev,
				clip_cq_size(wanted, vpd->da.max_cqe),
				NULL, NULL, 0);
		if (cn->cq == NULL)
			err_exit("get_vpd_pd_cq_mr() cannot create CQ");
	}
	cn->vpd = vpd;
}
/* This function accepts an incoming connection */
void incoming_connect(struct rdma_cm_event e, void *private_data) {
ib_bench_conn_data *cdata = private_data;
struct connection *cn;
struct rdma_conn_param conp;
struct ibv_qp_init_attr iqpa;
struct ibv_recv_wr wr, *bad_wr;
struct ibv_sge sge;
int i, j;
if (e.param.conn.private_data_len < sizeof(ib_bench_conn_data))
err_exit("Incoming connect with too little connect data");
if (cdata->version != IB_BENCH_VERSION)
err_exit("Incoming connect with wrong version number");
if (first_incoming_connection) {
first_incoming_connection = 0;
arg_all = cdata->arg_all;
arg_latency = cdata->arg_latency;
arg_operation = cdata->arg_operation;
arg_max_dest_rd_atomic = cdata->arg_max_dest_rd_atomic;
arg_max_rd_atomic = cdata->arg_max_rd_atomic;
arg_min_rnr_timer = cdata->arg_min_rnr_timer;
arg_mtu = cdata->arg_mtu;
arg_retry_cnt = cdata->arg_retry_cnt;
arg_rnr_retry = cdata->arg_rnr_retry;
arg_rx_depth = cdata->arg_rx_depth;
arg_timeout = cdata->arg_timeout;
arg_tx_depth = cdata->arg_tx_depth;
arg_bandwidth = cdata->arg_bandwidth;
arg_number = cdata->arg_number;
arg_qp_timeout = cdata->arg_qp_timeout;
arg_size = cdata->arg_size;
}
else if (arg_all != cdata->arg_all
|| arg_latency != cdata->arg_latency
|| arg_operation != cdata->arg_operation
|| arg_max_dest_rd_atomic != cdata->arg_max_dest_rd_atomic
|| arg_max_rd_atomic != cdata->arg_max_rd_atomic
|| arg_min_rnr_timer != cdata->arg_min_rnr_timer
|| arg_mtu != cdata->arg_mtu
|| arg_retry_cnt != cdata->arg_retry_cnt
|| arg_rnr_retry != cdata->arg_rnr_retry
|| arg_rx_depth != cdata->arg_rx_depth
|| arg_timeout != cdata->arg_timeout
|| arg_tx_depth != cdata->arg_tx_depth
|| arg_bandwidth != cdata->arg_bandwidth
|| arg_number != cdata->arg_number
|| arg_qp_timeout != cdata->arg_qp_timeout
|| arg_size != cdata->arg_size) {
err_exit("Incoming connect with different parameters");
}
cn = calloc(1, sizeof(struct connection));
if (cn == NULL)
err_exit("Cannot allocate connection structure");
cn->state = CONN_CONNECTING;
pthread_mutex_lock(&con_mutex);
conn_connecting++;
cn->next = connections;
connections = cn;
pthread_mutex_unlock(&con_mutex);
memcpy(&(cn->sin), rdma_get_peer_addr(e.id),
sizeof(struct sockaddr_in));
cn->cm_id = e.id;
e.id->context = cn;
if (cn->sin.sin_port != rdma_get_dst_port(e.id))
err_exit("incoming_connect() port number mismatch");
/* Setup normal PD, CQ, and MR for the device. These
are common to all connections */
get_vpd_pd_cq_mr(cn);
/* Create the queue pair for receiving the messages */
memset(&iqpa, 0, sizeof(iqpa));
/* incoming connection, this is passive side, so this only */
/* receives data */
iqpa.cap.max_send_wr = 1;
iqpa.cap.max_recv_wr = arg_rx_depth;
iqpa.cap.max_send_sge = 1;
iqpa.cap.max_recv_sge = 1;
iqpa.qp_context = cn;
iqpa.sq_sig_all = 1;
if (arg_connection == BENCH_CONN_RC)
iqpa.qp_type = IBV_QPT_RC;
else
iqpa.qp_type = IBV_QPT_UD;
if (arg_cqperqp == 0) { /* One CQ for all QPs on this device */
iqpa.send_cq = cn->vpd->cq;
iqpa.recv_cq = cn->vpd->cq;
}
else {
iqpa.send_cq = cn->cq;
iqpa.recv_cq = cn->cq;
}
j = rdma_create_qp(cn->cm_id, cn->vpd->pd, &iqpa);
if (j) {
if (j != -1) errno = j;
perror_exit("Cannot create incoming QP");
}
/* Post the receive buffers to the QP */
sge.addr = (uint64_t)datamembase;
sge.length = datamemsize;
sge.lkey = cn->vpd->mr->lkey;
for (i = 0; i < arg_rx_depth; i++) {
memset(&wr, 0, sizeof(wr));
wr.wr_id = (uint64_t)cn;
wr.next = NULL;
wr.sg_list = &sge;
wr.num_sge = 1;
j = ibv_post_recv(cn->cm_id->qp, &wr, &bad_wr);
if (j) perror_exit("Cannot ibv_post_recv for connection");
}
/* Now we accept the connection */
/* incoming connection, this is passive side, so the */
/* values are reversed from the outgoing connection */
memset(&conp, 0, sizeof(conp));
conp.retry_count = arg_retry_cnt;
conp.rnr_retry_count = arg_rnr_retry;
conp.responder_resources = arg_max_dest_rd_atomic;
conp.initiator_depth = arg_max_rd_atomic;
conp.private_data = cdata;
conp.private_data_len = sizeof(ib_bench_conn_data);
if (arg_connection == BENCH_CONN_UD) {
struct ibv_qp_attr qpa;
struct ibv_qp_init_attr qpia;
j = ibv_query_qp(cn->cm_id->qp, &qpa, IBV_QP_QKEY, &qpia);
if (j) {
errno = j;
perror_exit("Cannot ibv_query_qp for connection");
}
cdata->ud_send_qkey = qpa.qkey;
cdata->ud_send_qpn = cn->cm_id->qp->qp_num;
cn->state = CONN_RUNNING; /* For UD, we will not see */
pthread_mutex_lock(&con_mutex); /* RDMA_CM_EVENT_ESTABLISHED */
conn_connecting--;
conn_running++;
pthread_mutex_unlock(&con_mutex);
}
else {
cdata->rkey = cn->vpd->mr->rkey;
cdata->remote_addr_low = (uint32_t)((uint64_t)datamembase);
cdata->remote_addr_high = (uint32_t)((uint64_t)datamembase>>32);
}
j = rdma_accept(cn->cm_id, &conp);
if (j) {
if (j != -1) errno = j;
perror_exit("Cannot rdma_accept incoming connection");
}
}
/* This function is the top level for the event thread that processes all
rdma_cm events posted to this process. Each rdma_cm id is created with a
context value that points to the struct connection describing the state
of that rdma_cm connection; for a connect request, the context of the
listening id is used instead. That struct connection is examined to
determine the proper course of action for the particular event
reported. */
void * event_thread(void *ignored) {
struct rdma_cm_event *event;
struct rdma_cm_event e;
void *private_data = 0;
size_t max_private_data_len = 0;
struct connection *cn;
struct ibv_qp_init_attr iqpa;
struct rdma_conn_param conp;
struct connection **lh;
int j;
/* Endless loop gathering whatever events are presented */
while (not_exiting && !rdma_get_cm_event(event_chan, &event)) {
/* Make a copy of the event and ack it so we won't hang */
memcpy(&e, event, sizeof(e));
if (e.param.conn.private_data_len > max_private_data_len) {
if (private_data) free(private_data);
max_private_data_len = e.param.conn.private_data_len;
private_data = malloc(max_private_data_len);
if (private_data == NULL)
err_exit("malloc() failed in event_thread()");
}
if (e.param.conn.private_data_len)
memcpy(private_data, e.param.conn.private_data,
e.param.conn.private_data_len);
rdma_ack_cm_event(event);
/* Find our struct connection and save last event/status */
cn = e.id->context;
if (e.event == RDMA_CM_EVENT_CONNECT_REQUEST)
cn = e.listen_id->context;
/* Dispatch based on the event */
switch (e.event) {
case RDMA_CM_EVENT_ADDR_RESOLVED: /* event 0 */
/* rdma_resolve_addr() work is done */
j = rdma_resolve_route(cn->cm_id, 1000);
if (j) {
if (j != -1) errno = j;
perror_exit(
"Cannot start rdma_resolve_route process");
}
break;
case RDMA_CM_EVENT_ADDR_ERROR: /* event 1 */
/* rdma_resolve_addr() work has failed. We must destroy the
rdma_cm ID used, as that is the only way to reset it. */
err_exit("Unexpected RDMA_CM_EVENT_ADDR_ERROR");
break;
case RDMA_CM_EVENT_ROUTE_RESOLVED: /* event 2 */
/* rdma_resolve_route() work is done */
/* Setup normal PD, CQ, and MR for the device. These
are common to all connections, and the MR maps the
shared memory segment. */
get_vpd_pd_cq_mr(cn);
/* Create the queue pair */
/* This is the active side of things, the sender */
memset(&iqpa, 0, sizeof(iqpa));
iqpa.cap.max_send_wr = arg_tx_depth;
iqpa.cap.max_recv_wr = 1;
iqpa.cap.max_send_sge = 1;
iqpa.cap.max_recv_sge = 1;
iqpa.qp_context = cn;
iqpa.sq_sig_all = 1;
if (arg_connection == BENCH_CONN_RC)
iqpa.qp_type = IBV_QPT_RC;
else
iqpa.qp_type = IBV_QPT_UD;
if (arg_cqperqp == 0) { /* One CQ for all QPs */
iqpa.send_cq = cn->vpd->cq;
iqpa.recv_cq = cn->vpd->cq;
}
else {
iqpa.send_cq = cn->cq;
iqpa.recv_cq = cn->cq;
}
j = rdma_create_qp(cn->cm_id, cn->vpd->pd, &iqpa);
if (j) {
if (j != -1) errno = j;
perror_exit(
"Cannot create QP");
}
/* Now we make the connection to the passive side */
memset(&conp, 0, sizeof(conp));
conp.retry_count = arg_retry_cnt;
conp.rnr_retry_count = arg_rnr_retry;
conp.responder_resources = arg_max_rd_atomic;
conp.initiator_depth = arg_max_dest_rd_atomic;
conp.private_data = &conn_data;
conp.private_data_len = sizeof(conn_data);
j = rdma_connect(cn->cm_id, &conp);
if (j) {
if (j != -1) errno = j;
perror_exit(
"Cannot start rdma_connect process");
}
break;
case RDMA_CM_EVENT_ROUTE_ERROR: /* event 3 */
/* rdma_resolve_route() work has failed. We must destroy the
rdma_cm ID used, as that is the only way to reset it. */
err_exit("RDMA_CM_EVENT_ROUTE_ERROR");
break;
case RDMA_CM_EVENT_CONNECT_REQUEST: /* event 4 */
/* Incoming connection request */
#ifdef SHOW_INCOMING_CONNECTS
{
struct sockaddr *a = rdma_get_peer_addr(e.id);
uint16_t d = rdma_get_dst_port(e.id); /* remote port */
uint16_t s = rdma_get_src_port(e.id); /* local port */
uint16_t f = a->sa_family;
struct sockaddr_in *i = (struct sockaddr_in *)a;
char *aa = inet_ntoa(i->sin_addr);
printf("Connect from AF %d addr %s port %d"
" coming to my port %d\n",
f, aa, ntohs(d), ntohs(s));
}
#endif
incoming_connect(e, private_data);
break;
case RDMA_CM_EVENT_CONNECT_RESPONSE: /* event 5 */
/* rdma_connect() work is done and there is no QP yet */
/* This should never happen, since we create the QP
before calling rdma_connect() */
err_exit("Unexpected RDMA_CM_EVENT_CONNECT_RESPONSE");
case RDMA_CM_EVENT_CONNECT_ERROR: /* event 6 */
/* Connection establishment failed */
/* This is very unusual, and there is no standard test
case developed, but handle it gracefully. */
err_exit("RDMA_CM_EVENT_CONNECT_ERROR");
break;
case RDMA_CM_EVENT_UNREACHABLE: /* event 7 */
/* rdma_connect() failed with unreachable or unresponsive */
/* This can also happen when attempting rdma_accept() */