21 #include <linux/if_link.h> 22 #include <linux/if_ether.h> 34 0x2c, 0xc6, 0x81, 0xd1,
35 0x5b, 0xdb, 0xf4, 0xf7,
36 0xfc, 0xa2, 0x83, 0x19,
37 0xdb, 0x1a, 0x3e, 0x94,
38 0x6b, 0x9e, 0x38, 0xd9,
39 0x2c, 0x9c, 0x03, 0xd1,
40 0xad, 0x99, 0x44, 0xa7,
41 0xd9, 0x56, 0x3d, 0x59,
42 0x06, 0x3c, 0x25, 0xf3,
43 0xfc, 0x1f, 0xdc, 0x2a,
48 #define rdma_log__(lvl, dev, f, ...) \ 50 vlib_log((lvl), rdma_main.log_class, "%s: " f, \ 51 &(dev)->name, ##__VA_ARGS__); \ 54 #define rdma_log(lvl, dev, f, ...) \ 55 rdma_log__((lvl), (dev), "%s (%d): " f, strerror(errno), errno, ##__VA_ARGS__) 57 static struct ibv_flow *
62 struct ibv_flow *
flow;
63 struct raw_eth_flow_attr
65 struct ibv_flow_attr attr;
66 struct ibv_flow_spec_eth spec_eth;
67 } __attribute__ ((packed)) fa;
69 memset (&fa, 0,
sizeof (fa));
70 fa.attr.num_of_specs = 1;
72 fa.attr.flags =
flags;
73 fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
74 fa.spec_eth.size =
sizeof (
struct ibv_flow_spec_eth);
76 memcpy (fa.spec_eth.val.dst_mac, mac, sizeof (fa.spec_eth.val.dst_mac));
77 memcpy (fa.spec_eth.mask.dst_mac, mask, sizeof (fa.spec_eth.mask.dst_mac));
81 fa.spec_eth.val.ether_type = ether_type;
82 fa.spec_eth.mask.ether_type = 0xffff;
85 flow = ibv_create_flow (qp, &fa.attr);
87 rdma_log (VLIB_LOG_LEVEL_ERR, rd,
"ibv_create_flow() failed");
97 if (ibv_destroy_flow (*flow))
99 rdma_log (VLIB_LOG_LEVEL_ERR, rd,
"ibv_destroy_flow() failed");
126 rd->
flags |= RDMA_DEVICE_F_PROMISC;
147 ntohs (ETH_P_IPV6), 0);
150 IBV_FLOW_ATTR_FLAGS_DONT_TRAP
157 IBV_FLOW_ATTR_FLAGS_DONT_TRAP
164 rd->
flags &= ~RDMA_DEVICE_F_PROMISC;
185 rdma_log__ (VLIB_LOG_LEVEL_ERR, rd,
"MTU change not supported");
205 rdma_log__ (VLIB_LOG_LEVEL_ERR, rd,
"unknown flag %x requested", flags);
212 struct ibv_port_attr attr;
216 if (ibv_query_port (rd->
ctx, port, &attr))
226 case IBV_PORT_ACTIVE:
227 case IBV_PORT_ACTIVE_DEFER:
228 rd->
flags |= RDMA_DEVICE_F_LINK_UP;
233 rd->
flags &= ~RDMA_DEVICE_F_LINK_UP;
239 switch (attr.active_width)
254 switch (attr.active_speed)
291 struct ibv_async_event event;
292 ret = ibv_get_async_event (rd->
ctx, &event);
296 switch (event.event_type)
298 case IBV_EVENT_PORT_ACTIVE:
301 case IBV_EVENT_PORT_ERR:
304 case IBV_EVENT_DEVICE_FATAL:
305 rd->
flags &= ~RDMA_DEVICE_F_LINK_UP;
310 rdma_log__ (VLIB_LOG_LEVEL_ERR, rd,
"unhandeld RDMA async event %i",
315 ibv_ack_async_event (&event);
326 ret = fcntl (rd->
ctx->async_fd, F_GETFL);
330 ret = fcntl (rd->
ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
383 #define _(fn, arg) if (arg) \ 386 if ((rv = fn (arg))) \ 387 rdma_log (VLIB_LOG_LEVEL_DEBUG, rd, #fn "() failed (rv = %d)", rv); \ 394 _(ibv_dereg_mr, rd->
mr);
397 _(ibv_destroy_qp, txq->
qp);
398 _(ibv_destroy_cq, txq->
cq);
402 _(ibv_destroy_wq, rxq->
wq);
403 _(ibv_destroy_cq, rxq->
cq);
406 _(ibv_destroy_qp, rd->
rx_qp6);
407 _(ibv_destroy_qp, rd->
rx_qp4);
408 _(ibv_dealloc_pd, rd->
pd);
409 _(ibv_close_device, rd->
ctx);
423 u8 no_multi_seg,
u16 max_pktlen)
426 struct ibv_wq_init_attr wqia;
427 struct ibv_cq_init_attr_ex cqa = { };
428 struct ibv_wq_attr wqa;
429 struct ibv_cq_ex *cqex;
430 struct mlx5dv_wq_init_attr dv_wqia = { };
431 int is_mlx5dv = ! !(rd->
flags & RDMA_DEVICE_F_MLX5DV);
432 int is_striding = ! !(rd->
flags & RDMA_DEVICE_F_STRIDING_RQ);
444 struct mlx5dv_cq_init_attr dvcq = { };
445 dvcq.comp_mask = MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
446 dvcq.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
448 if ((cqex = mlx5dv_create_cq (rd->
ctx, &cqa, &dvcq)) == 0)
453 if ((cqex = ibv_create_cq_ex (rd->
ctx, &cqa)) == 0)
457 rxq->
cq = ibv_cq_ex_to_cq (cqex);
459 memset (&wqia, 0,
sizeof (wqia));
460 wqia.wq_type = IBV_WQT_RQ;
461 wqia.max_wr = n_desc;
470 uword data_seg_log2_sz =
472 rxq->
buf_sz = 1 << data_seg_log2_sz;
483 int max_chain_log_sz =
484 max_pktlen ?
max_log2 ((max_pktlen /
487 max_chain_log_sz =
clib_max (max_chain_log_sz, 3);
488 wqia.max_sge = 1 << max_chain_log_sz;
489 dv_wqia.comp_mask = MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
490 dv_wqia.striding_rq_attrs.two_byte_shift_en = 0;
491 dv_wqia.striding_rq_attrs.single_wqe_log_num_of_strides =
493 dv_wqia.striding_rq_attrs.single_stride_log_num_of_bytes =
495 wqia.max_wr >>= max_chain_log_sz;
512 max_pktlen ? (max_pktlen /
515 int max_chain_log_sz =
max_log2 (max_chain_sz);
516 wqia.max_sge = 1 << max_chain_log_sz;
523 if ((rxq->
wq = mlx5dv_create_wq (rd->
ctx, &wqia, &dv_wqia)))
525 rxq->
wq->events_completed = 0;
526 pthread_mutex_init (&rxq->
wq->mutex, NULL);
527 pthread_cond_init (&rxq->
wq->cond, NULL);
532 else if ((rxq->
wq = ibv_create_wq (rd->
ctx, &wqia)) == 0)
535 memset (&wqa, 0,
sizeof (wqa));
536 wqa.attr_mask = IBV_WQ_ATTR_STATE;
537 wqa.wq_state = IBV_WQS_RDY;
538 if (ibv_modify_wq (rxq->
wq, &wqa) != 0)
543 struct mlx5dv_obj obj = { };
544 struct mlx5dv_cq dv_cq;
545 struct mlx5dv_rwq dv_rwq;
552 obj.rwq.in = rxq->
wq;
553 obj.rwq.out = &dv_rwq;
555 if ((mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ)))
563 rxq->
cq_db = (
volatile u32 *) dv_cq.dbrec;
564 rxq->
cqn = dv_cq.cqn;
567 rxq->
wq_db = (
volatile u32 *) dv_rwq.dbrec;
571 qw0 = clib_host_to_net_u32 (rxq->
buf_sz);
573 qw0 |= (
u64) clib_host_to_net_u32 (rd->
lkey) << 32;
574 qw0_nullseg |= (
u64) clib_host_to_net_u32 (rd->
lkey) << 32;
603 for (
int i = 0;
i < n_desc;
i++)
614 struct ibv_rwq_ind_table_init_attr rwqia;
615 struct ibv_qp_init_attr_ex qpia;
616 struct ibv_wq **ind_tbl;
620 &&
"rxq number should be a power of 2");
625 memset (&rwqia, 0,
sizeof (rwqia));
627 rwqia.ind_tbl = ind_tbl;
632 memset (&qpia, 0,
sizeof (qpia));
633 qpia.qp_type = IBV_QPT_RAW_PACKET;
635 IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_IND_TABLE |
636 IBV_QP_INIT_ATTR_RX_HASH;
642 qpia.rx_hash_conf.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ;
644 qpia.rx_hash_conf.rx_hash_fields_mask =
645 IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 | IBV_RX_HASH_SRC_PORT_TCP |
646 IBV_RX_HASH_DST_PORT_TCP;
647 if ((rd->
rx_qp4 = ibv_create_qp_ex (rd->
ctx, &qpia)) == 0)
650 qpia.rx_hash_conf.rx_hash_fields_mask =
651 IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 | IBV_RX_HASH_SRC_PORT_TCP |
652 IBV_RX_HASH_DST_PORT_TCP;
653 if ((rd->
rx_qp6 = ibv_create_qp_ex (rd->
ctx, &qpia)) == 0)
666 struct ibv_qp_init_attr qpia;
667 struct ibv_qp_attr qpa;
676 if ((txq->
cq = ibv_create_cq (rd->
ctx, n_desc, NULL, NULL, 0)) == 0)
679 memset (&qpia, 0,
sizeof (qpia));
680 qpia.send_cq = txq->
cq;
681 qpia.recv_cq = txq->
cq;
682 qpia.cap.max_send_wr = n_desc;
683 qpia.cap.max_send_sge = 1;
684 qpia.qp_type = IBV_QPT_RAW_PACKET;
686 if ((txq->
qp = ibv_create_qp (rd->
pd, &qpia)) == 0)
689 memset (&qpa, 0,
sizeof (qpa));
690 qp_flags = IBV_QP_STATE | IBV_QP_PORT;
691 qpa.qp_state = IBV_QPS_INIT;
693 if (ibv_modify_qp (txq->
qp, &qpa, qp_flags) != 0)
696 memset (&qpa, 0,
sizeof (qpa));
697 qp_flags = IBV_QP_STATE;
698 qpa.qp_state = IBV_QPS_RTR;
699 if (ibv_modify_qp (txq->
qp, &qpa, qp_flags) != 0)
702 memset (&qpa, 0,
sizeof (qpa));
703 qp_flags = IBV_QP_STATE;
704 qpa.qp_state = IBV_QPS_RTS;
705 if (ibv_modify_qp (txq->
qp, &qpa, qp_flags) != 0)
711 if (rd->
flags & RDMA_DEVICE_F_MLX5DV)
714 struct mlx5dv_cq dv_cq;
715 struct mlx5dv_qp dv_qp;
716 struct mlx5dv_obj obj = { };
723 if (mlx5dv_init_obj (&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP))
734 || sizeof (
struct mlx5_cqe64) != dv_cq.cqe_size
735 || (
uword) dv_cq.buf % sizeof (
struct mlx5_cqe64))
751 mlx5dv_set_ctrl_seg (&tmpl->ctrl, 0, MLX5_OPCODE_SEND, 0,
755 mlx5dv_set_data_seg (&tmpl->dseg, 0, rd->
lkey, 0);
776 if ((rd->
pd = ibv_alloc_pd (rd->
ctx)) == 0)
781 IBV_ACCESS_LOCAL_WRITE)) == 0)
790 IBV_ACCESS_LOCAL_WRITE)) == 0)
803 for (i = 0; i < rxq_num; i++)
838 vlib_pci_addr_t pci_addr;
839 struct ibv_device **dev_list;
851 args->
rv = VNET_API_ERROR_INVALID_VALUE;
861 args->
rv = VNET_API_ERROR_INVALID_VALUE;
863 "between %i and 65535",
868 dev_list = ibv_get_device_list (&n_devs);
873 "no RDMA devices available. Is the ib_uverbs module loaded?");
878 s =
format (0,
"/sys/class/net/%s/device%c", args->
ifname, 0);
891 if (!args->
name || 0 == args->
name[0])
910 "invalid interface (only mlx5 supported for now)");
914 for (i = 0; i < n_devs; i++)
916 vlib_pci_addr_t
addr;
919 s =
format (s,
"%s/device%c", dev_list[i]->dev_path, 0);
924 if (addr.as_u32 != rd->
pci->
addr.as_u32)
927 if ((rd->
ctx = ibv_open_device (dev_list[i])))
933 struct mlx5dv_context mlx5dv_attrs = { };
934 mlx5dv_attrs.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
936 if (mlx5dv_query_device (rd->
ctx, &mlx5dv_attrs) == 0)
938 uword data_seg_log2_sz =
941 if ((mlx5dv_attrs.flags & MLX5DV_CONTEXT_FLAGS_CQE_V1))
942 rd->
flags |= RDMA_DEVICE_F_MLX5DV;
947 && data_seg_log2_sz <=
948 mlx5dv_attrs.striding_rq_caps.max_single_stride_log_num_of_bytes
949 && data_seg_log2_sz >=
950 mlx5dv_attrs.striding_rq_caps.min_single_stride_log_num_of_bytes
952 mlx5dv_attrs.striding_rq_caps.min_single_wqe_log_num_of_strides
954 mlx5dv_attrs.striding_rq_caps.max_single_wqe_log_num_of_strides)
955 rd->
flags |= RDMA_DEVICE_F_STRIDING_RQ;
962 "supported on this interface");
999 ibv_free_device_list (dev_list);
1001 args->
rv = VNET_API_ERROR_INVALID_INTERFACE;
1022 if (rd->
flags & RDMA_DEVICE_F_ERROR)
1029 rd->
flags |= RDMA_DEVICE_F_ADMIN_UP;
1034 rd->
flags &= ~RDMA_DEVICE_F_ADMIN_UP;
1061 .name =
"RDMA interface",
vlib_log_class_t vlib_log_register_class(char *class, char *subclass)
u32 flags
buffer flags: VLIB_BUFFER_FREE_LIST_INDEX_MASK: bits used to store free list index, VLIB_BUFFER_IS_TRACED: trace this buffer.
volatile u32 * dv_sq_dbrec
struct mlx5_cqe64 * dv_cq_cqes
#define vec_foreach_index(var, v)
Iterate over vector indices.
static u32 rdma_dev_set_ucast(rdma_device_t *rd)
__clib_export u8 * clib_sysfs_link_to_name(char *link)
rdma_mlx5_wqe_t * dv_sq_wqes
vl_api_wireguard_peer_flags_t flags
void ethernet_delete_interface(vnet_main_t *vnm, u32 hw_if_index)
struct ibv_flow * flow_mcast6
static u32 rdma_rxq_destroy_flow(const rdma_device_t *rd, struct ibv_flow **flow)
vnet_main_t * vnet_get_main(void)
#define pool_get_zero(P, E)
Allocate an object E from a pool P and zero it.
#define rdma_log(lvl, dev, f,...)
format_function_t format_rdma_device
#define RDMA_RXQ_MAX_CHAIN_LOG_SZ
static u32 rdma_dev_set_promisc(rdma_device_t *rd)
vlib_pci_device_info_t * pci
clib_memset(h->entries, 0, sizeof(h->entries[0]) *entries)
static clib_error_t * rdma_rxq_init(vlib_main_t *vm, rdma_device_t *rd, u16 qid, u32 n_desc, u8 no_multi_seg, u16 max_pktlen)
static clib_error_t * rdma_rxq_finalize(vlib_main_t *vm, rdma_device_t *rd)
u8 opcode_cqefmt_se_owner
static vnet_hw_interface_t * vnet_get_hw_interface(vnet_main_t *vnm, u32 hw_if_index)
#define RDMA_TXQ_DV_INVALID_ID
static clib_error_t * rdma_dev_init(vlib_main_t *vm, rdma_device_t *rd, rdma_create_if_args_t *args)
volatile u32 * dv_cq_dbrec
u32 per_interface_next_index
static void vlib_pci_free_device_info(vlib_pci_device_info_t *di)
vlib_buffer_main_t * buffer_main
#define vec_validate_aligned(V, I, A)
Make sure vector is long enough for given index (no header, specified alignment)
#define ETHERNET_INTERFACE_FLAG_DEFAULT_L3
clib_error_t * vnet_hw_interface_set_flags(vnet_main_t *vnm, u32 hw_if_index, vnet_hw_interface_flags_t flags)
static uword vlib_node_add_next(vlib_main_t *vm, uword node, uword next_node)
static uword min_log2(uword x)
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
clib_file_function_t * read_function
static vnet_sw_interface_t * vnet_get_hw_sw_interface(vnet_main_t *vnm, u32 hw_if_index)
vlib_log_class_t log_class
static void rdma_async_event_cleanup(rdma_device_t *rd)
struct ibv_flow * flow_ucast6
#define rdma_log__(lvl, dev, f,...)
rdma_per_thread_data_t * per_thread_data
VNET_DEVICE_CLASS(af_xdp_device_class)
struct ibv_flow * flow_mcast4
#define VLIB_INIT_FUNCTION(x)
static void rdma_set_interface_next_node(vnet_main_t *vnm, u32 hw_if_index, u32 node_index)
static uword sysfs_path_to_pci_addr(char *path, vlib_pci_addr_t *addr)
#define vec_new(T, N)
Create new vector of given type and length (unspecified alignment, no header).
description fragment has unexpected format
vnet_hw_interface_flags_t flags
#define vec_elt_at_index(v, i)
Get vector value at index i checking that i is in bounds.
#define clib_error_return(e, args...)
static void rdma_dev_cleanup(rdma_device_t *rd)
clib_file_main_t file_main
#define vlib_log_emerg(...)
static clib_error_t * rdma_txq_init(vlib_main_t *vm, rdma_device_t *rd, u16 qid, u32 n_desc)
vlib_pci_device_info_t * vlib_pci_get_device_info(vlib_main_t *vm, vlib_pci_addr_t *addr, clib_error_t **error)
#define pool_elt_at_index(p, i)
Returns pointer to element at given index.
static_always_inline void mac_address_from_bytes(mac_address_t *mac, const u8 *bytes)
clib_error_t * rdma_init(vlib_main_t *vm)
static clib_error_t * rdma_interface_admin_up_down(vnet_main_t *vnm, u32 hw_if_index, u32 flags)
#define clib_error_return_unix(e, args...)
static u32 rdma_dev_change_mtu(rdma_device_t *rd)
#define pool_put(P, E)
Free an object E in pool P.
unformat_function_t unformat_vlib_pci_addr
static u8 rdma_rss_hash_key[]
static_always_inline u32 vlib_buffer_get_default_data_size(vlib_main_t *vm)
vlib_buffer_t buffer_template
#define RDMA_RXQ_LEGACY_MODE_MAX_CHAIN_SZ
struct ibv_rwq_ind_table * rx_rwq_ind_tbl
static void rdma_unregister_interface(vnet_main_t *vnm, rdma_device_t *rd)
struct ibv_flow * flow_ucast4
sll srl srl sll sra u16x4 i
#define vec_free(V)
Free vector's memory (no header).
#define ETHERNET_INTERFACE_FLAG_MTU
#define ETHERNET_INTERFACE_FLAG_ACCEPT_ALL
#define RDMA_TXQ_BUF_SZ(txq)
format_function_t format_rdma_device_name
void vnet_hw_interface_assign_rx_thread(vnet_main_t *vnm, u32 hw_if_index, u16 queue_id, uword thread_index)
static uword clib_file_add(clib_file_main_t *um, clib_file_t *template)
static void clib_file_del_by_index(clib_file_main_t *um, uword index)
static u32 rdma_flag_change(vnet_main_t *vnm, vnet_hw_interface_t *hw, u32 flags)
static vlib_main_t * vlib_get_main(void)
vnet_device_class_t rdma_device_class
static uword is_pow2(uword x)
u32 async_event_clib_file_index
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
u16 n_total_additional_segs
static void ethernet_mac_address_generate(u8 *mac)
clib_error_t * ethernet_register_interface(vnet_main_t *vnm, u32 dev_class_index, u32 dev_instance, const u8 *address, u32 *hw_if_index_return, ethernet_flag_change_function_t flag_change)
static uword max_log2(uword x)
VLIB buffer representation.
static struct ibv_flow * rdma_rxq_init_flow(const rdma_device_t *rd, struct ibv_qp *qp, const mac_address_t *mac, const mac_address_t *mask, u16 ether_type, u32 flags)
static void rdma_update_state(vnet_main_t *vnm, rdma_device_t *rd, int port)
#define foreach_rdma_tx_func_error
#define clib_error_free(e)
static char * rdma_tx_func_error_strings[]
clib_file_function_t * error_function
int vnet_hw_interface_unassign_rx_thread(vnet_main_t *vnm, u32 hw_if_index, u16 queue_id)
static clib_error_t * rdma_mac_change(vnet_hw_interface_t *hw, const u8 *old, const u8 *new)
static clib_error_t * rdma_register_interface(vnet_main_t *vnm, rdma_device_t *rd)
static clib_error_t * rdma_async_event_init(rdma_device_t *rd)
static vlib_thread_main_t * vlib_get_thread_main()
#define vec_foreach(var, vec)
Vector iterator.
static clib_error_t * rdma_async_event_error_ready(clib_file_t *f)
#define MLX5_ETH_L2_INLINE_HEADER_SIZE
#define vlib_log_err(...)
static clib_error_t * rdma_async_event_read_ready(clib_file_t *f)
void rdma_delete_if(vlib_main_t *vm, rdma_device_t *rd)
#define CLIB_CACHE_LINE_BYTES
void rdma_create_if(vlib_main_t *vm, rdma_create_if_args_t *args)
#define STATIC_ASSERT_SIZEOF(d, s)
static u8 vlib_buffer_pool_get_default_for_numa(vlib_main_t *vm, u32 numa_node)
volatile u8 ref_count
Reference count for this buffer.
__clib_export u8 * format_clib_error(u8 *s, va_list *va)
static void vnet_hw_interface_set_link_speed(vnet_main_t *vnm, u32 hw_if_index, u32 link_speed)
static void vnet_hw_interface_set_input_node(vnet_main_t *vnm, u32 hw_if_index, u32 node_index)
u32 ethernet_set_flags(vnet_main_t *vnm, u32 hw_if_index, u32 flags)