#include <linux/if_link.h>
#include <linux/if_ether.h>

static u8 rdma_rss_hash_key[] = {
  0x2c, 0xc6, 0x81, 0xd1,
  0x5b, 0xdb, 0xf4, 0xf7,
  0xfc, 0xa2, 0x83, 0x19,
  0xdb, 0x1a, 0x3e, 0x94,
  0x6b, 0x9e, 0x38, 0xd9,
  0x2c, 0x9c, 0x03, 0xd1,
  0xad, 0x99, 0x44, 0xa7,
  0xd9, 0x56, 0x3d, 0x59,
  0x06, 0x3c, 0x25, 0xf3,
  0xfc, 0x1f, 0xdc, 0x2a,
};
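/* 40-byte Toeplitz RSS hash key, handed to the device in
 * rdma_rxq_finalize() below. 40 bytes is the usual RSS key length (it
 * covers the largest hash input, an IPv6 4-tuple); the key value itself
 * is opaque to the driver. */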
#define rdma_log_debug(dev, f, ...)                                      \
{                                                                         \
  vlib_log (VLIB_LOG_LEVEL_DEBUG, rdma_main.log_class, "%U: " f,          \
            format_vlib_pci_addr, &(dev)->pci_addr, ##__VA_ARGS__);       \
};

static void
rdma_update_state (vnet_main_t * vnm, rdma_device_t * rd, int port)
{
  struct ibv_port_attr attr;
  if (ibv_query_port (rd->ctx, port, &attr))
    return;

  switch (attr.state)
    {
    case IBV_PORT_ACTIVE:	/* fallthrough */
    case IBV_PORT_ACTIVE_DEFER:
      rd->flags |= RDMA_DEVICE_F_LINK_UP;
      break;
    default:
      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
      break;
    }
  switch (attr.active_width)
    {
      /* ... */
    }

  switch (attr.active_speed)
    {
      /* ... */
    }
}
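/* ibv_query_port() reports link width and speed as enum bit values; the
 * two switches above (bodies elided) map them to a lane count and a
 * per-lane rate, whose product is the link speed reported to VPP through
 * vnet_hw_interface_set_link_speed(). */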
static clib_error_t *
rdma_async_event_read_ready (clib_file_t * f)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = pool_elt_at_index (rm->devices, f->private_data);
  struct ibv_async_event event;
  int ret;

  ret = ibv_get_async_event (rd->ctx, &event);
  if (ret < 0)
    return clib_error_return_unix (0, "ibv_get_async_event() failed");
  switch (event.event_type)
    {
    case IBV_EVENT_PORT_ACTIVE:	/* fallthrough */
    case IBV_EVENT_PORT_ERR:
      rdma_update_state (vnet_get_main (), rd, event.element.port_num);
      break;
    case IBV_EVENT_DEVICE_FATAL:
      rd->flags &= ~RDMA_DEVICE_F_LINK_UP;
      break;
    default:
      rdma_log_debug (rd, "Unhandled RDMA async event %i for device %U",
		      event.event_type, format_vlib_pci_addr, &rd->pci_addr);
      break;
    }

  ibv_ack_async_event (&event);
  return 0;
}
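/* libibverbs requires every event returned by ibv_get_async_event() to be
 * acknowledged with ibv_ack_async_event(); destroying an object blocks
 * while its events are outstanding, so the ack above must not be skipped. */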
static clib_error_t *
rdma_async_event_init (rdma_device_t * rd)
{
  int ret;

  /* switch the async event queue to non-blocking mode */
  ret = fcntl (rd->ctx->async_fd, F_GETFL);
  if (ret < 0)
    return clib_error_return_unix (0, "fcntl(F_GETFL) failed");
  ret = fcntl (rd->ctx->async_fd, F_SETFL, ret | O_NONBLOCK);
  if (ret < 0)
    return clib_error_return_unix (0, "fcntl(F_SETFL) failed");

  /* ... clib_file registration elided ... */
}
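/* The async fd must be non-blocking because it is polled from the VPP main
 * loop (registered via clib_file_add() with rdma_async_event_read_ready()
 * as the read callback) rather than read from a dedicated blocking thread. */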
static void
rdma_dev_cleanup (rdma_device_t * rd)
{
  rdma_rxq_t *rxq;
  rdma_txq_t *txq;

#define _(fn, arg) if (arg) \
  { \
    int rv; \
    if ((rv = fn (arg))) \
      rdma_log_debug (rd, #fn "() failed (rv = %d)", rv); \
  }

  _(ibv_dereg_mr, rd->mr);
  vec_foreach (txq, rd->txqs)
  {
    _(ibv_destroy_qp, txq->qp);
    _(ibv_destroy_cq, txq->cq);
  }
  vec_foreach (rxq, rd->rxqs)
  {
    _(ibv_destroy_wq, rxq->wq);
    _(ibv_destroy_cq, rxq->cq);
  }
  /* ... */
  _(ibv_destroy_qp, rd->rx_qp);
  _(ibv_dealloc_pd, rd->pd);
  _(ibv_close_device, rd->ctx);
#undef _
}
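/* Teardown order matters: verbs objects must be destroyed before the
 * protection domain they were allocated from (ibv_dealloc_pd() fails with
 * EBUSY otherwise), and the device context is closed last. The '_' macro
 * only logs failures, since there is no useful recovery during cleanup. */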
static clib_error_t *
rdma_rxq_init_flow (struct ibv_flow **flow, struct ibv_qp *qp,
		    const mac_address_t * mac, const mac_address_t * mask,
		    u32 flags)
{
  struct raw_eth_flow_attr
  {
    struct ibv_flow_attr attr;
    struct ibv_flow_spec_eth spec_eth;
  } __attribute__ ((packed)) fa;

  memset (&fa, 0, sizeof (fa));
  fa.attr.num_of_specs = 1;
  fa.attr.port = 1;
  fa.attr.flags = flags;
  fa.spec_eth.type = IBV_FLOW_SPEC_ETH;
  fa.spec_eth.size = sizeof (struct ibv_flow_spec_eth);

  memcpy (fa.spec_eth.val.dst_mac, mac, sizeof (fa.spec_eth.val.dst_mac));
  memcpy (fa.spec_eth.mask.dst_mac, mask, sizeof (fa.spec_eth.mask.dst_mac));

  if ((*flow = ibv_create_flow (qp, &fa.attr)) == 0)
    return clib_error_return_unix (0, "ibv_create_flow() failed");

  return 0;
}
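/* ibv_create_flow() expects the flow specs to be laid out contiguously in
 * memory immediately after struct ibv_flow_attr, hence the packed wrapper
 * struct. Bits set in spec_eth.mask select which bits of spec_eth.val must
 * match: an all-zero mask matches any destination MAC, an all-ones mask
 * matches only the exact address. */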
static clib_error_t *
rdma_rxq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
  rdma_rxq_t *rxq;
  struct ibv_wq_init_attr wqia;
  struct ibv_wq_attr wqa;

  vec_validate_aligned (rd->rxqs, qid, CLIB_CACHE_LINE_BYTES);
  rxq = vec_elt_at_index (rd->rxqs, qid);

  if ((rxq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
    return clib_error_return_unix (0, "ibv_create_cq() failed");

  memset (&wqia, 0, sizeof (wqia));
  wqia.wq_type = IBV_WQT_RQ;
  wqia.max_wr = n_desc;
  wqia.max_sge = 1;
  wqia.pd = rd->pd;
  wqia.cq = rxq->cq;
  if ((rxq->wq = ibv_create_wq (rd->ctx, &wqia)) == 0)
    return clib_error_return_unix (0, "ibv_create_wq() failed");

  memset (&wqa, 0, sizeof (wqa));
  wqa.attr_mask = IBV_WQ_ATTR_STATE;
  wqa.wq_state = IBV_WQS_RDY;
  if (ibv_modify_wq (rxq->wq, &wqa) != 0)
    return clib_error_return_unix (0, "ibv_modify_wq() failed");

  return 0;
}
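/* A work queue is created in the RESET state; it must be moved to RDY
 * through ibv_modify_wq() before it can post receives, so the transition
 * above is what actually arms the queue. */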
static clib_error_t *
rdma_rxq_finalize (vlib_main_t * vm, rdma_device_t * rd)
{
  struct ibv_rwq_ind_table_init_attr rwqia;
  struct ibv_qp_init_attr_ex qpia;
  struct ibv_wq **ind_tbl;
  u32 i;

  ASSERT (is_pow2 (vec_len (rd->rxqs))
	  && "rxq number should be a power of 2");

  ind_tbl = vec_new (struct ibv_wq *, vec_len (rd->rxqs));
  vec_foreach_index (i, rd->rxqs)
    ind_tbl[i] = vec_elt_at_index (rd->rxqs, i)->wq;

  memset (&rwqia, 0, sizeof (rwqia));
  rwqia.log_ind_tbl_size = min_log2 (vec_len (ind_tbl));
  rwqia.ind_tbl = ind_tbl;
  if ((rd->rx_rwq_ind_tbl = ibv_create_rwq_ind_table (rd->ctx, &rwqia)) == 0)
    return clib_error_return_unix (0, "ibv_create_rwq_ind_table() failed");
  vec_free (ind_tbl);

  memset (&qpia, 0, sizeof (qpia));
  qpia.qp_type = IBV_QPT_RAW_PACKET;
  qpia.comp_mask =
    IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_IND_TABLE |
    IBV_QP_INIT_ATTR_RX_HASH;
  qpia.pd = rd->pd;
  qpia.rwq_ind_tbl = rd->rx_rwq_ind_tbl;
  STATIC_ASSERT_SIZEOF (rdma_rss_hash_key, 40);
  qpia.rx_hash_conf.rx_hash_key_len = sizeof (rdma_rss_hash_key);
  qpia.rx_hash_conf.rx_hash_key = rdma_rss_hash_key;
  qpia.rx_hash_conf.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ;
  qpia.rx_hash_conf.rx_hash_fields_mask =
    IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4;
  if ((rd->rx_qp = ibv_create_qp_ex (rd->ctx, &qpia)) == 0)
    return clib_error_return_unix (0, "ibv_create_qp_ex() failed");

  /* unicast (our MAC) and multicast steering rules are installed with
     IBV_FLOW_ATTR_FLAGS_DONT_TRAP, so matching packets remain visible to
     lower-priority steering rules (flow setup elided) */
  return 0;
}
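/* The RX side is a single RSS hash QP: the NIC runs the Toeplitz hash over
 * the IPv4 source/destination addresses and uses the low log2(n) bits of
 * the result to index the RWQ indirection table, which is why the number
 * of rx queues must be a power of two (see the ASSERT above). */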
static clib_error_t *
rdma_txq_init (vlib_main_t * vm, rdma_device_t * rd, u16 qid, u32 n_desc)
{
  rdma_txq_t *txq;
  struct ibv_qp_init_attr qpia;
  struct ibv_qp_attr qpa;
  int qp_flags;

  vec_validate_aligned (rd->txqs, qid, CLIB_CACHE_LINE_BYTES);
  txq = vec_elt_at_index (rd->txqs, qid);

  if ((txq->cq = ibv_create_cq (rd->ctx, n_desc, NULL, NULL, 0)) == 0)
    return clib_error_return_unix (0, "ibv_create_cq() failed");

  memset (&qpia, 0, sizeof (qpia));
  qpia.send_cq = txq->cq;
  qpia.recv_cq = txq->cq;
  qpia.cap.max_send_wr = n_desc;
  qpia.cap.max_send_sge = 1;
  qpia.qp_type = IBV_QPT_RAW_PACKET;

  if ((txq->qp = ibv_create_qp (rd->pd, &qpia)) == 0)
    return clib_error_return_unix (0, "ibv_create_qp() failed");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
  qpa.qp_state = IBV_QPS_INIT;
  qpa.port_num = 1;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "ibv_modify_qp() failed (INIT)");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE;
  qpa.qp_state = IBV_QPS_RTR;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "ibv_modify_qp() failed (RTR)");

  memset (&qpa, 0, sizeof (qpa));
  qp_flags = IBV_QP_STATE;
  qpa.qp_state = IBV_QPS_RTS;
  if (ibv_modify_qp (txq->qp, &qpa, qp_flags) != 0)
    return clib_error_return_unix (0, "ibv_modify_qp() failed (RTS)");

  return 0;
}
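/* Raw packet QPs still follow the verbs QP state machine: RESET -> INIT
 * (port assignment) -> RTR -> RTS before sends are accepted. Unlike
 * connected QP types no addressing attributes are needed, so each
 * transition only carries IBV_QP_STATE (plus IBV_QP_PORT for INIT). */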
static clib_error_t *
rdma_dev_init (vlib_main_t * vm, rdma_device_t * rd, u32 rxq_size,
	       u32 txq_size, u32 rxq_num)
{
  clib_error_t *err;
  vlib_buffer_main_t *bm = vm->buffer_main;
  u32 i;

  if ((rd->pd = ibv_alloc_pd (rd->ctx)) == 0)
    return clib_error_return_unix (0, "ibv_alloc_pd() failed");

  for (i = 0; i < rxq_num; i++)
    if ((err = rdma_rxq_init (vm, rd, i, rxq_size)))
      return err;
  if ((err = rdma_rxq_finalize (vm, rd)))
    return err;
  /* ... per-thread tx queue setup elided ... */

  if ((rd->mr = ibv_reg_mr (rd->pd, (void *) bm->buffer_mem_start,
			    bm->buffer_mem_size,
			    IBV_ACCESS_LOCAL_WRITE)) == 0)
    return clib_error_return_unix (0, "ibv_reg_mr() failed");

  return 0;
}
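/* Registering the whole VPP buffer memory as a single memory region means
 * buffer virtual addresses can be used directly in work requests, avoiding
 * per-packet registration or address translation on the datapath. */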
void
rdma_create_if (vlib_main_t * vm, rdma_create_if_args_t * args)
{
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = 0;
  struct ibv_device **dev_list = 0;
  int n_devs;
  u8 *s = 0, *s2 = 0;

  if (!is_pow2 (args->rxq_num))
    {
      args->rv = VNET_API_ERROR_INVALID_VALUE;
      args->error =
	clib_error_return (0, "rx queue number must be a power of two");
      return;
    }

  if (!is_pow2 (args->rxq_size) || !is_pow2 (args->txq_size))
    {
      args->rv = VNET_API_ERROR_INVALID_VALUE;
      args->error =
	clib_error_return (0, "queue size must be a power of two");
      return;
    }

  /* only mlx5 is supported: check which kernel driver owns the netdev */
  s = format (s, "/sys/class/net/%s/device/driver/module%c", args->ifname, 0);
  s2 = clib_sysfs_link_to_name ((char *) s);
  if (s2 == 0 || strncmp ((char *) s2, "mlx5_core", 9) != 0)
    {
      args->error =
	clib_error_return (0,
			   "invalid interface (only mlx5 supported for now)");
      goto err0;
    }

  /* resolve the netdev to its PCI address */
  vec_reset_length (s);
  s = format (s, "/sys/class/net/%s/device%c", args->ifname, 0);
  pool_get_zero (rm->devices, rd);
  if (sysfs_path_to_pci_addr ((char *) s, &rd->pci_addr) == 0)
    goto err1;

  dev_list = ibv_get_device_list (&n_devs);
  if (n_devs == 0)
    {
      args->error =
	clib_error_return_unix (0,
				"no RDMA devices available, errno = %d. "
				"Is the ib_uverbs module loaded?", errno);
      goto err1;
    }

  /* find the verbs device sitting on the same PCI address as the netdev */
  for (int i = 0; i < n_devs; i++)
    {
      vlib_pci_addr_t addr;

      vec_reset_length (s);
      s = format (s, "%s/device%c", dev_list[i]->dev_path, 0);

      if (sysfs_path_to_pci_addr ((char *) s, &addr) == 0)
	continue;

      if (addr.as_u32 != rd->pci_addr.as_u32)
	continue;

      if ((rd->ctx = ibv_open_device (dev_list[i])))
	break;
    }

  if (rd->ctx == 0)
    goto err2;

  /* ... device init and ethernet interface registration elided; on
     success the new hw_if_index is stored in args ... */
  return;

err2:
  ibv_free_device_list (dev_list);
err1:
  pool_put (rm->devices, rd);
  args->rv = VNET_API_ERROR_INVALID_INTERFACE;
err0:
  vec_free (s2);
  vec_free (s);
}
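/* Matching the kernel netdev to its verbs device goes through sysfs: both
 * /sys/class/net/<ifname>/device and the verbs device's dev_path/device
 * symlink resolve to the same PCI function, so comparing the two PCI
 * addresses pairs them reliably. */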
static clib_error_t *
rdma_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags)
{
  vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, hw_if_index);
  rdma_main_t *rm = &rdma_main;
  rdma_device_t *rd = pool_elt_at_index (rm->devices, hi->dev_instance);
  uword is_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0;

  if (rd->flags & RDMA_DEVICE_F_ERROR)
    return clib_error_return (0, "device is in error state");

  if (is_up)
    rd->flags |= RDMA_DEVICE_F_ADMIN_UP;
  else
    rd->flags &= ~RDMA_DEVICE_F_ADMIN_UP;

  return 0;
}
static void
rdma_set_interface_next_node (vnet_main_t * vnm, u32 hw_if_index,
			      u32 node_index)
{
  rdma_main_t *rm = &rdma_main;
  vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, hw_if_index);
  rdma_device_t *rd = pool_elt_at_index (rm->devices, hw->dev_instance);

  /* ~0 disables the per-interface redirect */
  if (node_index == ~0)
    {
      rd->per_interface_next_index = node_index;
      return;
    }

  /* ... next-node resolution via vlib_node_add_next() elided ... */
}
VNET_DEVICE_CLASS (rdma_device_class) =
{
  .name = "RDMA interface",
  /* ... remaining class members elided ... */
};