FD.io VPP  v16.09
Vector Packet Processing
node.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #include <vnet/vnet.h>
16 #include <vppinfra/vec.h>
17 #include <vppinfra/error.h>
18 #include <vppinfra/format.h>
19 #include <vppinfra/xxhash.h>
20 
21 #include <vnet/ethernet/ethernet.h>
22 #include <vnet/devices/dpdk/dpdk.h>
24 #include <vnet/mpls-gre/packet.h>
25 #include <vnet/handoff.h>
26 
27 #include "dpdk_priv.h"
28 
/*
 * Fallback MAX: only defined when no earlier header supplied one.
 * Expands to the larger of its two arguments; an argument may be
 * evaluated more than once, so avoid side effects in the operands.
 */
#ifndef MAX
#define MAX(x,y) ((y) > (x) ? (y) : (x))
#endif
32 
/*
 * Fallback MIN: only defined when no earlier header supplied one.
 * Expands to the smaller of its two arguments; an argument may be
 * evaluated more than once, so avoid side effects in the operands.
 */
#ifndef MIN
#define MIN(x,y) ((y) > (x) ? (x) : (y))
#endif
36 
37 /*
38  * At least in certain versions of ESXi, vmware e1000's don't honor the
39  * "strip rx CRC" bit. Set this flag to work around that bug FOR UNIT TEST ONLY.
40  *
41  * If wireshark complains like so:
42  *
43  * "Frame check sequence: 0x00000000 [incorrect, should be <hex-num>]"
44  * and you're using ESXi emulated e1000's, set this flag FOR UNIT TEST ONLY.
45  *
46  * Note: do NOT check in this file with this workaround enabled! You'll lose
47  * actual data from e.g. 10xGE interfaces. The extra 4 bytes annoy
48  * wireshark, but they're harmless...
49  */
50 #define VMWARE_LENGTH_BUG_WORKAROUND 0
51 
52 static char *dpdk_error_strings[] = {
53 #define _(n,s) s,
55 #undef _
56 };
57 
58 always_inline int
59 dpdk_mbuf_is_ip4 (struct rte_mbuf *mb)
60 {
61  return RTE_ETH_IS_IPV4_HDR (mb->packet_type) != 0;
62 }
63 
64 always_inline int
65 dpdk_mbuf_is_ip6 (struct rte_mbuf *mb)
66 {
67  return RTE_ETH_IS_IPV6_HDR (mb->packet_type) != 0;
68 }
69 
70 always_inline int
72 {
74  return (h->type == clib_host_to_net_u16 (ETHERNET_TYPE_MPLS_UNICAST));
75 }
76 
77 always_inline void
79  struct rte_mbuf *mb,
80  vlib_buffer_t * b0, u8 * next0,
81  u8 * error0)
82 {
83  u8 n0;
84  uint16_t mb_flags = mb->ol_flags;
85 
86  if (PREDICT_FALSE (mb_flags & (
87 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
88  PKT_EXT_RX_PKT_ERROR | PKT_EXT_RX_BAD_FCS |
89 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
90  PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)))
91  {
92  /* some error was flagged. determine the drop reason */
93  n0 = DPDK_RX_NEXT_DROP;
94  *error0 =
95 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
96  (mb_flags & PKT_EXT_RX_PKT_ERROR) ? DPDK_ERROR_RX_PACKET_ERROR :
97  (mb_flags & PKT_EXT_RX_BAD_FCS) ? DPDK_ERROR_RX_BAD_FCS :
98 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
99  (mb_flags & PKT_RX_IP_CKSUM_BAD) ? DPDK_ERROR_IP_CHECKSUM_ERROR :
100  (mb_flags & PKT_RX_L4_CKSUM_BAD) ? DPDK_ERROR_L4_CHECKSUM_ERROR :
101  DPDK_ERROR_NONE;
102  }
103  else
104  {
105  *error0 = DPDK_ERROR_NONE;
106  if (PREDICT_FALSE (xd->per_interface_next_index != ~0))
107  {
108  n0 = xd->per_interface_next_index;
110  if (PREDICT_TRUE (dpdk_mbuf_is_ip4 (mb)))
111  vnet_buffer (b0)->handoff.next_index =
113  else if (PREDICT_TRUE (dpdk_mbuf_is_ip6 (mb)))
114  vnet_buffer (b0)->handoff.next_index =
116  else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
117  vnet_buffer (b0)->handoff.next_index =
119  else
120  vnet_buffer (b0)->handoff.next_index =
122  }
123  else
124  if (PREDICT_FALSE (xd->vlan_subifs || (mb_flags & PKT_RX_VLAN_PKT)))
126  else
127  {
128  if (PREDICT_TRUE (dpdk_mbuf_is_ip4 (mb)))
130  else if (PREDICT_TRUE (dpdk_mbuf_is_ip6 (mb)))
132  else if (PREDICT_TRUE (vlib_buffer_is_mpls (b0)))
134  else
136  }
137  }
138  *next0 = n0;
139 }
140 
141 void
143  vlib_node_runtime_t * node,
144  dpdk_device_t * xd,
145  u16 queue_id, u32 * buffers, uword n_buffers)
146 {
147  vlib_main_t *vm = vlib_get_main ();
148  u32 *b, n_left;
149  u8 next0;
150 
151  n_left = n_buffers;
152  b = buffers;
153 
154  while (n_left >= 1)
155  {
156  u32 bi0;
157  vlib_buffer_t *b0;
159  struct rte_mbuf *mb;
160  u8 error0;
161 
162  bi0 = b[0];
163  n_left -= 1;
164 
165  b0 = vlib_get_buffer (vm, bi0);
166  mb = rte_mbuf_from_vlib_buffer (b0);
167  dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0, &next0, &error0);
168  vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
169  t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
170  t0->queue_index = queue_id;
171  t0->device_index = xd->device_index;
172  t0->buffer_index = bi0;
173 
174  clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
175  clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
176  clib_memcpy (t0->buffer.pre_data, b0->data,
177  sizeof (t0->buffer.pre_data));
178  clib_memcpy (&t0->data, mb->buf_addr + mb->data_off, sizeof (t0->data));
179 
180 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
181  /*
182  * Clear overloaded TX offload flags when a DPDK driver
183  * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
184  */
185  mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
186 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
187 
188  b += 1;
189  }
190 }
191 
192 /*
193  * dpdk_efd_update_counters()
194  * Update EFD (early-fast-discard) counters
195  */
196 void
197 dpdk_efd_update_counters (dpdk_device_t * xd, u32 n_buffers, u16 enabled)
198 {
199  if (enabled & DPDK_EFD_MONITOR_ENABLED)
200  {
201  u64 now = clib_cpu_time_now ();
202  if (xd->efd_agent.last_poll_time > 0)
203  {
204  u64 elapsed_time = (now - xd->efd_agent.last_poll_time);
205  if (elapsed_time > xd->efd_agent.max_poll_delay)
206  xd->efd_agent.max_poll_delay = elapsed_time;
207  }
208  xd->efd_agent.last_poll_time = now;
209  }
210 
211  xd->efd_agent.total_packet_cnt += n_buffers;
212  xd->efd_agent.last_burst_sz = n_buffers;
213 
214  if (n_buffers > xd->efd_agent.max_burst_sz)
215  xd->efd_agent.max_burst_sz = n_buffers;
216 
217  if (PREDICT_FALSE (n_buffers == VLIB_FRAME_SIZE))
218  {
221  }
222  else
223  {
225  }
226 }
227 
228 /* is_efd_discardable()
229  * returns non zero DPDK error if packet meets early-fast-discard criteria,
230  * zero otherwise
231  */
232 u32
234  vlib_buffer_t * b0, struct rte_mbuf *mb)
235 {
237 
238  if (eh->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP4))
239  {
240  ip4_header_t *ipv4 =
241  (ip4_header_t *) & (b0->data[sizeof (ethernet_header_t)]);
242  u8 pkt_prec = (ipv4->tos >> 5);
243 
244  return (tm->efd.ip_prec_bitmap & (1 << pkt_prec) ?
245  DPDK_ERROR_IPV4_EFD_DROP_PKTS : DPDK_ERROR_NONE);
246  }
247  else if (eh->type == clib_net_to_host_u16 (ETHERNET_TYPE_IP6))
248  {
249  ip6_header_t *ipv6 =
250  (ip6_header_t *) & (b0->data[sizeof (ethernet_header_t)]);
251  u8 pkt_tclass =
252  ((ipv6->ip_version_traffic_class_and_flow_label >> 20) & 0xff);
253 
254  return (tm->efd.ip_prec_bitmap & (1 << pkt_tclass) ?
255  DPDK_ERROR_IPV6_EFD_DROP_PKTS : DPDK_ERROR_NONE);
256  }
257  else if (eh->type == clib_net_to_host_u16 (ETHERNET_TYPE_MPLS_UNICAST))
258  {
259  mpls_unicast_header_t *mpls =
260  (mpls_unicast_header_t *) & (b0->data[sizeof (ethernet_header_t)]);
261  u8 pkt_exp = ((mpls->label_exp_s_ttl >> 9) & 0x07);
262 
263  return (tm->efd.mpls_exp_bitmap & (1 << pkt_exp) ?
264  DPDK_ERROR_MPLS_EFD_DROP_PKTS : DPDK_ERROR_NONE);
265  }
266  else if ((eh->type == clib_net_to_host_u16 (ETHERNET_TYPE_VLAN)) ||
267  (eh->type == clib_net_to_host_u16 (ETHERNET_TYPE_DOT1AD)))
268  {
269  ethernet_vlan_header_t *vlan =
270  (ethernet_vlan_header_t *) & (b0->data[sizeof (ethernet_header_t)]);
271  u8 pkt_cos = ((vlan->priority_cfi_and_id >> 13) & 0x07);
272 
273  return (tm->efd.vlan_cos_bitmap & (1 << pkt_cos) ?
274  DPDK_ERROR_VLAN_EFD_DROP_PKTS : DPDK_ERROR_NONE);
275  }
276 
277  return DPDK_ERROR_NONE;
278 }
279 
280 static inline u32
282 {
283  u32 n_buffers;
284  u32 n_left;
285  u32 n_this_chunk;
286 
287  n_left = VLIB_FRAME_SIZE;
288  n_buffers = 0;
289 
291  {
292  while (n_left)
293  {
294  n_this_chunk = rte_eth_rx_burst (xd->device_index, queue_id,
295  xd->rx_vectors[queue_id] +
296  n_buffers, n_left);
297  n_buffers += n_this_chunk;
298  n_left -= n_this_chunk;
299 
300  /* Empirically, DPDK r1.8 produces vectors w/ 32 or fewer elts */
301  if (n_this_chunk < 32)
302  break;
303  }
304  }
305 #if DPDK_VHOST_USER
306  else if (xd->dev_type == VNET_DPDK_DEV_VHOST_USER)
307  {
308  vlib_main_t *vm = vlib_get_main ();
310  unsigned socket_id = rte_socket_id ();
311  u32 offset = 0;
312 
313  offset = queue_id * VIRTIO_QNUM;
314 
315  struct vhost_virtqueue *vq =
316  xd->vu_vhost_dev.virtqueue[offset + VIRTIO_TXQ];
317 
318  if (PREDICT_FALSE (!vq->enabled))
319  return 0;
320 
321  struct rte_mbuf **pkts = xd->rx_vectors[queue_id];
322  while (n_left)
323  {
324  n_this_chunk = rte_vhost_dequeue_burst (&xd->vu_vhost_dev,
325  offset + VIRTIO_TXQ,
326  bm->pktmbuf_pools
327  [socket_id],
328  pkts + n_buffers, n_left);
329  n_buffers += n_this_chunk;
330  n_left -= n_this_chunk;
331  if (n_this_chunk == 0)
332  break;
333  }
334 
335  int i;
336  u32 bytes = 0;
337  for (i = 0; i < n_buffers; i++)
338  {
339  struct rte_mbuf *buff = pkts[i];
340  bytes += rte_pktmbuf_data_len (buff);
341  }
342 
343  f64 now = vlib_time_now (vm);
344 
345  dpdk_vu_vring *vring = NULL;
346  /* send pending interrupts if needed */
347  if (dpdk_vhost_user_want_interrupt (xd, offset + VIRTIO_TXQ))
348  {
349  vring = &(xd->vu_intf->vrings[offset + VIRTIO_TXQ]);
350  vring->n_since_last_int += n_buffers;
351 
352  if ((vring->n_since_last_int && (vring->int_deadline < now))
353  || (vring->n_since_last_int > dm->conf->vhost_coalesce_frames))
354  dpdk_vhost_user_send_interrupt (vm, xd, offset + VIRTIO_TXQ);
355  }
356 
357  vring = &(xd->vu_intf->vrings[offset + VIRTIO_RXQ]);
358  vring->packets += n_buffers;
359  vring->bytes += bytes;
360 
361  if (dpdk_vhost_user_want_interrupt (xd, offset + VIRTIO_RXQ))
362  {
363  if (vring->n_since_last_int && (vring->int_deadline < now))
364  dpdk_vhost_user_send_interrupt (vm, xd, offset + VIRTIO_RXQ);
365  }
366 
367  }
368 #endif
369 #ifdef RTE_LIBRTE_KNI
370  else if (xd->dev_type == VNET_DPDK_DEV_KNI)
371  {
372  n_buffers =
373  rte_kni_rx_burst (xd->kni, xd->rx_vectors[queue_id], VLIB_FRAME_SIZE);
374  rte_kni_handle_request (xd->kni);
375  }
376 #endif
377  else
378  {
379  ASSERT (0);
380  }
381 
382  return n_buffers;
383 }
384 
385 /*
386  * This function is used when there are no worker threads.
387  * The main thread performs IO and forwards the packets.
388  */
389 static inline u32
391  dpdk_device_t * xd,
392  vlib_node_runtime_t * node,
393  u32 cpu_index, u16 queue_id, int use_efd)
394 {
395  u32 n_buffers;
396  u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
397  u32 n_left_to_next, *to_next;
398  u32 mb_index;
399  vlib_main_t *vm = vlib_get_main ();
400  uword n_rx_bytes = 0;
401  u32 n_trace, trace_cnt __attribute__ ((unused));
403  u8 efd_discard_burst = 0;
404  u32 buffer_flags_template;
405 
406  if (xd->admin_up == 0)
407  return 0;
408 
409  n_buffers = dpdk_rx_burst (dm, xd, queue_id);
410 
411  if (n_buffers == 0)
412  {
413  /* check if EFD (dpdk) is enabled */
414  if (PREDICT_FALSE (use_efd && dm->efd.enabled))
415  {
416  /* reset a few stats */
417  xd->efd_agent.last_poll_time = 0;
418  xd->efd_agent.last_burst_sz = 0;
419  }
420  return 0;
421  }
422 
423  buffer_flags_template = dm->buffer_flags_template;
424 
426  trace_cnt = n_trace = vlib_get_trace_count (vm, node);
427 
429 
430  /*
431  * DAW-FIXME: VMXNET3 device stop/start doesn't work,
432  * therefore fake the stop in the dpdk driver by
433  * silently dropping all of the incoming pkts instead of
434  * stopping the driver / hardware.
435  */
436  if (PREDICT_FALSE (xd->admin_up != 1))
437  {
438  for (mb_index = 0; mb_index < n_buffers; mb_index++)
439  rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
440 
441  return 0;
442  }
443 
444  /* Check for congestion if EFD (Early-Fast-Discard) is enabled
445  * in any mode (e.g. dpdk, monitor, or drop_all)
446  */
447  if (PREDICT_FALSE (use_efd && dm->efd.enabled))
448  {
449  /* update EFD counters */
450  dpdk_efd_update_counters (xd, n_buffers, dm->efd.enabled);
451 
453  {
454  /* discard all received packets */
455  for (mb_index = 0; mb_index < n_buffers; mb_index++)
456  rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
457 
458  xd->efd_agent.discard_cnt += n_buffers;
460  DPDK_ERROR_VLAN_EFD_DROP_PKTS,
461  n_buffers);
462 
463  return 0;
464  }
465 
468  {
469  u32 device_queue_sz = rte_eth_rx_queue_count (xd->device_index,
470  queue_id);
471  if (device_queue_sz >= dm->efd.queue_hi_thresh)
472  {
473  /* dpdk device queue has reached the critical threshold */
475 
476  /* apply EFD to packets from the burst */
477  efd_discard_burst = 1;
478  }
479  }
480  }
481 
482  mb_index = 0;
483 
484  while (n_buffers > 0)
485  {
486  u32 bi0;
487  u8 next0, error0;
488  u32 l3_offset0;
489  vlib_buffer_t *b0, *b_seg, *b_chain = 0;
490  u32 cntr_type;
491 
492  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
493 
494  while (n_buffers > 0 && n_left_to_next > 0)
495  {
496  u8 nb_seg = 1;
497  struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
498  struct rte_mbuf *mb_seg = mb->next;
499 
500  if (PREDICT_TRUE (n_buffers > 2))
501  {
502  struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index + 2];
504  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, STORE);
506  }
507 
508  ASSERT (mb);
509 
510  b0 = vlib_buffer_from_rte_mbuf (mb);
511 
512  /* check whether EFD is looking for packets to discard */
513  if (PREDICT_FALSE (efd_discard_burst))
514  {
516 
517  if (PREDICT_TRUE (cntr_type = is_efd_discardable (tm, b0, mb)))
518  {
519  rte_pktmbuf_free (mb);
520  xd->efd_agent.discard_cnt++;
521  increment_efd_drop_counter (vm, cntr_type, 1);
522  n_buffers--;
523  mb_index++;
524  continue;
525  }
526  }
527 
528  /* Prefetch one next segment if it exists. */
529  if (PREDICT_FALSE (mb->nb_segs > 1))
530  {
531  struct rte_mbuf *pfmb = mb->next;
533  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
535  b_chain = b0;
536  }
537 
539 
540  bi0 = vlib_get_buffer_index (vm, b0);
541 
542  to_next[0] = bi0;
543  to_next++;
544  n_left_to_next--;
545 
547  &next0, &error0);
548 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
549  /*
550  * Clear overloaded TX offload flags when a DPDK driver
551  * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
552  */
553 
554  if (PREDICT_TRUE (trace_cnt == 0))
555  mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
556  else
557  trace_cnt--;
558 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
559 
560  b0->error = node->errors[error0];
561 
562  l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
563  next0 == DPDK_RX_NEXT_IP6_INPUT ||
564  next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
565  sizeof (ethernet_header_t) : 0);
566 
567  b0->current_data = l3_offset0;
568  /* Some drivers like fm10k receive frames with
569  mb->data_off > RTE_PKTMBUF_HEADROOM */
570  b0->current_data += mb->data_off - RTE_PKTMBUF_HEADROOM;
571  b0->current_length = mb->data_len - l3_offset0;
572 
573  b0->flags = buffer_flags_template;
574 
576  b0->current_length -= 4;
577 
578  vnet_buffer (b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
579  vnet_buffer (b0)->sw_if_index[VLIB_TX] = (u32) ~ 0;
580  n_rx_bytes += mb->pkt_len;
581 
582  /* Process subsequent segments of multi-segment packets */
583  while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
584  {
585  ASSERT (mb_seg != 0);
586 
587  b_seg = vlib_buffer_from_rte_mbuf (mb_seg);
588  vlib_buffer_init_for_free_list (b_seg, fl);
589 
590  ASSERT ((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
591  ASSERT (b_seg->current_data == 0);
592 
593  /*
594  * The driver (e.g. virtio) may not put the packet data at the start
595  * of the segment, so don't assume b_seg->current_data == 0 is correct.
596  */
597  b_seg->current_data =
598  (mb_seg->buf_addr + mb_seg->data_off) - (void *) b_seg->data;
599 
600  b_seg->current_length = mb_seg->data_len;
601  b0->total_length_not_including_first_buffer += mb_seg->data_len;
602 
603  b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
604  b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
605 
606  b_chain = b_seg;
607  mb_seg = mb_seg->next;
608  nb_seg++;
609  }
610 
611  /*
612  * Turn this on if you run into
613  * "bad monkey" contexts, and you want to know exactly
614  * which nodes they've visited... See main.c...
615  */
617 
618  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
619  to_next, n_left_to_next,
620  bi0, next0);
621  if (PREDICT_FALSE (n_trace > mb_index))
622  vec_add1 (xd->d_trace_buffers, bi0);
623  n_buffers--;
624  mb_index++;
625  }
626  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
627  }
628 
629  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
630  {
631  dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers,
632  vec_len (xd->d_trace_buffers));
633  vlib_set_trace_count (vm, node,
634  n_trace - vec_len (xd->d_trace_buffers));
635  }
636 
638  (vnet_get_main ()->interface_main.combined_sw_if_counters
640  cpu_index, xd->vlib_sw_if_index, mb_index, n_rx_bytes);
641 
642  dpdk_worker_t *dw = vec_elt_at_index (dm->workers, cpu_index);
643  dw->aggregate_rx_packets += mb_index;
644 
645  return mb_index;
646 }
647 
648 static inline void
650 {
651  /* Limit the poll rate by sleeping for N msec between polls */
652  if (PREDICT_FALSE (dm->poll_sleep != 0))
653  {
654  struct timespec ts, tsrem;
655 
656  ts.tv_sec = 0;
657  ts.tv_nsec = 1000 * 1000 * dm->poll_sleep; /* 1ms */
658 
659  while (nanosleep (&ts, &tsrem) < 0)
660  {
661  ts = tsrem;
662  }
663  }
664 }
665 
666 /** \brief Main DPDK input node
667  @node dpdk-input
668 
669  This is the main DPDK input node: across each assigned interface,
670  call rte_eth_rx_burst(...) or similar to obtain a vector of
671  packets to process. Handle early packet discard. Derive @c
672  vlib_buffer_t metadata from <code>struct rte_mbuf</code> metadata.
673  Depending on the resulting metadata: adjust <code>b->current_data,
674  b->current_length </code> and dispatch directly to
675  ip4-input-no-checksum, ip6-input, or mpls-gre-input. Trace the packet if required.
676 
677  @param vm vlib_main_t corresponding to the current thread
678  @param node vlib_node_runtime_t
679  @param f vlib_frame_t input-node, not used.
680 
681  @par Graph mechanics: buffer metadata, next index usage
682 
683  @em Uses:
684  - <code>struct rte_mbuf mb->ol_flags</code>
685  - PKT_EXT_RX_PKT_ERROR, PKT_EXT_RX_BAD_FCS
686  PKT_RX_IP_CKSUM_BAD, PKT_RX_L4_CKSUM_BAD
687  - <code> RTE_ETH_IS_xxx_HDR(mb->packet_type) </code>
688  - packet classification result
689 
690  @em Sets:
691  - <code>b->error</code> if the packet is to be dropped immediately
692  - <code>b->current_data, b->current_length</code>
693  - adjusted as needed to skip the L2 header in direct-dispatch cases
694  - <code>vnet_buffer(b)->sw_if_index[VLIB_RX]</code>
695  - rx interface sw_if_index
696  - <code>vnet_buffer(b)->sw_if_index[VLIB_TX] = ~0</code>
697  - required by ipX-lookup
698  - <code>b->flags</code>
699  - to indicate multi-segment pkts (VLIB_BUFFER_NEXT_PRESENT), etc.
700 
701  <em>Next Nodes:</em>
702  - Static arcs to: error-drop, ethernet-input,
703  ip4-input-no-checksum, ip6-input, mpls-gre-input
704  - per-interface redirection, controlled by
705  <code>xd->per_interface_next_index</code>
706 */
707 
708 static uword
710 {
711  dpdk_main_t *dm = &dpdk_main;
712  dpdk_device_t *xd;
713  uword n_rx_packets = 0;
715  u32 cpu_index = os_get_cpu_number ();
716 
717  /*
718  * Poll all devices on this cpu for input/interrupts.
719  */
720  /* *INDENT-OFF* */
721  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
722  {
723  xd = vec_elt_at_index(dm->devices, dq->device);
724  ASSERT(dq->queue_id == 0);
725  n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, 0, 0);
726  }
727  /* *INDENT-ON* */
728 
729  poll_rate_limit (dm);
730 
731  return n_rx_packets;
732 }
733 
734 uword
736  vlib_node_runtime_t * node, vlib_frame_t * f)
737 {
738  dpdk_main_t *dm = &dpdk_main;
739  dpdk_device_t *xd;
740  uword n_rx_packets = 0;
742  u32 cpu_index = os_get_cpu_number ();
743 
744  /*
745  * Poll all devices on this cpu for input/interrupts.
746  */
747  /* *INDENT-OFF* */
748  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
749  {
750  xd = vec_elt_at_index(dm->devices, dq->device);
751  n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, 0);
752  }
753  /* *INDENT-ON* */
754 
755  poll_rate_limit (dm);
756 
757  return n_rx_packets;
758 }
759 
760 uword
762  vlib_node_runtime_t * node, vlib_frame_t * f)
763 {
764  dpdk_main_t *dm = &dpdk_main;
765  dpdk_device_t *xd;
766  uword n_rx_packets = 0;
768  u32 cpu_index = os_get_cpu_number ();
769 
770  /*
771  * Poll all devices on this cpu for input/interrupts.
772  */
773  /* *INDENT-OFF* */
774  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
775  {
776  xd = vec_elt_at_index(dm->devices, dq->device);
777  n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id, 1);
778  }
779  /* *INDENT-ON* */
780 
781  poll_rate_limit (dm);
782 
783  return n_rx_packets;
784 }
785 
786 /* *INDENT-OFF* */
788  .function = dpdk_input,
789  .type = VLIB_NODE_TYPE_INPUT,
790  .name = "dpdk-input",
791 
792  /* Will be enabled if/when hardware is detected. */
793  .state = VLIB_NODE_STATE_DISABLED,
794 
795  .format_buffer = format_ethernet_header_with_length,
796  .format_trace = format_dpdk_rx_dma_trace,
797 
798  .n_errors = DPDK_N_ERROR,
799  .error_strings = dpdk_error_strings,
800 
801  .n_next_nodes = DPDK_RX_N_NEXT,
802  .next_nodes = {
803  [DPDK_RX_NEXT_DROP] = "error-drop",
804  [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
805  [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
806  [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
807  [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
808  },
809 };
810 
811 
812 /* handle dpdk_input_rss alternative function */
816 
817 /* this macro defines dpdk_input_rss_multiarch_select() */
821 /* *INDENT-ON* */
822 
823 /*
824  * Override the next nodes for the dpdk input nodes.
825  * Must be invoked prior to VLIB_INIT_FUNCTION calls.
826  */
827 void
829 {
832 
833  switch (next)
834  {
839  r->next_nodes[next] = name;
840  r_handoff->next_nodes[next] = name;
841  break;
842 
843  default:
844  clib_warning ("%s: illegal next %d\n", __FUNCTION__, next);
845  break;
846  }
847 }
848 
849 /*
850  * set_efd_bitmap()
851  * Based on the operation type, set lower/upper bits for the given index value
852  */
853 void
854 set_efd_bitmap (u8 * bitmap, u32 value, u32 op)
855 {
856  int ix;
857 
858  *bitmap = 0;
859  for (ix = 0; ix < 8; ix++)
860  {
861  if (((op == EFD_OPERATION_LESS_THAN) && (ix < value)) ||
862  ((op == EFD_OPERATION_GREATER_OR_EQUAL) && (ix >= value)))
863  {
864  (*bitmap) |= (1 << ix);
865  }
866  }
867 }
868 
869 void
870 efd_config (u32 enabled,
871  u32 ip_prec, u32 ip_op,
872  u32 mpls_exp, u32 mpls_op, u32 vlan_cos, u32 vlan_op)
873 {
875  dpdk_main_t *dm = &dpdk_main;
876 
877  if (enabled)
878  {
881  }
882  else
883  {
886  }
887 
888  set_efd_bitmap (&tm->efd.ip_prec_bitmap, ip_prec, ip_op);
889  set_efd_bitmap (&tm->efd.mpls_exp_bitmap, mpls_exp, mpls_op);
890  set_efd_bitmap (&tm->efd.vlan_cos_bitmap, vlan_cos, vlan_op);
891 }
892 
893 /*
894  * fd.io coding-style-patch-verification: ON
895  *
896  * Local Variables:
897  * eval: (c-set-style "gnu")
898  * End:
899  */
void vlib_put_next_frame(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, u32 n_vectors_left)
Release pointer to next frame vector data.
Definition: main.c:457
void dpdk_rx_trace(dpdk_main_t *dm, vlib_node_runtime_t *node, dpdk_device_t *xd, u16 queue_id, u32 *buffers, uword n_buffers)
Definition: node.c:142
u16 enabled
Definition: dpdk.h:306
sll srl srl sll sra u16x4 i
Definition: vector_sse2.h:343
uword dpdk_input_rss(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *f)
Definition: node.c:735
#define rte_mbuf_from_vlib_buffer(x)
Definition: buffer.h:382
static int vlib_buffer_is_mpls(vlib_buffer_t *b)
Definition: node.c:71
static u32 vlib_get_trace_count(vlib_main_t *vm, vlib_node_runtime_t *rt)
Definition: trace_funcs.h:143
u16 max_burst_sz
Definition: dpdk.h:135
void dpdk_efd_update_counters(dpdk_device_t *xd, u32 n_buffers, u16 enabled)
Definition: node.c:197
dpdk_main_t dpdk_main
Definition: dpdk.h:443
static vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:23
static u32 dpdk_rx_burst(dpdk_main_t *dm, dpdk_device_t *xd, u16 queue_id)
Definition: node.c:281
#define VLIB_EFD_DISCARD_ENABLED
Definition: threads.h:244
#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b)
Definition: buffer.h:399
#define PREDICT_TRUE(x)
Definition: clib.h:98
u32 vhost_coalesce_frames
Definition: dpdk.h:362
void dpdk_set_next_node(dpdk_rx_next_t next, char *name)
Definition: node.c:828
u64 last_poll_time
Definition: dpdk.h:139
#define foreach_dpdk_error
Definition: dpdk.h:498
#define NULL
Definition: clib.h:55
static f64 vlib_time_now(vlib_main_t *vm)
Definition: main.h:182
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:482
unsigned rte_socket_id()
static u64 clib_cpu_time_now(void)
Definition: time.h:73
struct _vlib_node_registration vlib_node_registration_t
#define BUFFER_HANDOFF_NEXT_VALID
Definition: buffer.h:68
vlib_buffer_main_t * buffer_main
Definition: main.h:104
u32 per_interface_next_index
Definition: dpdk.h:205
static int dpdk_mbuf_is_ip6(struct rte_mbuf *mb)
Definition: node.c:65
u32 congestion_cnt
Definition: dpdk.h:138
static void poll_rate_limit(dpdk_main_t *dm)
Definition: node.c:649
static char * dpdk_error_strings[]
Definition: node.c:52
vlib_error_t * errors
Definition: node.h:418
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
u8 admin_up
Definition: dpdk.h:221
static u32 dpdk_device_input(dpdk_main_t *dm, dpdk_device_t *xd, vlib_node_runtime_t *node, u32 cpu_index, u16 queue_id, int use_efd)
Definition: node.c:390
u8 mpls_exp_bitmap
Definition: threads.h:255
static void vlib_trace_buffer(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, vlib_buffer_t *b, int follow_chain)
Definition: trace_funcs.h:104
static void vlib_buffer_init_for_free_list(vlib_buffer_t *_dst, vlib_buffer_free_list_t *fl)
Definition: buffer_funcs.h:606
vnet_main_t * vnet_get_main(void)
Definition: misc.c:45
i16 current_data
signed offset in data[], pre_data[] that we are currently processing.
Definition: buffer.h:78
u8 data[256]
Definition: dpdk.h:472
static int dpdk_mbuf_is_ip4(struct rte_mbuf *mb)
Definition: node.c:59
#define always_inline
Definition: clib.h:84
void efd_config(u32 enabled, u32 ip_prec, u32 ip_op, u32 mpls_exp, u32 mpls_op, u32 vlan_cos, u32 vlan_op)
Definition: node.c:870
#define vec_elt_at_index(v, i)
Get vector value at index i checking that i is in bounds.
u8 pre_data[VLIB_BUFFER_PRE_DATA_SIZE]
Space for inserting data before buffer start.
Definition: buffer.h:143
#define clib_warning(format, args...)
Definition: error.h:59
unsigned long u64
Definition: types.h:89
vlib_node_registration_t dpdk_input_node
(constructor) VLIB_REGISTER_NODE (dpdk_input_node)
Definition: node.c:787
#define VMWARE_LENGTH_BUG_WORKAROUND
Definition: node.c:50
u32 device_index
Definition: dpdk.h:199
dpdk_worker_t * workers
Definition: dpdk.h:394
static u32 vlib_get_buffer_index(vlib_main_t *vm, void *p)
Translate buffer pointer into buffer index.
Definition: buffer_funcs.h:82
#define VLIB_BUFFER_NEXT_PRESENT
Definition: buffer.h:95
vlib_node_registration_t handoff_dispatch_node
(constructor) VLIB_REGISTER_NODE (handoff_dispatch_node)
Definition: handoff.c:504
struct rte_mbuf mb
Definition: dpdk.h:470
u16 consec_full_frames_hi_thresh
Definition: dpdk.h:308
u32 vlib_sw_if_index
Definition: dpdk.h:202
format_function_t format_dpdk_rx_dma_trace
Definition: dpdk.h:623
u16 current_length
Nbytes between current data and the end of this buffer.
Definition: buffer.h:82
dpdk_device_and_queue_t ** devices_by_cpu
Definition: dpdk.h:379
u64 max_poll_delay
Definition: dpdk.h:140
static_always_inline void increment_efd_drop_counter(vlib_main_t *vm, u32 counter_index, u32 count)
Definition: dpdk.h:524
u32 consec_full_frames_cnt
Definition: dpdk.h:137
#define DPDK_EFD_DISCARD_ENABLED
Definition: dpdk.h:297
uword os_get_cpu_number(void)
Definition: unix-misc.c:224
u16 last_burst_sz
Definition: dpdk.h:134
unsigned short int uint16_t
Definition: fix_types.h:28
u16 vlan_subifs
Definition: dpdk.h:215
u32 * d_trace_buffers
Definition: dpdk.h:212
#define DPDK_EFD_MONITOR_ENABLED
Definition: dpdk.h:298
#define PREDICT_FALSE(x)
Definition: clib.h:97
u32 full_frames_cnt
Definition: dpdk.h:136
#define VLIB_FRAME_SIZE
Definition: node.h:328
#define vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, bi0, next0)
Finish enqueueing one buffer forward in the graph.
Definition: buffer_node.h:130
#define vlib_get_next_frame(vm, node, next_index, vectors, n_vectors_left)
Get pointer to next frame vector data by (vlib_node_runtime_t, next_index).
Definition: node_funcs.h:348
vlib_error_t error
Error code for buffers to be enqueued to error handler.
Definition: buffer.h:118
u64 aggregate_rx_packets
Definition: dpdk.h:286
u8 * format_ethernet_header_with_length(u8 *s, va_list *args)
Definition: format.c:91
struct rte_mempool ** pktmbuf_pools
Definition: buffer.h:321
dpdk_device_t * devices
Definition: dpdk.h:378
#define CLIB_PREFETCH(addr, size, type)
Definition: cache.h:82
u8 ip_prec_bitmap
Definition: threads.h:254
struct rte_kni * kni
Definition: dpdk.h:238
static vlib_thread_main_t * vlib_get_thread_main()
Definition: global_funcs.h:32
struct rte_mbuf *** rx_vectors
Definition: dpdk.h:209
u16 queue_hi_thresh
Definition: dpdk.h:307
u8 vlan_cos_bitmap
Definition: threads.h:256
#define clib_memcpy(a, b, c)
Definition: string.h:63
#define VLIB_NODE_FUNCTION_MULTIARCH_CLONE(fn)
Definition: node.h:157
#define EFD_OPERATION_GREATER_OR_EQUAL
Definition: dpdk.h:568
#define VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX
Definition: buffer.h:303
u32 discard_cnt
Definition: dpdk.h:141
static void vlib_increment_combined_counter(vlib_combined_counter_main_t *cm, u32 cpu_index, u32 index, u32 packet_increment, u32 byte_increment)
Increment a combined counter.
Definition: counter.h:241
#define ASSERT(truth)
unsigned int u32
Definition: types.h:88
#define vnet_buffer(b)
Definition: buffer.h:335
u32 poll_sleep
Definition: dpdk.h:435
dpdk_efd_t efd
Definition: dpdk.h:416
static void dpdk_rx_next_and_error_from_mb_flags_x1(dpdk_device_t *xd, struct rte_mbuf *mb, vlib_buffer_t *b0, u8 *next0, u8 *error0)
Definition: node.c:78
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:114
u64 uword
Definition: types.h:112
static void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:109
template key/value backing page structure
Definition: bihash_doc.h:44
u32 ip_version_traffic_class_and_flow_label
Definition: ip6_packet.h:285
dpdk_rx_next_t
Definition: dpdk.h:445
Definition: defs.h:47
unsigned short u16
Definition: types.h:57
vlib_buffer_t buffer
Definition: dpdk.h:471
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
double f64
Definition: types.h:142
dpdk_device_type_t dev_type
Definition: dpdk.h:217
unsigned char u8
Definition: types.h:56
dpdk_efd_agent_t efd_agent
Definition: dpdk.h:268
#define DPDK_EFD_DROPALL_ENABLED
Definition: dpdk.h:299
u32 buffer_flags_template
Definition: dpdk.h:385
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:143
#define vlib_buffer_from_rte_mbuf(x)
Definition: buffer.h:383
u8 data[0]
Packet data.
Definition: buffer.h:151
#define vec_foreach(var, vec)
Vector iterator.
#define EFD_OPERATION_LESS_THAN
Definition: dpdk.h:567
u32 total_packet_cnt
Definition: dpdk.h:142
static vlib_buffer_free_list_t * vlib_buffer_get_free_list(vlib_main_t *vm, u32 free_list_index)
Definition: buffer_funcs.h:337
static void vlib_set_trace_count(vlib_main_t *vm, vlib_node_runtime_t *rt, u32 count)
Definition: trace_funcs.h:159
u32 is_efd_discardable(vlib_thread_main_t *tm, vlib_buffer_t *b0, struct rte_mbuf *mb)
Definition: node.c:233
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:67
CLIB_MULTIARCH_SELECT_FN(dpdk_input)
u32 flags
buffer flags: VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:85
static uword dpdk_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *f)
Main DPDK input node.
Definition: node.c:709
static vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:69
u16 enabled
Definition: threads.h:252
uword dpdk_input_efd(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *f)
Definition: node.c:761
vlib_efd_t efd
Definition: threads.h:305
Definition: defs.h:46
CLIB vectors are ubiquitous dynamically resized arrays with by user defined "headers".
dpdk_config_main_t * conf
Definition: dpdk.h:440
void set_efd_bitmap(u8 *bitmap, u32 value, u32 op)
Definition: node.c:854