FD.io VPP  v16.06
Vector Packet Processing
node.c
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #include <vnet/vnet.h>
16 #include <vppinfra/vec.h>
17 #include <vppinfra/error.h>
18 #include <vppinfra/format.h>
19 #include <vppinfra/xxhash.h>
20 
21 #include <vnet/ethernet/ethernet.h>
22 #include <vnet/devices/dpdk/dpdk.h>
23 #include <vnet/classify/vnet_classify.h>
24 #include <vnet/mpls-gre/packet.h>
25 
26 #include "dpdk_priv.h"
27 
28 #ifndef MAX
29 #define MAX(a,b) ((a) < (b) ? (b) : (a))
30 #endif
31 
32 #ifndef MIN
33 #define MIN(a,b) ((a) < (b) ? (a) : (b))
34 #endif
35 
36 /*
37  * At least in certain versions of ESXi, vmware e1000's don't honor the
38  * "strip rx CRC" bit. Set this flag to work around that bug FOR UNIT TEST ONLY.
39  *
40  * If wireshark complains like so:
41  *
42  * "Frame check sequence: 0x00000000 [incorrect, should be <hex-num>]"
43  * and you're using ESXi emulated e1000's, set this flag FOR UNIT TEST ONLY.
44  *
45  * Note: do NOT check in this file with this workaround enabled! You'll lose
46  * actual data from e.g. 10xGE interfaces. The extra 4 bytes annoy
47  * wireshark, but they're harmless...
48  */
49 #define VMWARE_LENGTH_BUG_WORKAROUND 0
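/*
 * Illustrative note: when the workaround is enabled, the RX paths below
 * simply trim the trailing 4-byte FCS that the emulated NIC failed to
 * strip, via the guard that appears in each input loop:
 *
 *     if (VMWARE_LENGTH_BUG_WORKAROUND)
 *       b0->current_length -= 4;
 */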
50 
51 typedef struct {
52  u32 cached_next_index;
53 
54  /* convenience variables */
55  vlib_main_t * vlib_main;
56  vnet_main_t * vnet_main;
57 } handoff_dispatch_main_t;
58 
59 typedef struct {
60  u32 buffer_index;
61  u32 next_index;
62  u32 sw_if_index;
63 } handoff_dispatch_trace_t;
64 
65 /* packet trace format function */
66 static u8 * format_handoff_dispatch_trace (u8 * s, va_list * args)
67 {
68  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
69  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
70  handoff_dispatch_trace_t * t = va_arg (*args, handoff_dispatch_trace_t *);
71 
72  s = format (s, "HANDOFF_DISPATCH: sw_if_index %d next_index %d buffer 0x%x",
73  t->sw_if_index,
74  t->next_index,
75  t->buffer_index);
76  return s;
77 }
78 
79 handoff_dispatch_main_t handoff_dispatch_main;
80 
81 vlib_node_registration_t handoff_dispatch_node;
82 
83 #define foreach_handoff_dispatch_error \
84 _(EXAMPLE, "example packets")
85 
86 typedef enum {
87 #define _(sym,str) HANDOFF_DISPATCH_ERROR_##sym,
88  foreach_handoff_dispatch_error
89 #undef _
90  HANDOFF_DISPATCH_N_ERROR,
91 } handoff_dispatch_error_t;
92 
93 static char * handoff_dispatch_error_strings[] = {
94 #define _(sym,string) string,
95  foreach_handoff_dispatch_error
96 #undef _
97 };
98 
99 static inline
100 void vlib_put_handoff_queue_elt (vlib_frame_queue_elt_t * hf)
101 {
102  CLIB_MEMORY_BARRIER();
103  hf->valid = 1;
104 }
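/*
 * Illustrative note: elt->valid is the producer/consumer handshake for a
 * handoff-queue element. vlib_get_handoff_queue_elt() below spins while
 * elt->valid is still set (the consumer has not yet drained the slot);
 * setting valid = 1 here publishes a filled element back to the consumer.
 */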
105 
106 static uword
107 handoff_dispatch_node_fn (vlib_main_t * vm,
108  vlib_node_runtime_t * node,
109  vlib_frame_t * frame)
110 {
111  u32 n_left_from, * from, * to_next;
112  dpdk_rx_next_t next_index;
113 
114  from = vlib_frame_vector_args (frame);
115  n_left_from = frame->n_vectors;
116  next_index = node->cached_next_index;
117 
118  while (n_left_from > 0)
119  {
120  u32 n_left_to_next;
121 
122  vlib_get_next_frame (vm, node, next_index,
123  to_next, n_left_to_next);
124 
125  while (n_left_from >= 4 && n_left_to_next >= 2)
126  {
127  u32 bi0, bi1;
128  vlib_buffer_t * b0, * b1;
129  u32 next0, next1;
130  u32 sw_if_index0, sw_if_index1;
131 
132  /* Prefetch next iteration. */
133  {
134  vlib_buffer_t * p2, * p3;
135 
136  p2 = vlib_get_buffer (vm, from[2]);
137  p3 = vlib_get_buffer (vm, from[3]);
138 
139  vlib_prefetch_buffer_header (p2, LOAD);
140  vlib_prefetch_buffer_header (p3, LOAD);
141  }
142 
143  /* speculatively enqueue b0 and b1 to the current next frame */
144  to_next[0] = bi0 = from[0];
145  to_next[1] = bi1 = from[1];
146  from += 2;
147  to_next += 2;
148  n_left_from -= 2;
149  n_left_to_next -= 2;
150 
151  b0 = vlib_get_buffer (vm, bi0);
152  b1 = vlib_get_buffer (vm, bi1);
153 
154  next0 = vnet_buffer(b0)->io_handoff.next_index;
155  next1 = vnet_buffer(b1)->io_handoff.next_index;
156 
157  if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
158  {
159  if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
160  {
161  vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
162  handoff_dispatch_trace_t *t =
163  vlib_add_trace (vm, node, b0, sizeof (*t));
164  sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
165  t->sw_if_index = sw_if_index0;
166  t->next_index = next0;
167  t->buffer_index = bi0;
168  }
169  if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
170  {
171  vlib_trace_buffer (vm, node, next1, b1, /* follow_chain */ 0);
172  handoff_dispatch_trace_t *t =
173  vlib_add_trace (vm, node, b1, sizeof (*t));
174  sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX];
175  t->sw_if_index = sw_if_index1;
176  t->next_index = next1;
177  t->buffer_index = bi1;
178  }
179  }
180 
181  /* verify speculative enqueues, maybe switch current next frame */
182  vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
183  to_next, n_left_to_next,
184  bi0, bi1, next0, next1);
185  }
186 
187  while (n_left_from > 0 && n_left_to_next > 0)
188  {
189  u32 bi0;
190  vlib_buffer_t * b0;
191  u32 next0;
192  u32 sw_if_index0;
193 
194  /* speculatively enqueue b0 to the current next frame */
195  bi0 = from[0];
196  to_next[0] = bi0;
197  from += 1;
198  to_next += 1;
199  n_left_from -= 1;
200  n_left_to_next -= 1;
201 
202  b0 = vlib_get_buffer (vm, bi0);
203 
204  next0 = vnet_buffer(b0)->io_handoff.next_index;
205 
206  if (PREDICT_FALSE(node->flags & VLIB_NODE_FLAG_TRACE))
207  {
208  if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
209  {
210  vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
211  handoff_dispatch_trace_t *t =
212  vlib_add_trace (vm, node, b0, sizeof (*t));
213  sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX];
214  t->sw_if_index = sw_if_index0;
215  t->next_index = next0;
216  t->buffer_index = bi0;
217  }
218  }
219 
220  /* verify speculative enqueue, maybe switch current next frame */
221  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
222  to_next, n_left_to_next,
223  bi0, next0);
224  }
225 
226  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
227  }
228 
229  return frame->n_vectors;
230 }
231 
232 VLIB_REGISTER_NODE (handoff_dispatch_node) = {
233  .function = handoff_dispatch_node_fn,
234  .name = "handoff-dispatch",
235  .vector_size = sizeof (u32),
236  .format_trace = format_handoff_dispatch_trace,
237  .flags = VLIB_NODE_FLAG_IS_HANDOFF,
238 
239 
240  .n_errors = ARRAY_LEN(handoff_dispatch_error_strings),
241  .error_strings = handoff_dispatch_error_strings,
242 
243  .n_next_nodes = DPDK_RX_N_NEXT,
244 
245  .next_nodes = {
246  [DPDK_RX_NEXT_DROP] = "error-drop",
247  [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
248  [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input",
249  [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
250  [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
251  },
252 };
253 
254 clib_error_t *handoff_dispatch_init (vlib_main_t * vm)
255 {
256  handoff_dispatch_main_t * mp = &handoff_dispatch_main;
257 
258  mp->vlib_main = vm;
259  mp->vnet_main = &vnet_main;
260 
261  return 0;
262 }
263 
264 VLIB_INIT_FUNCTION (handoff_dispatch_init);
265 
266 u32 dpdk_get_handoff_node_index (void)
267 {
268  return handoff_dispatch_node.index;
269 }
270 
271 static char * dpdk_error_strings[] = {
272 #define _(n,s) s,
273  foreach_dpdk_error
274 #undef _
275 };
276 
277 always_inline void
278 dpdk_rx_next_and_error_from_mb_flags_x1 (dpdk_device_t *xd, struct rte_mbuf *mb,
279  vlib_buffer_t *b0,
280  u8 * next0, u8 * error0)
281 {
282  u8 is0_ip4, is0_ip6, is0_mpls, n0;
283  uint16_t mb_flags = mb->ol_flags;
284 
285  if (PREDICT_FALSE(mb_flags & (
286 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
287  PKT_EXT_RX_PKT_ERROR | PKT_EXT_RX_BAD_FCS |
288 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
289  PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD
290  )))
291  {
292  /* some error was flagged. determine the drop reason */
293  n0 = DPDK_RX_NEXT_DROP;
294  *error0 =
295 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
296  (mb_flags & PKT_EXT_RX_PKT_ERROR) ? DPDK_ERROR_RX_PACKET_ERROR :
297  (mb_flags & PKT_EXT_RX_BAD_FCS) ? DPDK_ERROR_RX_BAD_FCS :
298 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
299  (mb_flags & PKT_RX_IP_CKSUM_BAD) ? DPDK_ERROR_IP_CHECKSUM_ERROR :
300  (mb_flags & PKT_RX_L4_CKSUM_BAD) ? DPDK_ERROR_L4_CHECKSUM_ERROR :
301  DPDK_ERROR_NONE;
302  }
303  else
304  {
305  *error0 = DPDK_ERROR_NONE;
306  if (PREDICT_FALSE(xd->per_interface_next_index != ~0))
307  n0 = xd->per_interface_next_index;
308  else if (PREDICT_FALSE(xd->vlan_subifs || (mb_flags & PKT_RX_VLAN_PKT)))
309  n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
310  else
311  {
312  n0 = DPDK_RX_NEXT_ETHERNET_INPUT;
313 #if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
314  is0_ip4 = RTE_ETH_IS_IPV4_HDR(mb->packet_type) != 0;
315 #else
316  is0_ip4 = (mb_flags & (PKT_RX_IPV4_HDR | PKT_RX_IPV4_HDR_EXT)) != 0;
317 #endif
318 
319  if (PREDICT_TRUE(is0_ip4))
320  n0 = DPDK_RX_NEXT_IP4_INPUT;
321  else
322  {
323 #if RTE_VERSION >= RTE_VERSION_NUM(2, 1, 0, 0)
324  is0_ip6 = RTE_ETH_IS_IPV6_HDR(mb->packet_type) != 0;
325 #else
326  is0_ip6 =
327  (mb_flags & (PKT_RX_IPV6_HDR | PKT_RX_IPV6_HDR_EXT)) != 0;
328 #endif
329  if (PREDICT_TRUE(is0_ip6))
330  n0 = DPDK_RX_NEXT_IP6_INPUT;
331  else
332  {
333  ethernet_header_t *h0 = (ethernet_header_t *) b0->data;
334  is0_mpls = (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST));
335  n0 = is0_mpls ? DPDK_RX_NEXT_MPLS_INPUT : n0;
336  }
337  }
338  }
339  }
340  *next0 = n0;
341 }
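/*
 * Classification order used above, highest to lowest priority: flagged rx
 * errors go to error-drop; a configured per-interface next node wins next;
 * VLAN subinterfaces or VLAN-tagged packets stay on ethernet-input; then
 * IP4/IP6 are detected from the mbuf packet type (or legacy ol_flags on
 * older DPDK), with an ethertype check as the MPLS fallback; anything else
 * remains on ethernet-input.
 */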
342 
343 void dpdk_rx_trace (dpdk_main_t * dm,
344  vlib_node_runtime_t * node,
345  dpdk_device_t * xd,
346  u16 queue_id,
347  u32 * buffers,
348  uword n_buffers)
349 {
350  vlib_main_t * vm = vlib_get_main();
351  u32 * b, n_left;
352  u8 next0;
353 
354  n_left = n_buffers;
355  b = buffers;
356 
357  while (n_left >= 1)
358  {
359  u32 bi0;
360  vlib_buffer_t * b0;
361  dpdk_rx_dma_trace_t * t0;
362  struct rte_mbuf *mb;
363  u8 error0;
364 
365  bi0 = b[0];
366  n_left -= 1;
367 
368  b0 = vlib_get_buffer (vm, bi0);
369  mb = rte_mbuf_from_vlib_buffer(b0);
370  dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
371  &next0, &error0);
372  vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0);
373  t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0]));
374  t0->queue_index = queue_id;
375  t0->device_index = xd->device_index;
376  t0->buffer_index = bi0;
377 
378  clib_memcpy (&t0->mb, mb, sizeof (t0->mb));
379  clib_memcpy (&t0->buffer, b0, sizeof (b0[0]) - sizeof (b0->pre_data));
380  clib_memcpy (t0->buffer.pre_data, b0->data, sizeof (t0->buffer.pre_data));
381 
382 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
383  /*
384  * Clear overloaded TX offload flags when a DPDK driver
385  * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
386  */
387  mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
388 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
389 
390  b += 1;
391  }
392 }
393 
394 /*
395  * dpdk_efd_update_counters()
396  * Update EFD (early-fast-discard) counters
397  */
398 void dpdk_efd_update_counters (dpdk_device_t *xd,
399  u32 n_buffers,
400  u16 enabled)
401 {
402  if (enabled & DPDK_EFD_MONITOR_ENABLED)
403  {
404  u64 now = clib_cpu_time_now();
405  if (xd->efd_agent.last_poll_time > 0)
406  {
407  u64 elapsed_time = (now - xd->efd_agent.last_poll_time);
408  if (elapsed_time > xd->efd_agent.max_poll_delay)
409  xd->efd_agent.max_poll_delay = elapsed_time;
410  }
411  xd->efd_agent.last_poll_time = now;
412  }
413 
414  xd->efd_agent.total_packet_cnt += n_buffers;
415  xd->efd_agent.last_burst_sz = n_buffers;
416 
417  if (n_buffers > xd->efd_agent.max_burst_sz)
418  xd->efd_agent.max_burst_sz = n_buffers;
419 
420  if (PREDICT_FALSE(n_buffers == VLIB_FRAME_SIZE))
421  {
422  xd->efd_agent.full_frames_cnt++;
423  xd->efd_agent.consec_full_frames_cnt++;
424  }
425  else
426  {
427  xd->efd_agent.consec_full_frames_cnt = 0;
428  }
429 }
430 
431 /* is_efd_discardable()
432  * Returns a non-zero DPDK error if the packet meets the early-fast-discard
433  * criteria, zero otherwise.
434  */
435 static inline u32 is_efd_discardable (vlib_thread_main_t *tm,
436  vlib_buffer_t * b0,
437  struct rte_mbuf *mb)
438 {
439  ethernet_header_t *eh = (ethernet_header_t *) b0->data;
440 
441  if (eh->type == clib_host_to_net_u16(ETHERNET_TYPE_IP4))
442  {
443  ip4_header_t *ipv4 =
444  (ip4_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
445  u8 pkt_prec = (ipv4->tos >> 5);
446 
447  return (tm->efd.ip_prec_bitmap & (1 << pkt_prec) ?
448  DPDK_ERROR_IPV4_EFD_DROP_PKTS : DPDK_ERROR_NONE);
449  }
450  else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_IP6))
451  {
452  ip6_header_t *ipv6 =
453  (ip6_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
454  u8 pkt_tclass =
455  ((ipv6->ip_version_traffic_class_and_flow_label >> 20) & 0xff);
456 
457  return (tm->efd.ip_prec_bitmap & (1 << pkt_tclass) ?
458  DPDK_ERROR_IPV6_EFD_DROP_PKTS : DPDK_ERROR_NONE);
459  }
460  else if (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_MPLS_UNICAST))
461  {
462  mpls_unicast_header_t *mpls =
463  (mpls_unicast_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
464  u8 pkt_exp = ((mpls->label_exp_s_ttl >> 9) & 0x07);
465 
466  return (tm->efd.mpls_exp_bitmap & (1 << pkt_exp) ?
467  DPDK_ERROR_MPLS_EFD_DROP_PKTS : DPDK_ERROR_NONE);
468  }
469  else if ((eh->type == clib_net_to_host_u16(ETHERNET_TYPE_VLAN)) ||
470  (eh->type == clib_net_to_host_u16(ETHERNET_TYPE_DOT1AD)))
471  {
472  ethernet_vlan_header_t *vlan =
473  (ethernet_vlan_header_t *)&(b0->data[sizeof(ethernet_header_t)]);
474  u8 pkt_cos = ((vlan->priority_cfi_and_id >> 13) & 0x07);
475 
476  return (tm->efd.vlan_cos_bitmap & (1 << pkt_cos) ?
477  DPDK_ERROR_VLAN_EFD_DROP_PKTS : DPDK_ERROR_NONE);
478  }
479 
480  return DPDK_ERROR_NONE;
481 }
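/*
 * Worked example for the IPv4 branch above: a TOS byte of 0xB8 (DSCP EF)
 * shifted right by 5 yields IP precedence 5, so bit 5 of
 * tm->efd.ip_prec_bitmap decides whether the packet is an EFD discard
 * candidate (set_efd_bitmap() at the end of this file shows how that
 * bitmap is built).
 */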
482 
483 /*
484  * This function is used when there are no worker threads.
485  * The main thread performs IO and forwards the packets.
486  */
487 static inline u32 dpdk_device_input ( dpdk_main_t * dm,
488  dpdk_device_t * xd,
489  vlib_node_runtime_t * node,
490  u32 cpu_index,
491  u16 queue_id)
492 {
493  u32 n_buffers;
494  u32 next_index = DPDK_RX_NEXT_ETHERNET_INPUT;
495  u32 n_left_to_next, * to_next;
496  u32 mb_index;
497  vlib_main_t * vm = vlib_get_main();
498  uword n_rx_bytes = 0;
499  u32 n_trace, trace_cnt __attribute__((unused));
500  vlib_buffer_free_list_t * fl;
501  u8 efd_discard_burst = 0;
502  u16 ip_align_offset = 0;
503  u32 buffer_flags_template;
504 
505  if (xd->admin_up == 0)
506  return 0;
507 
508  n_buffers = dpdk_rx_burst(dm, xd, queue_id);
509 
510  if (n_buffers == 0)
511  {
512  /* check if EFD (dpdk) is enabled */
513  if (PREDICT_FALSE(dm->efd.enabled))
514  {
515  /* reset a few stats */
516  xd->efd_agent.last_poll_time = 0;
517  xd->efd_agent.last_burst_sz = 0;
518  }
519  return 0;
520  }
521 
522  if (xd->pmd == VNET_DPDK_PMD_THUNDERX)
523  ip_align_offset = 6;
524 
525  buffer_flags_template = dm->buffer_flags_template;
526 
527  vec_reset_length (xd->d_trace_buffers);
528  trace_cnt = n_trace = vlib_get_trace_count (vm, node);
529 
530  fl = vlib_buffer_get_free_list (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
531 
532  /*
533  * DAW-FIXME: VMXNET3 device stop/start doesn't work,
534  * therefore fake the stop in the dpdk driver by
535  * silently dropping all of the incoming pkts instead of
536  * stopping the driver / hardware.
537  */
538  if (PREDICT_FALSE(xd->admin_up != 1))
539  {
540  for (mb_index = 0; mb_index < n_buffers; mb_index++)
541  rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
542 
543  return 0;
544  }
545 
546  /* Check for congestion if EFD (Early-Fast-Discard) is enabled
547  * in any mode (e.g. dpdk, monitor, or drop_all)
548  */
549  if (PREDICT_FALSE(dm->efd.enabled))
550  {
551  /* update EFD counters */
552  dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
553 
554  if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DISCARD_ENABLED))
555  {
556  /* discard all received packets */
557  for (mb_index = 0; mb_index < n_buffers; mb_index++)
558  rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
559 
560  xd->efd_agent.discard_cnt += n_buffers;
561  increment_efd_drop_counter(vm,
562  DPDK_ERROR_VLAN_EFD_DROP_PKTS,
563  n_buffers);
564 
565  return 0;
566  }
567 
568  if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
569  dm->efd.consec_full_frames_hi_thresh))
570  {
571  u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
572  queue_id);
573  if (device_queue_sz >= dm->efd.queue_hi_thresh)
574  {
575  /* dpdk device queue has reached the critical threshold */
576  xd->efd_agent.congestion_cnt++;
577 
578  /* apply EFD to packets from the burst */
579  efd_discard_burst = 1;
580  }
581  }
582  }
583 
584  mb_index = 0;
585 
586  while (n_buffers > 0)
587  {
588  u32 bi0;
589  u8 next0, error0;
590  u32 l3_offset0;
591  vlib_buffer_t * b0, * b_seg, * b_chain = 0;
592  u32 cntr_type;
593 
594  vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);
595 
596  while (n_buffers > 0 && n_left_to_next > 0)
597  {
598  u8 nb_seg = 1;
599  struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
600  struct rte_mbuf *mb_seg = mb->next;
601 
602  if (PREDICT_TRUE(n_buffers > 2))
603  {
604  struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
605  vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
606  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, STORE);
607  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
608  }
609 
610  ASSERT(mb);
611 
612  b0 = vlib_buffer_from_rte_mbuf(mb);
613 
614  /* check whether EFD is looking for packets to discard */
615  if (PREDICT_FALSE(efd_discard_burst))
616  {
617  vlib_thread_main_t *tm = vlib_get_thread_main();
618 
619  if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
620  {
621  rte_pktmbuf_free(mb);
622  xd->efd_agent.discard_cnt++;
623  increment_efd_drop_counter(vm,
624  cntr_type,
625  1);
626  n_buffers--;
627  mb_index++;
628  continue;
629  }
630  }
631 
632  /* Prefetch one next segment if it exists. */
633  if (PREDICT_FALSE(mb->nb_segs > 1))
634  {
635  struct rte_mbuf *pfmb = mb->next;
636  vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
637  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
638  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
639  b_chain = b0;
640  }
641 
642  vlib_buffer_init_for_free_list (b0, fl);
643  b0->clone_count = 0;
644 
645  bi0 = vlib_get_buffer_index (vm, b0);
646 
647  to_next[0] = bi0;
648  to_next++;
649  n_left_to_next--;
650 
651  dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
652  &next0, &error0);
653 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
654  /*
655  * Clear overloaded TX offload flags when a DPDK driver
656  * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
657  */
658 
659  if (PREDICT_TRUE(trace_cnt == 0))
660  mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
661  else
662  trace_cnt--;
663 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
664 
665  b0->error = node->errors[error0];
666 
667  l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
668  next0 == DPDK_RX_NEXT_IP6_INPUT ||
669  next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
670  sizeof (ethernet_header_t) : 0);
671 
672  b0->current_data = l3_offset0;
673  b0->current_length = mb->data_len - l3_offset0;
674 
675  if (PREDICT_FALSE (ip_align_offset != 0))
676  {
677  if (next0 == DPDK_RX_NEXT_IP4_INPUT ||
678  next0 == DPDK_RX_NEXT_IP6_INPUT)
679  b0->current_data += ip_align_offset;
680  }
681 
682  b0->flags = buffer_flags_template;
683 
684  if (VMWARE_LENGTH_BUG_WORKAROUND)
685  b0->current_length -= 4;
686 
687  vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
688  vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
689  n_rx_bytes += mb->pkt_len;
690 
691  /* Process subsequent segments of multi-segment packets */
692  while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
693  {
694  ASSERT(mb_seg != 0);
695 
696  b_seg = vlib_buffer_from_rte_mbuf(mb_seg);
697  vlib_buffer_init_for_free_list (b_seg, fl);
698  b_seg->clone_count = 0;
699 
700  ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
701  ASSERT(b_seg->current_data == 0);
702 
703  /*
704  * The driver (e.g. virtio) may not put the packet data at the start
705  * of the segment, so don't assume b_seg->current_data == 0 is correct.
706  */
707  b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
708 
709  b_seg->current_length = mb_seg->data_len;
710  b0->total_length_not_including_first_buffer +=
711  mb_seg->data_len;
712 
713  b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
714  b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
715 
716  b_chain = b_seg;
717  mb_seg = mb_seg->next;
718  nb_seg++;
719  }
720 
721  /*
722  * Turn this on if you run into
723  * "bad monkey" contexts, and you want to know exactly
724  * which nodes they've visited... See main.c...
725  */
726  VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
727 
728  vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
729  to_next, n_left_to_next,
730  bi0, next0);
731  if (PREDICT_FALSE (n_trace > mb_index))
732  vec_add1 (xd->d_trace_buffers, bi0);
733  n_buffers--;
734  mb_index++;
735  }
736  vlib_put_next_frame (vm, node, next_index, n_left_to_next);
737  }
738 
739  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
740  {
741  dpdk_rx_trace (dm, node, xd, queue_id, xd->d_trace_buffers,
742  vec_len (xd->d_trace_buffers));
743  vlib_set_trace_count (vm, node, n_trace - vec_len (xd->d_trace_buffers));
744  }
745 
746  vlib_increment_combined_counter
747  (vnet_get_main()->interface_main.combined_sw_if_counters
748  + VNET_INTERFACE_COUNTER_RX,
749  cpu_index,
750  xd->vlib_sw_if_index,
751  mb_index, n_rx_bytes);
752 
753  dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
754  dw->aggregate_rx_packets += mb_index;
755 
756  return mb_index;
757 }
758 
759 #if VIRL > 0
760 #define VIRL_SPEED_LIMIT() \
761  /* Limit the input rate to 1000 vectors / sec */ \
762  { \
763  struct timespec ts, tsrem; \
764  \
765  ts.tv_sec = 0; \
766  ts.tv_nsec = 1000*1000; /* 1ms */ \
767  \
768  while (nanosleep(&ts, &tsrem) < 0) \
769  { \
770  ts = tsrem; \
771  } \
772  }
773 #else
774 #define VIRL_SPEED_LIMIT()
775 #endif
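/*
 * With VIRL enabled, each invocation sleeps about 1 ms (restarting
 * nanosleep() with the remaining time if it is interrupted), so the input
 * nodes that call it below are throttled to roughly 1000 dispatches per
 * second, as the comment in the macro says.
 */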
776 
777 
778 static uword
779 dpdk_input (vlib_main_t * vm,
780  vlib_node_runtime_t * node,
781  vlib_frame_t * f)
782 {
783  dpdk_main_t * dm = &dpdk_main;
784  dpdk_device_t * xd;
785  uword n_rx_packets = 0;
786  dpdk_device_and_queue_t * dq;
787  u32 cpu_index = os_get_cpu_number();
788 
789  /*
790  * Poll all devices on this cpu for input/interrupts.
791  */
792  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
793  {
794  xd = vec_elt_at_index(dm->devices, dq->device);
795  ASSERT(dq->queue_id == 0);
796  n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, 0);
797  }
798 
799  VIRL_SPEED_LIMIT()
800 
801  return n_rx_packets;
802 }
803 
804 uword
805 dpdk_input_rss (vlib_main_t * vm,
806  vlib_node_runtime_t * node,
807  vlib_frame_t * f)
808 {
809  dpdk_main_t * dm = &dpdk_main;
810  dpdk_device_t * xd;
811  uword n_rx_packets = 0;
812  dpdk_device_and_queue_t * dq;
813  u32 cpu_index = os_get_cpu_number();
814 
815  /*
816  * Poll all devices on this cpu for input/interrupts.
817  */
818  vec_foreach (dq, dm->devices_by_cpu[cpu_index])
819  {
820  xd = vec_elt_at_index(dm->devices, dq->device);
821  n_rx_packets += dpdk_device_input (dm, xd, node, cpu_index, dq->queue_id);
822  }
823 
824  VIRL_SPEED_LIMIT()
825 
826  return n_rx_packets;
827 }
828 
829 VLIB_REGISTER_NODE (dpdk_input_node) = {
830  .function = dpdk_input,
831  .type = VLIB_NODE_TYPE_INPUT,
832  .name = "dpdk-input",
833 
834  /* Will be enabled if/when hardware is detected. */
835  .state = VLIB_NODE_STATE_DISABLED,
836 
837  .format_buffer = format_ethernet_header_with_length,
838  .format_trace = format_dpdk_rx_dma_trace,
839 
840  .n_errors = DPDK_N_ERROR,
841  .error_strings = dpdk_error_strings,
842 
843  .n_next_nodes = DPDK_RX_N_NEXT,
844  .next_nodes = {
845  [DPDK_RX_NEXT_DROP] = "error-drop",
846  [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
847  [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
848  [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
849  [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
850  },
851 };
852 
853 /*
854  * Override the next nodes for the dpdk input nodes.
855  * Must be invoked prior to VLIB_INIT_FUNCTION calls.
856  */
857 void dpdk_set_next_node (dpdk_rx_next_t next, char *name)
858 {
859  vlib_node_registration_t *r = &dpdk_input_node;
860  vlib_node_registration_t *r_io = &dpdk_io_input_node;
861  vlib_node_registration_t *r_handoff = &handoff_dispatch_node;
862 
863  switch (next)
864  {
865  case DPDK_RX_NEXT_IP4_INPUT:
866  case DPDK_RX_NEXT_IP6_INPUT:
867  case DPDK_RX_NEXT_MPLS_INPUT:
868  case DPDK_RX_NEXT_ETHERNET_INPUT:
869  r->next_nodes[next] = name;
870  r_io->next_nodes[next] = name;
871  r_handoff->next_nodes[next] = name;
872  break;
873 
874  default:
875  clib_warning ("%s: illegal next %d\n", __FUNCTION__, next);
876  break;
877  }
878 }
879 
880 inline vlib_frame_queue_elt_t *
881 vlib_get_handoff_queue_elt (u32 vlib_worker_index)
882 {
883  vlib_frame_queue_t *fq;
884  vlib_frame_queue_elt_t *elt;
885  u64 new_tail;
886 
887  fq = vlib_frame_queues[vlib_worker_index];
888  ASSERT (fq);
889 
890  new_tail = __sync_add_and_fetch (&fq->tail, 1);
891 
892  /* Wait until a ring slot is available */
893  while (new_tail >= fq->head_hint + fq->nelts)
894  vlib_worker_thread_barrier_check ();
895 
896  elt = fq->elts + (new_tail & (fq->nelts-1));
897 
898  /* this would be very bad... */
899  while (elt->valid)
900  ;
901 
902  elt->msg_type = VLIB_FRAME_QUEUE_ELT_DISPATCH_FRAME;
903  elt->last_n_vectors = elt->n_vectors = 0;
904 
905  return elt;
906 }
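/*
 * Note on the ring arithmetic above: slots are indexed with
 * new_tail & (fq->nelts - 1), which assumes fq->nelts is a power of two.
 * The tail slot is reserved first with an atomic add; the caller then
 * waits both for a free slot (head_hint) and for any previous occupant of
 * the slot to be consumed (elt->valid cleared).
 */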
907 
908 static inline vlib_frame_queue_elt_t *
909 dpdk_get_handoff_queue_elt (
910  u32 vlib_worker_index,
911  vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index)
912 {
913  vlib_frame_queue_elt_t *elt;
914 
915  if (handoff_queue_elt_by_worker_index [vlib_worker_index])
916  return handoff_queue_elt_by_worker_index [vlib_worker_index];
917 
918  elt = vlib_get_handoff_queue_elt (vlib_worker_index);
919 
920  handoff_queue_elt_by_worker_index [vlib_worker_index] = elt;
921 
922  return elt;
923 }
924 
925 static inline vlib_frame_queue_t *
926 is_vlib_handoff_queue_congested (
927  u32 vlib_worker_index,
928  u32 queue_hi_thresh,
929  vlib_frame_queue_t ** handoff_queue_by_worker_index)
930 {
931  vlib_frame_queue_t *fq;
932 
933  fq = handoff_queue_by_worker_index [vlib_worker_index];
934  if (fq != (vlib_frame_queue_t *)(~0))
935  return fq;
936 
937  fq = vlib_frame_queues[vlib_worker_index];
938  ASSERT (fq);
939 
940  if (PREDICT_FALSE(fq->tail >= (fq->head_hint + queue_hi_thresh))) {
941  /* a valid entry in the array will indicate the queue has reached
942  * the specified threshold and is congested
943  */
944  handoff_queue_by_worker_index [vlib_worker_index] = fq;
945  fq->enqueue_full_events++;
946  return fq;
947  }
948 
949  return NULL;
950 }
951 
952 static inline u64 ipv4_get_key (ip4_header_t *ip)
953 {
954  u64 hash_key;
955 
956  hash_key = *((u64*)(&ip->address_pair)) ^ ip->protocol;
957 
958  return hash_key;
959 }
960 
961 static inline u64 ipv6_get_key (ip6_header_t *ip)
962 {
963  u64 hash_key;
964 
965  hash_key = ip->src_address.as_u64[0] ^
966  rotate_left(ip->src_address.as_u64[1],13) ^
967  rotate_left(ip->dst_address.as_u64[0],26) ^
968  rotate_left(ip->dst_address.as_u64[1],39) ^
969  ip->protocol;
970 
971  return hash_key;
972 }
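/*
 * The rotate_left() staggering above is presumably there so the four
 * 64-bit address words do not cancel each other in the XOR when source
 * and destination addresses share long common prefixes.
 */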
973 
974 
975 #define MPLS_BOTTOM_OF_STACK_BIT_MASK 0x00000100U
976 #define MPLS_LABEL_MASK 0xFFFFF000U
977 
978 static inline u64 mpls_get_key (mpls_unicast_header_t *m)
979 {
980  u64 hash_key;
981  u8 ip_ver;
982 
983 
984  /* find the bottom of the MPLS label stack. */
985  if (PREDICT_TRUE(m->label_exp_s_ttl &
986  clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
987  goto bottom_lbl_found;
988  }
989  m++;
990 
991  if (PREDICT_TRUE(m->label_exp_s_ttl &
992  clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK))) {
993  goto bottom_lbl_found;
994  }
995  m++;
996 
997  if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
998  goto bottom_lbl_found;
999  }
1000  m++;
1001 
1002  if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
1003  goto bottom_lbl_found;
1004  }
1005  m++;
1006 
1007  if (m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_BOTTOM_OF_STACK_BIT_MASK)) {
1008  goto bottom_lbl_found;
1009  }
1010 
1011  /* the bottom label was not found - use the last label */
1012  hash_key = m->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);
1013 
1014  return hash_key;
1015 
1016 
1017 bottom_lbl_found:
1018  m++;
1019  ip_ver = (*((u8 *)m) >> 4);
1020 
1021  /* find out if it is IPV4 or IPV6 header */
1022  if (PREDICT_TRUE(ip_ver == 4)) {
1023  hash_key = ipv4_get_key((ip4_header_t *)m);
1024  } else if (PREDICT_TRUE(ip_ver == 6)) {
1025  hash_key = ipv6_get_key((ip6_header_t *)m);
1026  } else {
1027  /* use the bottom label */
1028  hash_key = (m-1)->label_exp_s_ttl & clib_net_to_host_u32(MPLS_LABEL_MASK);
1029  }
1030 
1031  return hash_key;
1032 
1033 }
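/*
 * The unrolled scan above examines at most five labels for the
 * bottom-of-stack (S) bit. If it is not found, the last label examined is
 * hashed instead; when it is found, the first nibble of the header
 * following the stack is sniffed to hash the inner IPv4/IPv6 packet,
 * falling back to the bottom label for anything else.
 */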
1034 
1035 static inline u64 eth_get_key (ethernet_header_t *h0)
1036 {
1037  u64 hash_key;
1038 
1039 
1040  if (PREDICT_TRUE(h0->type) == clib_host_to_net_u16(ETHERNET_TYPE_IP4)) {
1041  hash_key = ipv4_get_key((ip4_header_t *)(h0+1));
1042  } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_IP6)) {
1043  hash_key = ipv6_get_key((ip6_header_t *)(h0+1));
1044  } else if (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
1045  hash_key = mpls_get_key((mpls_unicast_header_t *)(h0+1));
1046  } else if ((h0->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ||
1047  (h0->type == clib_host_to_net_u16(ETHERNET_TYPE_DOT1AD))) {
1048  ethernet_vlan_header_t * outer = (ethernet_vlan_header_t *)(h0 + 1);
1049 
1050  outer = (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_VLAN)) ?
1051  outer+1 : outer;
1052  if (PREDICT_TRUE(outer->type) == clib_host_to_net_u16(ETHERNET_TYPE_IP4)) {
1053  hash_key = ipv4_get_key((ip4_header_t *)(outer+1));
1054  } else if (outer->type == clib_host_to_net_u16 (ETHERNET_TYPE_IP6)) {
1055  hash_key = ipv6_get_key((ip6_header_t *)(outer+1));
1056  } else if (outer->type == clib_host_to_net_u16(ETHERNET_TYPE_MPLS_UNICAST)) {
1057  hash_key = mpls_get_key((mpls_unicast_header_t *)(outer+1));
1058  } else {
1059  hash_key = outer->type;
1060  }
1061  } else {
1062  hash_key = 0;
1063  }
1064 
1065  return hash_key;
1066 }
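/*
 * Sketch (taken from the IO-thread loops below) of how this key feeds the
 * worker load-balance decision:
 *
 *     hash = (u32) clib_xxhash (eth_get_key (h0));
 *     if (is_pow2 (num_workers))
 *       next_worker_index = first_worker_index + (hash & (num_workers - 1));
 *     else
 *       next_worker_index = first_worker_index + (hash % num_workers);
 */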
1067 
1068 /*
1069  * This function is used when dedicated IO threads feed the worker threads.
1070  *
1071  * Devices are allocated to this thread based on instances and instance_id.
1072  * If instances==0 then the function automatically determines the number
1073  * of instances of this thread, and allocates devices between them.
1074  * If instances != 0, then instance_id must be in the range 0..instances-1.
1075  * The function allocates devices among the specified number of instances,
1076  * with this thread having the given instance id. This option is used for
1077  * splitting devices among differently named "io"-type threads.
1078  */
1079 void dpdk_io_thread (vlib_worker_thread_t * w,
1080  u32 instances,
1081  u32 instance_id,
1082  char *worker_name,
1083  dpdk_io_thread_callback_t callback)
1084 {
1085  vlib_main_t * vm = vlib_get_main();
1086  vlib_thread_main_t * tm = vlib_get_thread_main();
1087  vlib_thread_registration_t * tr;
1088  dpdk_main_t * dm = &dpdk_main;
1089  char *io_name = w->registration->name;
1090  dpdk_device_t * xd;
1091  dpdk_device_t ** my_devices = 0;
1092  vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index = 0;
1093  vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
1094  vlib_frame_queue_elt_t * hf = 0;
1095  int i;
1096  u32 n_left_to_next_worker = 0, * to_next_worker = 0;
1097  u32 next_worker_index = 0;
1098  u32 current_worker_index = ~0;
1099  u32 cpu_index = os_get_cpu_number();
1100  u32 num_workers = 0;
1101  u32 num_devices = 0;
1102  uword * p;
1103  u16 queue_id = 0;
1104  vlib_node_runtime_t * node_trace = 0;
1105  u32 first_worker_index = 0;
1106  u32 buffer_flags_template;
1107 
1108  /* Wait until the dpdk init sequence is complete */
1109  while (dm->io_thread_release == 0)
1110  vlib_worker_thread_barrier_check ();
1111 
1112  clib_time_init (&vm->clib_time);
1113 
1114  p = hash_get_mem (tm->thread_registrations_by_name, worker_name);
1115  ASSERT (p);
1116  tr = (vlib_thread_registration_t *) p[0];
1117  if (tr)
1118  {
1119  num_workers = tr->count;
1120  first_worker_index = tr->first_index;
1121  }
1122 
1123  /* Allocate devices to this thread */
1124  if (instances == 0)
1125  {
1126  /* auto-assign */
1127  instance_id = w->instance_id;
1128 
1129  p = hash_get_mem (tm->thread_registrations_by_name, io_name);
1130  tr = (vlib_thread_registration_t *) p[0];
1131  /* Otherwise, how did we get here */
1132  ASSERT (tr && tr->count);
1133  instances = tr->count;
1134  }
1135  else
1136  {
1137  /* manually assign */
1138  ASSERT (instance_id < instances);
1139  }
1140 
1141  vec_validate (handoff_queue_elt_by_worker_index,
1142  first_worker_index + num_workers - 1);
1143 
1144  vec_validate_init_empty (congested_handoff_queue_by_worker_index,
1145  first_worker_index + num_workers - 1,
1146  (vlib_frame_queue_t *)(~0));
1147 
1148  buffer_flags_template = dm->buffer_flags_template;
1149 
1150  /* And handle them... */
1151  while (1)
1152  {
1153  u32 n_buffers;
1154  u32 mb_index;
1155  uword n_rx_bytes = 0;
1156  u32 n_trace, trace_cnt __attribute__((unused));
1157  vlib_buffer_free_list_t * fl;
1158  u32 hash;
1159  u64 hash_key;
1160  u8 efd_discard_burst;
1161 
1162  vlib_worker_thread_barrier_check ();
1163 
1164  /* Invoke callback if supplied */
1165  if (PREDICT_FALSE(callback != NULL))
1166  callback(vm);
1167 
1168  if (PREDICT_FALSE(vec_len(dm->devices) != num_devices))
1169  {
1170  vec_reset_length(my_devices);
1171  vec_foreach (xd, dm->devices)
1172  {
1173  if (((xd - dm->devices) % tr->count) == instance_id)
1174  {
1175  fprintf(stderr, "i/o thread %d (cpu %d) takes port %d\n",
1176  instance_id, (int) os_get_cpu_number(), (int) (xd - dm->devices));
1177  vec_add1 (my_devices, xd);
1178  }
1179  }
1180  num_devices = vec_len(dm->devices);
1181  }
1182 
1183  for (i = 0; i < vec_len (my_devices); i++)
1184  {
1185  xd = my_devices[i];
1186 
1187  if (!xd->admin_up)
1188  continue;
1189 
1190  n_buffers = dpdk_rx_burst(dm, xd, 0 /* queue_id */);
1191 
1192  if (n_buffers == 0)
1193  {
1194  /* check if EFD (dpdk) is enabled */
1195  if (PREDICT_FALSE(dm->efd.enabled))
1196  {
1197  /* reset a few stats */
1198  xd->efd_agent.last_poll_time = 0;
1199  xd->efd_agent.last_burst_sz = 0;
1200  }
1201  continue;
1202  }
1203 
1204  trace_cnt = n_trace = 0;
1205  if (PREDICT_FALSE(vm->trace_main.trace_active_hint))
1206  {
1207  /*
1208  * packet tracing is triggered on the dpdk-input node for
1209  * ease-of-use. Re-fetch the node_runtime for dpdk-input
1210  * in case it has changed.
1211  */
1212  node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);
1213 
1214  vec_reset_length (xd->d_trace_buffers);
1215  trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
1216  }
1217 
1218  /*
1219  * DAW-FIXME: VMXNET3 device stop/start doesn't work,
1220  * therefore fake the stop in the dpdk driver by
1221  * silently dropping all of the incoming pkts instead of
1222  * stopping the driver / hardware.
1223  */
1224  if (PREDICT_FALSE(xd->admin_up != 1))
1225  {
1226  for (mb_index = 0; mb_index < n_buffers; mb_index++)
1227  rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
1228  continue;
1229  }
1230 
1231  /* reset EFD action for the burst */
1232  efd_discard_burst = 0;
1233 
1234  /* Check for congestion if EFD (Early-Fast-Discard) is enabled
1235  * in any mode (e.g. dpdk, monitor, or drop_all)
1236  */
1237  if (PREDICT_FALSE(dm->efd.enabled))
1238  {
1239  /* update EFD counters */
1240  dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
1241 
1242  if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DISCARD_ENABLED))
1243  {
1244  /* drop all received packets */
1245  for (mb_index = 0; mb_index < n_buffers; mb_index++)
1246  rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
1247 
1248  xd->efd_agent.discard_cnt += n_buffers;
1249  increment_efd_drop_counter(vm,
1250  DPDK_ERROR_VLAN_EFD_DROP_PKTS,
1251  n_buffers);
1252 
1253  continue;
1254  }
1255 
1256  if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
1257  dm->efd.consec_full_frames_hi_thresh))
1258  {
1259  u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
1260  queue_id);
1261  if (device_queue_sz >= dm->efd.queue_hi_thresh)
1262  {
1263  /* dpdk device queue has reached the critical threshold */
1264  xd->efd_agent.congestion_cnt++;
1265 
1266  /* apply EFD to packets from the burst */
1267  efd_discard_burst = 1;
1268  }
1269  }
1270  }
1271 
1272  fl = vlib_buffer_get_free_list
1273  (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
1274 
1275  mb_index = 0;
1276 
1277  while (n_buffers > 0)
1278  {
1279  u32 bi0;
1280  u8 next0, error0;
1281  u32 l3_offset0;
1282  vlib_buffer_t * b0, * b_seg, * b_chain = 0;
1283  ethernet_header_t * h0;
1284  u8 nb_seg = 1;
1285  struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
1286  struct rte_mbuf *mb_seg = mb->next;
1287 
1288  if (PREDICT_TRUE(n_buffers > 1))
1289  {
1290  struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
1291  vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1292  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1293  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1294  CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
1295  }
1296 
1297  b0 = vlib_buffer_from_rte_mbuf(mb);
1298 
1299  /* check whether EFD is looking for packets to discard */
1300  if (PREDICT_FALSE(efd_discard_burst))
1301  {
1302  u32 cntr_type;
1303  if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
1304  {
1305  rte_pktmbuf_free(mb);
1306  xd->efd_agent.discard_cnt++;
1308  cntr_type,
1309  1);
1310 
1311  n_buffers--;
1312  mb_index++;
1313  continue;
1314  }
1315  }
1316 
1317  /* Prefetch one next segment if it exists */
1318  if (PREDICT_FALSE(mb->nb_segs > 1))
1319  {
1320  struct rte_mbuf *pfmb = mb->next;
1321  vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1322  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1323  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1324  b_chain = b0;
1325  }
1326 
1327  bi0 = vlib_get_buffer_index (vm, b0);
1328  vlib_buffer_init_for_free_list (b0, fl);
1329  b0->clone_count = 0;
1330 
1331  dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
1332  &next0, &error0);
1333 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
1334  /*
1335  * Clear overloaded TX offload flags when a DPDK driver
1336  * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
1337  */
1338  if (PREDICT_TRUE(trace_cnt == 0))
1339  mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
1340  else
1341  trace_cnt--;
1342 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
1343 
1344  if (error0)
1345  clib_warning ("bi %d error %d", bi0, error0);
1346 
1347  b0->error = 0;
1348 
1349  l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
1350  next0 == DPDK_RX_NEXT_IP6_INPUT ||
1351  next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
1352  sizeof (ethernet_header_t) : 0);
1353 
1354  b0->current_data = l3_offset0;
1355  b0->current_length = mb->data_len - l3_offset0;
1356 
1357  b0->flags = buffer_flags_template;
1358 
1359  if (VMWARE_LENGTH_BUG_WORKAROUND)
1360  b0->current_length -= 4;
1361 
1362  vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
1363  vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
1364  vnet_buffer(b0)->io_handoff.next_index = next0;
1365  n_rx_bytes += mb->pkt_len;
1366 
1367  /* Process subsequent segments of multi-segment packets */
1368  while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
1369  {
1370  ASSERT(mb_seg != 0);
1371 
1372  b_seg = vlib_buffer_from_rte_mbuf(mb_seg);
1373  vlib_buffer_init_for_free_list (b_seg, fl);
1374  b_seg->clone_count = 0;
1375 
1376  ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
1377  ASSERT(b_seg->current_data == 0);
1378 
1379  /*
1380  * The driver (e.g. virtio) may not put the packet data at the start
1381  * of the segment, so don't assume b_seg->current_data == 0 is correct.
1382  */
1383  b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
1384 
1385  b_seg->current_length = mb_seg->data_len;
1386  b0->total_length_not_including_first_buffer +=
1387  mb_seg->data_len;
1388 
1389  b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
1390  b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
1391 
1392  b_chain = b_seg;
1393  mb_seg = mb_seg->next;
1394  nb_seg++;
1395  }
1396 
1397  /*
1398  * Turn this on if you run into
1399  * "bad monkey" contexts, and you want to know exactly
1400  * which nodes they've visited... See main.c...
1401  */
1402  VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
1403 
1404  if (PREDICT_FALSE (n_trace > mb_index))
1405  vec_add1 (xd->d_trace_buffers, bi0);
1406 
1407  next_worker_index = first_worker_index;
1408 
1409  /*
1410  * Force unknown traffic onto worker 0,
1411  * and into ethernet-input. $$$$ add more hashes.
1412  */
1413  h0 = (ethernet_header_t *) b0->data;
1414 
1415  /* Compute ingress LB hash */
1416  hash_key = eth_get_key(h0);
1417  hash = (u32)clib_xxhash(hash_key);
1418 
1419  if (PREDICT_TRUE (is_pow2(num_workers)))
1420  next_worker_index += hash & (num_workers - 1);
1421  else
1422  next_worker_index += hash % num_workers;
1423 
1424  /* if EFD is enabled and not already discarding from dpdk,
1425  * check the worker ring/queue for congestion
1426  */
1427  if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
1428  {
1429  vlib_frame_queue_t *fq;
1430 
1431  /* fq will be valid if the ring is congested */
1432  fq = is_vlib_handoff_queue_congested(
1433  next_worker_index, tm->efd.queue_hi_thresh,
1434  congested_handoff_queue_by_worker_index);
1435 
1436  if (PREDICT_FALSE(fq != NULL))
1437  {
1438  u32 cntr_type;
1439  if (PREDICT_TRUE(cntr_type =
1440  is_efd_discardable(tm, b0, mb)))
1441  {
1442  /* discard the packet */
1443  fq->enqueue_efd_discards++;
1444  increment_efd_drop_counter(vm, cntr_type, 1);
1445  rte_pktmbuf_free(mb);
1446  n_buffers--;
1447  mb_index++;
1448  continue;
1449  }
1450  }
1451  }
1452 
1453  if (next_worker_index != current_worker_index)
1454  {
1455  if (hf)
1456  hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1457 
1458  hf = dpdk_get_handoff_queue_elt(
1459  next_worker_index,
1460  handoff_queue_elt_by_worker_index);
1461 
1462  n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
1463  to_next_worker = &hf->buffer_index[hf->n_vectors];
1464  current_worker_index = next_worker_index;
1465  }
1466 
1467  /* enqueue to correct worker thread */
1468  to_next_worker[0] = bi0;
1469  to_next_worker++;
1470  n_left_to_next_worker--;
1471 
1472  if (n_left_to_next_worker == 0)
1473  {
1474  hf->n_vectors = VLIB_FRAME_SIZE;
1475  vlib_put_handoff_queue_elt(hf);
1476  current_worker_index = ~0;
1477  handoff_queue_elt_by_worker_index[next_worker_index] = 0;
1478  hf = 0;
1479  }
1480 
1481  n_buffers--;
1482  mb_index++;
1483  }
1484 
1485  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
1486  {
1487  /* credit the trace to the trace node */
1488  dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
1489  vec_len (xd->d_trace_buffers));
1490  vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
1491  }
1492 
1493  vlib_increment_combined_counter
1494  (vnet_get_main()->interface_main.combined_sw_if_counters
1495  + VNET_INTERFACE_COUNTER_RX,
1496  cpu_index,
1497  xd->vlib_sw_if_index,
1498  mb_index, n_rx_bytes);
1499 
1500  dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
1501  dw->aggregate_rx_packets += mb_index;
1502  }
1503 
1504  if (hf)
1505  hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1506 
1507  /* Ship frames to the worker nodes */
1508  for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
1509  {
1510  if (handoff_queue_elt_by_worker_index[i])
1511  {
1512  hf = handoff_queue_elt_by_worker_index[i];
1513  /*
1514  * It works better to let the handoff node
1515  * rate-adapt, always ship the handoff queue element.
1516  */
1517  if (1 || hf->n_vectors == hf->last_n_vectors)
1518  {
1519  vlib_put_handoff_queue_elt(hf);
1520  handoff_queue_elt_by_worker_index[i] = 0;
1521  }
1522  else
1523  hf->last_n_vectors = hf->n_vectors;
1524  }
1525  congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
1526  }
1527  hf = 0;
1528  current_worker_index = ~0;
1529 
1530  vlib_increment_main_loop_counter (vm);
1531  }
1532 }
1533 
1534 /*
1535  * This function is used when the main thread performs IO and feeds the
1536  * worker threads.
1537  */
1538 static uword
1539 dpdk_io_input (vlib_main_t * vm,
1540  vlib_node_runtime_t * node,
1541  vlib_frame_t * f)
1542 {
1543  dpdk_main_t * dm = &dpdk_main;
1544  dpdk_device_t * xd;
1545  vlib_thread_main_t * tm = vlib_get_thread_main();
1546  uword n_rx_packets = 0;
1547  static vlib_frame_queue_elt_t ** handoff_queue_elt_by_worker_index;
1548  static vlib_frame_queue_t ** congested_handoff_queue_by_worker_index = 0;
1549  vlib_frame_queue_elt_t * hf = 0;
1550  int i;
1551  u32 n_left_to_next_worker = 0, * to_next_worker = 0;
1552  u32 next_worker_index = 0;
1553  u32 current_worker_index = ~0;
1554  u32 cpu_index = os_get_cpu_number();
1555  static int num_workers_set;
1556  static u32 num_workers;
1557  u16 queue_id = 0;
1558  vlib_node_runtime_t * node_trace;
1559  static u32 first_worker_index;
1560  u32 buffer_flags_template;
1561 
1562  if (PREDICT_FALSE(num_workers_set == 0))
1563  {
1564  uword * p;
1565  vlib_thread_registration_t * tr;
1566  /* Only the standard vnet worker threads are supported */
1567  p = hash_get_mem (tm->thread_registrations_by_name, "workers");
1568  tr = (vlib_thread_registration_t *) p[0];
1569  if (tr)
1570  {
1571  num_workers = tr->count;
1572  first_worker_index = tr->first_index;
1573  }
1574  num_workers_set = 1;
1575  }
1576 
1577  if (PREDICT_FALSE(handoff_queue_elt_by_worker_index == 0))
1578  {
1579  vec_validate (handoff_queue_elt_by_worker_index, tm->n_vlib_mains - 1);
1580 
1581  vec_validate_init_empty (congested_handoff_queue_by_worker_index,
1582  first_worker_index + num_workers - 1,
1583  (vlib_frame_queue_t *)(~0));
1584  }
1585 
1586  /* packet tracing is triggered on the dpdk-input node for ease-of-use */
1587  node_trace = vlib_node_get_runtime (vm, dpdk_input_node.index);
1588 
1589  buffer_flags_template = dm->buffer_flags_template;
1590 
1591  vec_foreach (xd, dm->devices)
1592  {
1593  u32 n_buffers;
1594  u32 mb_index;
1595  uword n_rx_bytes = 0;
1596  u32 n_trace, trace_cnt __attribute__((unused));
1597  vlib_buffer_free_list_t * fl;
1598  u32 hash;
1599  u64 hash_key;
1600  u8 efd_discard_burst = 0;
1601 
1602  if (!xd->admin_up)
1603  continue;
1604 
1605  n_buffers = dpdk_rx_burst(dm, xd, queue_id );
1606 
1607  if (n_buffers == 0)
1608  {
1609  /* check if EFD (dpdk) is enabled */
1610  if (PREDICT_FALSE(dm->efd.enabled))
1611  {
1612  /* reset a few stats */
1613  xd->efd_agent.last_poll_time = 0;
1614  xd->efd_agent.last_burst_sz = 0;
1615  }
1616  continue;
1617  }
1618 
1619  vec_reset_length (xd->d_trace_buffers);
1620  trace_cnt = n_trace = vlib_get_trace_count (vm, node_trace);
1621 
1622  /*
1623  * DAW-FIXME: VMXNET3 device stop/start doesn't work,
1624  * therefore fake the stop in the dpdk driver by
1625  * silently dropping all of the incoming pkts instead of
1626  * stopping the driver / hardware.
1627  */
1628  if (PREDICT_FALSE(xd->admin_up != 1))
1629  {
1630  for (mb_index = 0; mb_index < n_buffers; mb_index++)
1631  rte_pktmbuf_free (xd->rx_vectors[queue_id][mb_index]);
1632  continue;
1633  }
1634 
1635  /* Check for congestion if EFD (Early-Fast-Discard) is enabled
1636  * in any mode (e.g. dpdk, monitor, or drop_all)
1637  */
1638  if (PREDICT_FALSE(dm->efd.enabled))
1639  {
1640  /* update EFD counters */
1641  dpdk_efd_update_counters(xd, n_buffers, dm->efd.enabled);
1642 
1643  if (PREDICT_FALSE(dm->efd.enabled & DPDK_EFD_DISCARD_ENABLED))
1644  {
1645  /* discard all received packets */
1646  for (mb_index = 0; mb_index < n_buffers; mb_index++)
1647  rte_pktmbuf_free(xd->rx_vectors[queue_id][mb_index]);
1648 
1649  xd->efd_agent.discard_cnt += n_buffers;
1650  increment_efd_drop_counter(vm,
1651  DPDK_ERROR_VLAN_EFD_DROP_PKTS,
1652  n_buffers);
1653 
1654  continue;
1655  }
1656 
1657  if (PREDICT_FALSE(xd->efd_agent.consec_full_frames_cnt >=
1658  dm->efd.consec_full_frames_hi_thresh))
1659  {
1660  u32 device_queue_sz = rte_eth_rx_queue_count(xd->device_index,
1661  queue_id);
1662  if (device_queue_sz >= dm->efd.queue_hi_thresh)
1663  {
1664  /* dpdk device queue has reached the critical threshold */
1665  xd->efd_agent.congestion_cnt++;
1666 
1667  /* apply EFD to packets from the burst */
1668  efd_discard_burst = 1;
1669  }
1670  }
1671  }
1672 
1673  fl = vlib_buffer_get_free_list
1674  (vm, VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX);
1675 
1676  mb_index = 0;
1677 
1678  while (n_buffers > 0)
1679  {
1680  u32 bi0;
1681  u8 next0, error0;
1682  u32 l3_offset0;
1683  vlib_buffer_t * b0, * b_seg, * b_chain = 0;
1684  ethernet_header_t * h0;
1685  u8 nb_seg = 1;
1686  struct rte_mbuf *mb = xd->rx_vectors[queue_id][mb_index];
1687  struct rte_mbuf *mb_seg = mb->next;
1688 
1689  if (PREDICT_TRUE(n_buffers > 1))
1690  {
1691  struct rte_mbuf *pfmb = xd->rx_vectors[queue_id][mb_index+2];
1692  vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1693  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1694  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1695  CLIB_PREFETCH (bp->data, CLIB_CACHE_LINE_BYTES, LOAD);
1696  }
1697 
1698  b0 = vlib_buffer_from_rte_mbuf(mb);
1699 
1700  /* check whether EFD is looking for packets to discard */
1701  if (PREDICT_FALSE(efd_discard_burst))
1702  {
1703  u32 cntr_type;
1704  if (PREDICT_TRUE(cntr_type = is_efd_discardable(tm, b0, mb)))
1705  {
1706  rte_pktmbuf_free(mb);
1707  xd->efd_agent.discard_cnt++;
1708  increment_efd_drop_counter(vm,
1709  cntr_type,
1710  1);
1711 
1712  n_buffers--;
1713  mb_index++;
1714  continue;
1715  }
1716  }
1717 
1718  /* Prefetch one next segment if it exists */
1719  if (PREDICT_FALSE(mb->nb_segs > 1))
1720  {
1721  struct rte_mbuf *pfmb = mb->next;
1722  vlib_buffer_t *bp = vlib_buffer_from_rte_mbuf(pfmb);
1723  CLIB_PREFETCH (pfmb, CLIB_CACHE_LINE_BYTES, LOAD);
1724  CLIB_PREFETCH (bp, CLIB_CACHE_LINE_BYTES, STORE);
1725  b_chain = b0;
1726  }
1727 
1728  bi0 = vlib_get_buffer_index (vm, b0);
1729  vlib_buffer_init_for_free_list (b0, fl);
1730  b0->clone_count = 0;
1731 
1732  dpdk_rx_next_and_error_from_mb_flags_x1 (xd, mb, b0,
1733  &next0, &error0);
1734 #ifdef RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS
1735  /*
1736  * Clear overloaded TX offload flags when a DPDK driver
1737  * is using them for RX flags (e.g. Cisco VIC Ethernet driver)
1738  */
1739  if (PREDICT_TRUE(trace_cnt == 0))
1740  mb->ol_flags &= PKT_EXT_RX_CLR_TX_FLAGS_MASK;
1741  else
1742  trace_cnt--;
1743 #endif /* RTE_LIBRTE_MBUF_EXT_RX_OLFLAGS */
1744 
1745  if (error0)
1746  clib_warning ("bi %d error %d", bi0, error0);
1747 
1748  b0->error = 0;
1749 
1750  l3_offset0 = ((next0 == DPDK_RX_NEXT_IP4_INPUT ||
1751  next0 == DPDK_RX_NEXT_IP6_INPUT ||
1752  next0 == DPDK_RX_NEXT_MPLS_INPUT) ?
1753  sizeof (ethernet_header_t) : 0);
1754 
1755  b0->current_data = l3_offset0;
1756  b0->current_length = mb->data_len - l3_offset0;
1757 
1758  b0->flags = buffer_flags_template;
1759 
1760  if (VMWARE_LENGTH_BUG_WORKAROUND)
1761  b0->current_length -= 4;
1762 
1763  vnet_buffer(b0)->sw_if_index[VLIB_RX] = xd->vlib_sw_if_index;
1764  vnet_buffer(b0)->sw_if_index[VLIB_TX] = (u32)~0;
1765  vnet_buffer(b0)->io_handoff.next_index = next0;
1766  n_rx_bytes += mb->pkt_len;
1767 
1768  /* Process subsequent segments of multi-segment packets */
1769  while ((mb->nb_segs > 1) && (nb_seg < mb->nb_segs))
1770  {
1771  ASSERT(mb_seg != 0);
1772 
1773  b_seg = vlib_buffer_from_rte_mbuf(mb_seg);
1774  vlib_buffer_init_for_free_list (b_seg, fl);
1775  b_seg->clone_count = 0;
1776 
1777  ASSERT((b_seg->flags & VLIB_BUFFER_NEXT_PRESENT) == 0);
1778  ASSERT(b_seg->current_data == 0);
1779 
1780  /*
1781  * The driver (e.g. virtio) may not put the packet data at the start
1782  * of the segment, so don't assume b_seg->current_data == 0 is correct.
1783  */
1784  b_seg->current_data = (mb_seg->buf_addr + mb_seg->data_off) - (void *)b_seg->data;
1785 
1786  b_seg->current_length = mb_seg->data_len;
1787  b0->total_length_not_including_first_buffer +=
1788  mb_seg->data_len;
1789 
1790  b_chain->flags |= VLIB_BUFFER_NEXT_PRESENT;
1791  b_chain->next_buffer = vlib_get_buffer_index (vm, b_seg);
1792 
1793  b_chain = b_seg;
1794  mb_seg = mb_seg->next;
1795  nb_seg++;
1796  }
1797 
1798  /*
1799  * Turn this on if you run into
1800  * "bad monkey" contexts, and you want to know exactly
1801  * which nodes they've visited... See main.c...
1802  */
1803  VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b0);
1804 
1805  if (PREDICT_FALSE (n_trace > mb_index))
1806  vec_add1 (xd->d_trace_buffers, bi0);
1807 
1808  next_worker_index = first_worker_index;
1809 
1810  /*
1811  * Force unknown traffic onto worker 0,
1812  * and into ethernet-input. $$$$ add more hashes.
1813  */
1814  h0 = (ethernet_header_t *) b0->data;
1815 
1816  /* Compute ingress LB hash */
1817  hash_key = eth_get_key(h0);
1818  hash = (u32)clib_xxhash(hash_key);
1819 
1820  if (PREDICT_TRUE (is_pow2(num_workers)))
1821  next_worker_index += hash & (num_workers - 1);
1822  else
1823  next_worker_index += hash % num_workers;
1824 
1825  /* if EFD is enabled and not already discarding from dpdk,
1826  * check the worker ring/queue for congestion
1827  */
1828  if (PREDICT_FALSE(tm->efd.enabled && !efd_discard_burst))
1829  {
1830  vlib_frame_queue_t *fq;
1831 
1832  /* fq will be valid if the ring is congested */
1833  fq = is_vlib_handoff_queue_congested(
1834  next_worker_index, tm->efd.queue_hi_thresh,
1835  congested_handoff_queue_by_worker_index);
1836 
1837  if (PREDICT_FALSE(fq != NULL))
1838  {
1839  u32 cntr_type;
1840  if (PREDICT_TRUE(cntr_type =
1841  is_efd_discardable(tm, b0, mb)))
1842  {
1843  /* discard the packet */
1844  fq->enqueue_efd_discards++;
1845  increment_efd_drop_counter(vm, cntr_type, 1);
1846  rte_pktmbuf_free(mb);
1847  n_buffers--;
1848  mb_index++;
1849  continue;
1850  }
1851  }
1852  }
1853 
1854  if (next_worker_index != current_worker_index)
1855  {
1856  if (hf)
1857  hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1858 
1859  hf = dpdk_get_handoff_queue_elt(
1860  next_worker_index,
1861  handoff_queue_elt_by_worker_index);
1862 
1863  n_left_to_next_worker = VLIB_FRAME_SIZE - hf->n_vectors;
1864  to_next_worker = &hf->buffer_index[hf->n_vectors];
1865  current_worker_index = next_worker_index;
1866  }
1867 
1868  /* enqueue to correct worker thread */
1869  to_next_worker[0] = bi0;
1870  to_next_worker++;
1871  n_left_to_next_worker--;
1872 
1873  if (n_left_to_next_worker == 0)
1874  {
1875  hf->n_vectors = VLIB_FRAME_SIZE;
1876  vlib_put_handoff_queue_elt(hf);
1877  current_worker_index = ~0;
1878  handoff_queue_elt_by_worker_index[next_worker_index] = 0;
1879  hf = 0;
1880  }
1881 
1882  n_buffers--;
1883  mb_index++;
1884  }
1885 
1886  if (PREDICT_FALSE (vec_len (xd->d_trace_buffers) > 0))
1887  {
1888  /* credit the trace to the trace node */
1889  dpdk_rx_trace (dm, node_trace, xd, queue_id, xd->d_trace_buffers,
1890  vec_len (xd->d_trace_buffers));
1891  vlib_set_trace_count (vm, node_trace, n_trace - vec_len (xd->d_trace_buffers));
1892  }
1893 
1894  vlib_increment_combined_counter
1895  (vnet_get_main()->interface_main.combined_sw_if_counters
1896  + VNET_INTERFACE_COUNTER_RX,
1897  cpu_index,
1898  xd->vlib_sw_if_index,
1899  mb_index, n_rx_bytes);
1900 
1901  dpdk_worker_t * dw = vec_elt_at_index(dm->workers, cpu_index);
1902  dw->aggregate_rx_packets += mb_index;
1903  n_rx_packets += mb_index;
1904  }
1905 
1906  if (hf)
1907  hf->n_vectors = VLIB_FRAME_SIZE - n_left_to_next_worker;
1908 
1909  /* Ship frames to the worker nodes */
1910  for (i = 0; i < vec_len (handoff_queue_elt_by_worker_index); i++)
1911  {
1912  if (handoff_queue_elt_by_worker_index[i])
1913  {
1914  hf = handoff_queue_elt_by_worker_index[i];
1915  /*
1916  * It works better to let the handoff node
1917  * rate-adapt, always ship the handoff queue element.
1918  */
1919  if (1 || hf->n_vectors == hf->last_n_vectors)
1920  {
1921  vlib_put_handoff_queue_elt(hf);
1922  handoff_queue_elt_by_worker_index[i] = 0;
1923  }
1924  else
1925  hf->last_n_vectors = hf->n_vectors;
1926  }
1927  congested_handoff_queue_by_worker_index[i] = (vlib_frame_queue_t *)(~0);
1928  }
1929  hf = 0;
1930  current_worker_index = ~0;
1931  return n_rx_packets;
1932 }
1933 
1934 VLIB_REGISTER_NODE (dpdk_io_input_node) = {
1935  .function = dpdk_io_input,
1936  .type = VLIB_NODE_TYPE_INPUT,
1937  .name = "dpdk-io-input",
1938 
1939  /* Will be enabled if/when hardware is detected. */
1940  .state = VLIB_NODE_STATE_DISABLED,
1941 
1942  .format_buffer = format_ethernet_header_with_length,
1943  .format_trace = format_dpdk_rx_dma_trace,
1944 
1945  .n_errors = DPDK_N_ERROR,
1946  .error_strings = dpdk_error_strings,
1947 
1948  .n_next_nodes = DPDK_RX_N_NEXT,
1949  .next_nodes = {
1950  [DPDK_RX_NEXT_DROP] = "error-drop",
1951  [DPDK_RX_NEXT_ETHERNET_INPUT] = "ethernet-input",
1952  [DPDK_RX_NEXT_IP4_INPUT] = "ip4-input-no-checksum",
1953  [DPDK_RX_NEXT_IP6_INPUT] = "ip6-input",
1954  [DPDK_RX_NEXT_MPLS_INPUT] = "mpls-gre-input",
1955  },
1956 };
1957 
1958 /*
1959  * set_efd_bitmap()
1960  * Based on the operation type, set lower/upper bits for the given index value
1961  */
1962 void
1963 set_efd_bitmap (u8 *bitmap, u32 value, u32 op)
1964 {
1965  int ix;
1966 
1967  *bitmap = 0;
1968  for (ix = 0; ix < 8; ix++) {
1969  if (((op == EFD_OPERATION_LESS_THAN) && (ix < value)) ||
1970  ((op == EFD_OPERATION_GREATER_OR_EQUAL) && (ix >= value))){
1971  (*bitmap) |= (1 << ix);
1972  }
1973  }
1974 }
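/*
 * Example: set_efd_bitmap (&bm, 3, EFD_OPERATION_LESS_THAN) yields
 * bm == 0x07 (bits 0..2 set), marking precedence/EXP/CoS values below 3
 * as discard candidates in is_efd_discardable();
 * EFD_OPERATION_GREATER_OR_EQUAL with the same value would yield 0xF8.
 */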
1975 
1976 void
1977 efd_config (u32 enabled,
1978  u32 ip_prec, u32 ip_op,
1979  u32 mpls_exp, u32 mpls_op,
1980  u32 vlan_cos, u32 vlan_op)
1981 {
1982  vlib_thread_main_t * tm = vlib_get_thread_main();
1983  dpdk_main_t * dm = &dpdk_main;
1984 
1985  if (enabled) {
1986  tm->efd.enabled |= VLIB_EFD_DISCARD_ENABLED;
1987  dm->efd.enabled |= DPDK_EFD_DISCARD_ENABLED;
1988  } else {
1989  tm->efd.enabled &= ~VLIB_EFD_DISCARD_ENABLED;
1990  dm->efd.enabled &= ~DPDK_EFD_DISCARD_ENABLED;
1991  }
1992 
1993  set_efd_bitmap(&tm->efd.ip_prec_bitmap, ip_prec, ip_op);
1994  set_efd_bitmap(&tm->efd.mpls_exp_bitmap, mpls_exp, mpls_op);
1995  set_efd_bitmap(&tm->efd.vlan_cos_bitmap, vlan_cos, vlan_op);
1996 
1997 }
#define VIRL_SPEED_LIMIT()
Definition: node.c:774
#define vec_validate(V, I)
Make sure vector is long enough for given index (no header, unspecified alignment) ...
Definition: vec.h:394
void vlib_put_next_frame(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, u32 n_vectors_left)
Definition: main.c:459
void dpdk_rx_trace(dpdk_main_t *dm, vlib_node_runtime_t *node, dpdk_device_t *xd, u16 queue_id, u32 *buffers, uword n_buffers)
Definition: node.c:343
u16 enabled
Definition: dpdk.h:307
always_inline vlib_thread_main_t * vlib_get_thread_main()
Definition: global_funcs.h:32
sll srl srl sll sra u16x4 i
Definition: vector_sse2.h:267
u32 trace_active_hint
Definition: trace.h:80
uword dpdk_input_rss(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *f)
Definition: node.c:805
#define CLIB_UNUSED(x)
Definition: clib.h:79
static u64 ipv4_get_key(ip4_header_t *ip)
Definition: node.c:952
handoff_dispatch_error_t
Definition: node.c:86
u16 max_burst_sz
Definition: dpdk.h:129
void dpdk_efd_update_counters(dpdk_device_t *xd, u32 n_buffers, u16 enabled)
Definition: node.c:398
dpdk_main_t dpdk_main
Definition: dpdk.h:415
bad routing header type(not 4)") sr_error (NO_MORE_SEGMENTS
static u8 * format_handoff_dispatch_trace(u8 *s, va_list *args)
Definition: node.c:66
#define VLIB_BUFFER_TRACE_TRAJECTORY_INIT(b)
Definition: buffer.h:383
#define PREDICT_TRUE(x)
Definition: clib.h:98
static void vlib_worker_thread_barrier_check(void)
Definition: threads.h:201
u64 as_u64[2]
Definition: ip6_packet.h:50
void dpdk_set_next_node(dpdk_rx_next_t next, char *name)
Definition: node.c:857
u64 last_poll_time
Definition: dpdk.h:133
#define foreach_dpdk_error
Definition: dpdk.h:477
#define NULL
Definition: clib.h:55
void(* dpdk_io_thread_callback_t)(vlib_main_t *vm)
Definition: dpdk.h:452
vnet_main_t * vnet_main
Definition: node.c:56
always_inline void vlib_set_trace_count(vlib_main_t *vm, vlib_node_runtime_t *rt, u32 count)
Definition: trace_funcs.h:160
#define vec_add1(V, E)
Add 1 element to end of vector (unspecified alignment).
Definition: vec.h:480
static u64 clib_xxhash(u64 key)
Definition: xxhash.h:57
struct _vlib_node_registration vlib_node_registration_t
u32 per_interface_next_index
Definition: dpdk.h:199
static u64 mpls_get_key(mpls_unicast_header_t *m)
Definition: node.c:978
vlib_node_registration_t dpdk_io_input_node
(constructor) VLIB_REGISTER_NODE (dpdk_io_input_node)
Definition: node.c:1934
clib_time_t clib_time
Definition: main.h:61
u32 congestion_cnt
Definition: dpdk.h:132
static char * dpdk_error_strings[]
Definition: node.c:271
u32 buffer_index[VLIB_FRAME_SIZE]
Definition: threads.h:78
vlib_error_t * errors
Definition: node.h:378
volatile u32 valid
Definition: threads.h:72
ip6_address_t src_address
Definition: ip6_packet.h:293
always_inline vlib_main_t * vlib_get_main(void)
Definition: global_funcs.h:23
#define vec_reset_length(v)
Reset vector length to zero NULL-pointer tolerant.
u8 admin_up
Definition: dpdk.h:218
u8 mpls_exp_bitmap
Definition: threads.h:251
vnet_main_t * vnet_get_main(void)
Definition: misc.c:45
vlib_main_t * vlib_main
Definition: node.c:55
i16 current_data
signed offset in data[], pre_data[] that we are currently processing.
Definition: buffer.h:77
always_inline u32 vlib_get_trace_count(vlib_main_t *vm, vlib_node_runtime_t *rt)
Definition: trace_funcs.h:144
static u64 eth_get_key(ethernet_header_t *h0)
Definition: node.c:1035
#define VLIB_INIT_FUNCTION(x)
Definition: init.h:109
#define always_inline
Definition: clib.h:84
u32 dpdk_get_handoff_node_index(void)
Definition: node.c:266
void efd_config(u32 enabled, u32 ip_prec, u32 ip_op, u32 mpls_exp, u32 mpls_op, u32 vlan_cos, u32 vlan_op)
Definition: node.c:1977
always_inline void vlib_increment_combined_counter(vlib_combined_counter_main_t *cm, u32 cpu_index, u32 index, u32 packet_increment, u32 byte_increment)
Definition: counter.h:210
u16 queue_hi_thresh
Definition: threads.h:249
#define vec_elt_at_index(v, i)
Get vector value at index i checking that i is in bounds.
clib_error_t * handoff_dispatch_init(vlib_main_t *vm)
Definition: node.c:254
always_inline uword rotate_left(uword x, uword i)
Definition: bitops.h:126
u8 pre_data[VLIB_BUFFER_PRE_DATA_SIZE]
Space for inserting data before buffer start.
Definition: buffer.h:142
#define clib_warning(format, args...)
Definition: error.h:59
unsigned long u64
Definition: types.h:89
vlib_frame_queue_elt_t * elts
Definition: threads.h:135
always_inline vlib_buffer_free_list_t * vlib_buffer_get_free_list(vlib_main_t *vm, u32 free_list_index)
Definition: buffer_funcs.h:332
vlib_node_registration_t dpdk_input_node
(constructor) VLIB_REGISTER_NODE (dpdk_input_node)
Definition: node.c:829
#define VMWARE_LENGTH_BUG_WORKAROUND
Definition: node.c:49
always_inline void * vlib_frame_vector_args(vlib_frame_t *f)
Definition: node_funcs.h:202
u32 device_index
Definition: dpdk.h:193
dpdk_worker_t * workers
Definition: dpdk.h:339
static u64 ipv6_get_key(ip6_header_t *ip)
Definition: node.c:961
#define VLIB_BUFFER_NEXT_PRESENT
Definition: buffer.h:93
struct rte_mbuf mb
Definition: dpdk.h:439
u16 consec_full_frames_hi_thresh
Definition: dpdk.h:309
u32 vlib_sw_if_index
Definition: dpdk.h:196
#define VLIB_EFD_DISCARD_ENABLED
Definition: threads.h:241
format_function_t format_dpdk_rx_dma_trace
Definition: dpdk.h:599
u16 current_length
Number of bytes between current data and the end of this buffer.
Definition: buffer.h:81
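current_data and current_length together delimit the live bytes of a buffer, so stripping a header means moving both in lockstep (the vlib_buffer_advance helper wraps this). A sketch:

vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);
u8 *cur = b0->data + b0->current_data;            /* first live byte */

b0->current_data += sizeof (ethernet_header_t);   /* skip the L2 header */
b0->current_length -= sizeof (ethernet_header_t);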
dpdk_device_and_queue_t ** devices_by_cpu
Definition: dpdk.h:317
u64 max_poll_delay
Definition: dpdk.h:134
static_always_inline void increment_efd_drop_counter(vlib_main_t *vm, u32 counter_index, u32 count)
Definition: dpdk.h:502
#define VLIB_NODE_FLAG_IS_HANDOFF
Definition: node.h:222
u32 consec_full_frames_cnt
Definition: dpdk.h:131
static char * handoff_dispatch_error_strings[]
Definition: node.c:93
#define DPDK_EFD_DISCARD_ENABLED
Definition: dpdk.h:299
uword os_get_cpu_number(void)
Definition: unix-misc.c:206
static void vlib_put_handoff_queue_elt(vlib_frame_queue_elt_t *hf)
Definition: node.c:100
u16 last_burst_sz
Definition: dpdk.h:128
unsigned short int uint16_t
Definition: fix_types.h:28
ip4_address_pair_t address_pair
Definition: ip4_packet.h:140
u16 vlan_subifs
Definition: dpdk.h:212
u32 * d_trace_buffers
Definition: dpdk.h:206
#define DPDK_EFD_MONITOR_ENABLED
Definition: dpdk.h:300
#define PREDICT_FALSE(x)
Definition: clib.h:97
u32 full_frames_cnt
Definition: dpdk.h:130
vnet_main_t vnet_main
Definition: misc.c:42
#define VLIB_FRAME_SIZE
Definition: node.h:292
static u32 dpdk_device_input(dpdk_main_t *dm, dpdk_device_t *xd, vlib_node_runtime_t *node, u32 cpu_index, u16 queue_id)
Definition: node.c:487
vlib_frame_queue_t ** vlib_frame_queues
Definition: threads.h:139
#define vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, n_left_to_next, bi0, bi1, next0, next1)
Definition: buffer_node.h:43
#define vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, n_left_to_next, bi0, next0)
Definition: buffer_node.h:83
#define vlib_get_next_frame(vm, node, next_index, vectors, n_vectors_left)
Definition: node_funcs.h:265
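Together with vlib_put_next_frame and the validate-enqueue macros above, this forms the canonical node dispatch skeleton. A condensed single-buffer sketch (per-packet logic elided):

u32 *from, *to_next, n_left_from, n_left_to_next, next_index;

from = vlib_frame_vector_args (frame);
n_left_from = frame->n_vectors;
next_index = node->cached_next_index;

while (n_left_from > 0)
  {
    vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next);

    while (n_left_from > 0 && n_left_to_next > 0)
      {
        u32 bi0 = from[0];
        u32 next0 = next_index;      /* ... choose per-packet next ... */

        to_next[0] = bi0;
        from += 1;  to_next += 1;
        n_left_from -= 1;  n_left_to_next -= 1;

        /* Moves bi0 to the right frame if next0 != next_index. */
        vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
                                         to_next, n_left_to_next,
                                         bi0, next0);
      }

    vlib_put_next_frame (vm, node, next_index, n_left_to_next);
  }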
void clib_time_init(clib_time_t *c)
Definition: time.c:160
vlib_error_t error
Error code for buffers to be enqueued to error handler.
Definition: buffer.h:129
static uword handoff_dispatch_node_fn(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame)
Definition: node.c:107
u64 aggregate_rx_packets
Definition: dpdk.h:289
u8 * format_ethernet_header_with_length(u8 *s, va_list *args)
Definition: format.c:70
volatile u64 tail
Definition: threads.h:110
u16 n_vectors
Definition: node.h:307
dpdk_device_t * devices
Definition: dpdk.h:316
#define CLIB_PREFETCH(addr, size, type)
Definition: cache.h:82
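A typical use, warming packet data ahead of parsing:

/* Hint the CPU to fetch the first cache line of payload. */
CLIB_PREFETCH (b0->data, CLIB_CACHE_LINE_BYTES, LOAD);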
static uword dpdk_io_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *f)
Definition: node.c:1539
u8 ip_prec_bitmap
Definition: threads.h:250
#define foreach_handoff_dispatch_error
Definition: node.c:83
struct rte_mbuf *** rx_vectors
Definition: dpdk.h:203
u16 queue_hi_thresh
Definition: dpdk.h:308
u8 vlan_cos_bitmap
Definition: threads.h:252
#define clib_memcpy(a, b, c)
Definition: string.h:63
dpdk_pmd_t pmd
Definition: dpdk.h:215
#define ARRAY_LEN(x)
Definition: clib.h:59
#define EFD_OPERATION_GREATER_OR_EQUAL
Definition: dpdk.h:546
#define VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX
Definition: buffer.h:296
u32 discard_cnt
Definition: dpdk.h:135
always_inline uword is_pow2(uword x)
Definition: clib.h:252
u16 cached_next_index
Definition: node.h:422
#define ASSERT(truth)
unsigned int u32
Definition: types.h:88
#define vnet_buffer(b)
Definition: buffer.h:300
u8 * format(u8 *s, char *fmt,...)
Definition: format.c:405
dpdk_efd_t efd
Definition: dpdk.h:388
u32 next_buffer
Next buffer for this linked-list of buffers.
Definition: buffer.h:112
#define MPLS_LABEL_MASK
Definition: node.c:976
u32 clone_count
Specifies whether this buffer should be reinitialized when freed.
Definition: buffer.h:121
vlib_trace_main_t trace_main
Definition: main.h:121
uword * thread_registrations_by_name
Definition: threads.h:263
void dpdk_io_thread(vlib_worker_thread_t *w, u32 instances, u32 instance_id, char *worker_name, dpdk_io_thread_callback_t callback)
Definition: node.c:1079
#define VLIB_BUFFER_IS_TRACED
Definition: buffer.h:91
always_inline void vlib_buffer_init_for_free_list(vlib_buffer_t *_dst, vlib_buffer_free_list_t *fl)
Definition: buffer_funcs.h:591
u64 uword
Definition: types.h:112
vlib_node_registration_t handoff_dispatch_node
(constructor) VLIB_REGISTER_NODE (handoff_dispatch_node)
Definition: node.c:81
u32 total_length_not_including_first_buffer
Only valid for first buffer in chain.
Definition: buffer.h:106
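Chained buffers are linked via next_buffer while VLIB_BUFFER_NEXT_PRESENT is set; this field caches what the walk below computes for the head buffer. A sketch:

vlib_buffer_t *b = vlib_get_buffer (vm, bi0);
u32 total = b->current_length;

while (b->flags & VLIB_BUFFER_NEXT_PRESENT)
  {
    b = vlib_get_buffer (vm, b->next_buffer);
    total += b->current_length;
  }
/* total == head->current_length
         + head->total_length_not_including_first_buffer */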
u32 ip_version_traffic_class_and_flow_label
Definition: ip6_packet.h:280
handoff_dispatch_main_t handoff_dispatch_main
Definition: node.c:79
dpdk_rx_next_t
Definition: dpdk.h:417
u32 enqueue_full_events
Definition: threads.h:114
unsigned short u16
Definition: types.h:57
vlib_buffer_t buffer
Definition: dpdk.h:440
#define vec_len(v)
Number of elements in vector (rvalue-only, NULL tolerant)
unsigned char u8
Definition: types.h:56
#define MPLS_BOTTOM_OF_STACK_BIT_MASK
Definition: node.c:975
dpdk_efd_agent_t efd_agent
Definition: dpdk.h:256
static vlib_frame_queue_elt_t * dpdk_get_handoff_queue_elt(u32 vlib_worker_index, vlib_frame_queue_elt_t **handoff_queue_elt_by_worker_index)
Definition: node.c:909
#define DPDK_EFD_DROPALL_ENABLED
Definition: dpdk.h:301
always_inline void vlib_increment_main_loop_counter(vlib_main_t *vm)
Definition: main.h:278
u32 enqueue_efd_discards
Definition: threads.h:115
static u32 dpdk_rx_burst(dpdk_main_t *dm, dpdk_device_t *xd, u16 queue_id)
Definition: dpdk_priv.h:52
volatile u64 head_hint
Definition: threads.h:131
always_inline void * vlib_add_trace(vlib_main_t *vm, vlib_node_runtime_t *r, vlib_buffer_t *b, u32 n_data_bytes)
Definition: trace_funcs.h:55
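Trace records are appended only for buffers flagged VLIB_BUFFER_IS_TRACED; sketched here with this file's handoff_dispatch_trace_t:

if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED))
  {
    handoff_dispatch_trace_t *t =
      vlib_add_trace (vm, node, b0, sizeof (*t));
    t->sw_if_index = sw_if_index0;
    t->next_index = next0;
    t->buffer_index = bi0;
  }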
#define hash_get_mem(h, key)
Definition: hash.h:251
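hash_get_mem returns a pointer to the stored value, or NULL when the by-reference key is absent; thread_registrations_by_name is queried this way, e.g.:

vlib_thread_main_t *tm = vlib_get_thread_main ();
uword *p = hash_get_mem (tm->thread_registrations_by_name, "workers");

if (p)
  {
    vlib_thread_registration_t *tr = (vlib_thread_registration_t *) p[0];
    /* ... tr describes the "workers" thread class ... */
  }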
u32 buffer_flags_template
Definition: dpdk.h:323
vlib_frame_queue_elt_t * vlib_get_handoff_queue_elt(u32 vlib_worker_index)
Definition: node.c:881
#define vlib_prefetch_buffer_header(b, type)
Prefetch buffer metadata.
Definition: buffer.h:162
#define VLIB_REGISTER_NODE(x,...)
Definition: node.h:140
always_inline vlib_node_runtime_t * vlib_node_get_runtime(vlib_main_t *vm, u32 node_index)
Definition: node_funcs.h:61
u8 data[0]
Packet data.
Definition: buffer.h:150
volatile u32 io_thread_release
Definition: dpdk.h:363
#define vec_foreach(var, vec)
Vector iterator.
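The input node walks its per-CPU device list with this iterator, roughly as below (the dq->device and dq->queue_id field names are assumed from the dpdk_device_and_queue_t layout):

dpdk_device_and_queue_t *dq;
dpdk_device_t *xd;

vec_foreach (dq, dm->devices_by_cpu[cpu_index])
  {
    xd = vec_elt_at_index (dm->devices, dq->device);  /* bounds-checked */
    /* ... poll queue dq->queue_id of device xd ... */
  }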
static vlib_frame_queue_t * is_vlib_handoff_queue_congested(u32 vlib_worker_index, u32 queue_hi_thresh, vlib_frame_queue_t **handoff_queue_by_worker_index)
Definition: node.c:926
#define EFD_OPERATION_LESS_THAN
Definition: dpdk.h:545
u32 total_packet_cnt
Definition: dpdk.h:136
#define CLIB_MEMORY_BARRIER()
Definition: clib.h:101
always_inline void dpdk_rx_next_and_error_from_mb_flags_x1(dpdk_device_t *xd, struct rte_mbuf *mb, vlib_buffer_t *b0, u8 *next0, u8 *error0)
Definition: node.c:278
u32 flags
Definition: vhost-user.h:73
#define vec_validate_init_empty(V, I, INIT)
Make sure vector is long enough for given index and initialize empty space (no header, unspecified alignment)
Definition: vec.h:443
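Unlike vec_validate, newly created slots get a caller-chosen fill value, which keeps 0 usable as real data. A hypothetical sketch using ~0 as the "unset" marker:

u32 *next_by_index = 0;

vec_validate_init_empty (next_by_index, 9, (u32) ~0);  /* slots 0..9 exist, filled with ~0 */

if (next_by_index[3] == (u32) ~0)
  next_by_index[3] = 0;   /* first real assignment */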
u32 is_efd_discardable(vlib_thread_main_t *tm, vlib_buffer_t *b0, struct rte_mbuf *mb)
Definition: node.c:435
#define CLIB_CACHE_LINE_BYTES
Definition: cache.h:67
u32 flags
buffer flags: VLIB_BUFFER_IS_TRACED: trace this buffer.
Definition: buffer.h:84
always_inline u64 clib_cpu_time_now(void)
Definition: time.h:71
static uword dpdk_input(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *f)
Definition: node.c:779
vlib_thread_registration_t * registration
Definition: threads.h:98
always_inline void vlib_trace_buffer(vlib_main_t *vm, vlib_node_runtime_t *r, u32 next_index, vlib_buffer_t *b, int follow_chain)
Definition: trace_funcs.h:106
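Paired with the trace-count accessors above, this is how an input path decides whether to trace a packet; a sketch:

u32 n_trace = vlib_get_trace_count (vm, node);

if (PREDICT_FALSE (n_trace > 0))
  {
    vlib_trace_buffer (vm, node, next0, b0, 0 /* follow_chain */);
    vlib_set_trace_count (vm, node, n_trace - 1);
  }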
always_inline vlib_buffer_t * vlib_get_buffer(vlib_main_t *vm, u32 buffer_index)
Translate buffer index into buffer pointer.
Definition: buffer_funcs.h:69
always_inline u32 vlib_get_buffer_index(vlib_main_t *vm, void *p)
Translate buffer pointer into buffer index.
Definition: buffer_funcs.h:82
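The two translations are inverses; a sketch:

vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0);   /* index -> pointer */
u32 bi_check = vlib_get_buffer_index (vm, b0);   /* pointer -> index */

ASSERT (bi_check == bi0);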
u16 enabled
Definition: threads.h:248
vlib_efd_t efd
Definition: threads.h:303
CLIB vectors are ubiquitous dynamically resized arrays with user-defined "headers".
ip6_address_t dst_address
Definition: ip6_packet.h:293
void set_efd_bitmap(u8 *bitmap, u32 value, u32 op)
Definition: node.c:1963