FD.io VPP  v18.04-17-g3a0d853
Vector Packet Processing
kp.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017 Intel and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 /**
17  * kp-plugin implements a MagLev-like load balancer.
18  * http://research.google.com/pubs/pub44824.html
19  *
20  * It hasn't been tested for interoperability with the original MagLev
21  * but intends to provide similar functionality.
22  * The kube-proxy receives traffic destined to VIP (Virtual IP)
23  * addresses from one or multiple(ECMP) routers.
24  * The kube-proxy tunnels the traffic toward many application servers
25  * ensuring session stickiness (i.e. that a single session is tunneled
26  * towards a single application server).
27  *
28  */
29 
30 #ifndef KP_PLUGIN_KP_KP_H_
31 #define KP_PLUGIN_KP_KP_H_
32 
33 #include <vnet/util/refcount.h>
34 #include <vnet/vnet.h>
35 #include <vnet/ip/ip.h>
36 #include <vnet/dpo/dpo.h>
37 #include <vnet/fib/fib_table.h>
38 #include <vppinfra/bihash_8_8.h>
39 
40 #include <kubeproxy/kphash.h>
41 
42 #define KP_DEFAULT_PER_CPU_STICKY_BUCKETS (1 << 10) /* parenthesized so the shift binds correctly inside larger expressions */
43 #define KP_DEFAULT_FLOW_TIMEOUT 40 /* presumably seconds, matching kp_main_t.flow_timeout - confirm */
44 #define KP_MAPPING_BUCKETS 1024
45 #define KP_MAPPING_MEMORY_SIZE (64 << 20) /* 64 MiB; parenthesized for the same reason as above */
46 
47 typedef enum {
50 } kp_next_t;
51 
52 typedef enum {
57 
58 #define foreach_kp_nat_in2out_error \
59 _(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \
60 _(IN2OUT_PACKETS, "Good in2out packets processed") \
61 _(NO_TRANSLATION, "No translation")
62 
63 typedef enum {
64 #define _(sym,str) KP_NAT_IN2OUT_ERROR_##sym,
66 #undef _
69 
70 /**
71  * kube-proxy supports three types of service
72  */
73 typedef enum {
79 
80 typedef enum {
88 
89 /**
90  * Each VIP is configured with a set of PODs
91  */
92 typedef struct {
93  /**
94  * Registration to FIB event.
95  */
97 
98  /**
99  * Destination address used to transfer traffic towards that POD.
100  * The address is also used as the pod ID and pseudo-random
101  * seed for the load-balancing process.
102  */
103  ip46_address_t address;
104 
105  /**
106  * PODs are indexed by address and VIP Index.
107  * Which means there will be duplicates if the same server
108  * address is used for multiple VIPs.
109  */
111 
112  /**
113  * Some per-POD flags.
114  * For now only KP_POD_FLAGS_USED is defined.
115  */
117 
118 #define KP_POD_FLAGS_USED 0x1
119 
120  /**
121  * Rotating timestamp of when KP_POD_FLAGS_USED flag was last set.
122  *
123  * POD removal is based on garbage collection and reference counting.
124  * When a POD is removed, there is a race between configuration core
125  * and worker cores which may still add a reference while it should not
126  * be used. This timestamp is used to not remove the POD while a race condition
127  * may happen.
128  */
130 
131  /**
132  * The FIB entry index for the next-hop
133  */
135 
136  /**
137  * The child index on the FIB entry
138  */
140 
141  /**
142  * The next DPO in the graph to follow.
143  */
145 
146 } kp_pod_t;
147 
149 
150 typedef struct {
153 
154 #define kp_foreach_vip_counter \
155  _(NEXT_PACKET, "packet from existing sessions", 0) \
156  _(FIRST_PACKET, "first session packet", 1) \
157  _(UNTRACKED_PACKET, "untracked packet", 2) \
158  _(NO_SERVER, "no server configured", 3)
159 
160 typedef enum {
161 #define _(a,b,c) KP_VIP_COUNTER_##a = c,
163 #undef _
166 
167 /**
168  * kube-proxy supports IPv4 and IPv6 traffic
169  * and NAT4 and NAT6.
170  */
171 typedef enum {
177 } kp_vip_type_t;
178 
181 
182 /**
183  * Load balancing service is provided per VIP.
184  * In this data model, a VIP can be a whole prefix.
185  * But load balancing only
186  * occurs on a per-source-address/port basis. Meaning that if a given source
187  * reuses the same port for multiple destinations within the same VIP,
188  * they will be considered as a single flow.
189  */
190 typedef struct {
191 
192  //Runtime
193 
194  /**
195  * Vector mapping (flow-hash & new_connect_table_mask) to POD index.
196  * This is used for new flows.
197  */
199 
200  /**
201  * New flows table length - 1
202  * (length MUST be a power of 2)
203  */
205 
206  /**
207  * last time garbage collection was run to free the PODs.
208  */
210 
211  //Not runtime
212 
213  /**
214  * A Virtual IP represents a given service delivered
215  * by a set of PODs. It can be a single
216  * address or a prefix.
217  * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address
218  * (i.e. ::/96 prefix).
219  */
220  ip46_address_t prefix;
221 
222  /**
223  * The VIP prefix length.
224  * In case of IPv4, plen = 96 + ip4_plen.
225  */
227 
228  /**
229  * Service port. network byte order
230  */
232 
233  /**
234  * Pod's port corresponding to specific service. network byte order
235  */
237 
238  /**
239  * Node's port, can access service via NodeIP:node_port. network byte order
240  */
242 
243 
244  /**
245  * The type of traffic for this VIP.
246  * KP_TYPE_UNDEFINED if unknown.
247  */
249 
250  /**
251  * Flags related to this VIP.
252  * KP_VIP_FLAGS_USED means the VIP is active.
253  * When it is not set, the VIP is in the process of being removed.
254  * We cannot immediately remove a VIP because the VIP index still may be stored
255  * in the adjacency index.
256  */
258 #define KP_VIP_FLAGS_USED 0x1
259 
260  /**
261  * Pool of POD indexes used for this VIP.
262  * This also includes PODs that have been removed (but are still referenced).
263  */
265 
266 } kp_vip_t;
267 
268 /*
269  * mapping from nodeport to vip_index
270  */
271 typedef struct {
272 
274 
275 } kp_nodeport_t;
276 
277 #define kp_vip_is_ip4(vip) ((vip)->type == KP_VIP_TYPE_IP4_NAT44 \
278  || (vip)->type == KP_VIP_TYPE_IP4_NAT46)
279 #define kp_vip_is_nat4(vip) ((vip)->type == KP_VIP_TYPE_IP6_NAT64 \
280  || (vip)->type == KP_VIP_TYPE_IP4_NAT44)
283 
284 #define foreach_kp_nat_protocol \
285  _(UDP, 0, udp, "udp") \
286  _(TCP, 1, tcp, "tcp")
287 
288 typedef enum {
289 #define _(N, i, n, s) KP_NAT_PROTOCOL_##N = i,
291 #undef _
293 
296 {
297  u32 nat_proto = ~0;
298 
299  nat_proto = (ip_proto == IP_PROTOCOL_UDP) ? KP_NAT_PROTOCOL_UDP : nat_proto;
300  nat_proto = (ip_proto == IP_PROTOCOL_TCP) ? KP_NAT_PROTOCOL_TCP : nat_proto;
301 
302  return nat_proto;
303 }
304 
305 /* Key for Pod's egress SNAT */
306 typedef struct {
307  union
308  {
309  struct
310  {
313  u16 protocol:3,
314  fib_index:13;
315  };
317  };
319 
320 typedef struct
321 {
327 
328 typedef struct {
330  ip46_address_t vip;
331  ip46_address_t node_ip;
332  ip46_address_t pod_ip;
336  u16 port; /* Network byte order */
337  u16 node_port; /* Network byte order */
338  u16 target_port; /* Network byte order */
342 
343 typedef struct {
344  /**
345  * Each CPU has its own sticky flow hash table.
346  * One single table is used for all VIPs.
347  */
349 
350 } kp_per_cpu_t;
351 
352 typedef struct {
353  /**
354  * Pool of all Virtual IPs
355  */
357 
358  /**
359  * Pool of PODs.
360  * PODs are referenced by address and vip index.
361  * The first element (index 0) is special and used only to fill
362  * new_flow_tables when no POD has been configured.
363  */
365 
366  /**
367  * Each POD has an associated reference counter.
368  * As pods[0] has a special meaning, its associated counter
369  * starts at 0 and is decremented instead. i.e. do not use it.
370  */
372 
373  /* hash lookup vip_index by key: {u16: nodeport} */
375 
376 
377  /**
378  * Some global data is per-cpu
379  */
381 
382  /**
383  * Node next index for IP adjacencies, for each of the traffic types.
384  */
385  u32 ip_lookup_next_index[KP_VIP_N_TYPES];
386 
387  /**
388  * Number of buckets in the per-cpu sticky hash table.
389  */
391 
392  /**
393  * Flow timeout in seconds.
394  */
396 
397  /**
398  * Per VIP counter
399  */
401 
402  /**
403  * DPO used to send packet from IP4/6 lookup to KP node.
404  */
407 
408  /**
409  * Node type for registering to fib changes.
410  */
412 
413  /* Find a static mapping by pod IP : target_port */
414  clib_bihash_8_8_t mapping_by_pod;
415 
416  /* Static mapping pool */
418 
419  /**
420  * API dynamically registered base ID.
421  */
423 
424  volatile u32 *writer_lock;
425 
426  /* convenience */
429 } kp_main_t;
430 
431 #define ip46_address_type(ip46) (ip46_address_is_ip4(ip46)?IP46_TYPE_IP4:IP46_TYPE_IP6)
432 #define ip46_prefix_is_ip4(ip46, len) ((len) >= 96 && ip46_address_is_ip4(ip46))
433 #define ip46_prefix_type(ip46, len) (ip46_prefix_is_ip4(ip46, len)?IP46_TYPE_IP4:IP46_TYPE_IP6)
434 
435 void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen);
436 uword unformat_ip46_prefix (unformat_input_t * input, va_list * args);
437 u8 *format_ip46_prefix (u8 * s, va_list * args);
438 
439 
440 extern kp_main_t kp_main;
446 
447 /**
448  * Fix global kube-proxy parameters.
449  * @return 0 on success. VNET_KP_ERR_XXX on error
450  */
451 int kp_conf(u32 sticky_buckets, u32 flow_timeout);
452 
453 int kp_vip_add(ip46_address_t *prefix, u8 plen, kp_vip_type_t type,
454  u32 new_length, u32 *vip_index,
455  u16 port, u16 target_port, u16 node_port);
456 int kp_vip_del(u32 vip_index);
457 
458 int kp_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index);
459 
460 #define kp_vip_get_by_index(index) (pool_is_free_index(kp_main.vips, index)?NULL:pool_elt_at_index(kp_main.vips, index))
461 
462 int kp_vip_add_pods(u32 vip_index, ip46_address_t *addresses, u32 n);
463 int kp_vip_del_pods(u32 vip_index, ip46_address_t *addresses, u32 n);
464 
466 
467 void kp_garbage_collection(void); /* (void) makes this a real prototype, so calls passing arguments are rejected at compile time */
468 
469 int kp_nat4_interface_add_del (u32 sw_if_index, int is_del);
470 
472 
473 #endif /* KP_PLUGIN_KP_KP_H_ */
kp_nat_in2out_error_t
Definition: kp.h:63
uword( unformat_function_t)(unformat_input_t *input, va_list *args)
Definition: format.h:231
u32 vrf_id
Definition: kp.h:324
u8 vip_is_ipv6
Definition: kp.h:333
void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen)
Definition: kp.c:35
ip46_address_t pod_ip
Definition: kp.h:332
vlib_refcount_t pod_refcount
Each POD has an associated reference counter.
Definition: kp.h:371
u32 pod_index
Definition: kp.h:151
#define foreach_kp_nat_in2out_error
Definition: kp.h:58
format_function_t format_kp_main
Definition: kp.h:471
uword * nodeport_by_key
Definition: kp.h:374
kp_next_t
Definition: kp.h:47
u32 fib_index
Definition: kp.h:340
static u32 kp_ip_proto_to_nat_proto(u8 ip_proto)
Definition: kp.h:295
vnet_main_t * vnet_main
Definition: kp.h:428
vlib_node_registration_t kp6_node
kp_per_cpu_t * per_cpu
Some global data is per-cpu.
Definition: kp.h:380
ip4_address_t addr
Definition: kp.h:311
kp_nat4_in2out_next_t
Definition: kp.h:52
u8 *( format_function_t)(u8 *s, va_list *args)
Definition: format.h:48
u32 vip_index
PODs are indexed by address and VIP Index.
Definition: kp.h:110
fib_node_t fib_node
Registration to FIB event.
Definition: kp.h:96
kp_vip_type_t
kube-proxy supports IPv4 and IPv6 traffic and NAT4 and NAT6.
Definition: kp.h:171
int kp_vip_add_pods(u32 vip_index, ip46_address_t *addresses, u32 n)
Definition: kp.c:481
int kp_conf(u32 sticky_buckets, u32 flow_timeout)
Fix global kube-proxy parameters.
Definition: kp.c:421
u8 * format_ip46_prefix(u8 *s, va_list *args)
Definition: kp.c:71
u16 port
Definition: kp.h:312
enum dpo_type_t_ dpo_type_t
Common types of data-path objects New types can be dynamically added using dpo_register_new_type() ...
vlib_main_t * vlib_main
Definition: kp.h:427
u32 vrf_id
Definition: kp.h:339
#define always_inline
Definition: clib.h:92
u8 plen
Definition: kp.h:323
u32 last_used
Rotating timestamp of when KP_POD_FLAGS_USED flag was last set.
Definition: kp.h:129
u8 plen
The VIP prefix length.
Definition: kp.h:226
vlib_node_registration_t kp6_nodeport_node
(constructor) VLIB_REGISTER_NODE (kp6_nodeport_node)
Definition: kp_node.c:795
vlib_node_registration_t kp_nat4_in2out_node
(constructor) VLIB_REGISTER_NODE (kp_nat4_in2out_node)
Definition: kp_node.c:823
kp_snat_mapping_t * snat_mappings
Definition: kp.h:417
format_function_t format_kp_pod
Definition: kp.h:148
unsigned long u64
Definition: types.h:89
Definition: kp.h:150
ip6_address_t prefix
Definition: kp.h:322
u32 * pod_indexes
Pool of POD indexes used for this VIP.
Definition: kp.h:264
int kp_vip_del(u32 vip_index)
Definition: kp.c:810
u32 kp_hash_time_now(vlib_main_t *vm)
Definition: kp.c:106
A collection of simple counters.
Definition: counter.h:58
kp_main_t kp_main
Definition: kp.c:28
Definition: kp.h:352
kp_new_flow_entry_t * new_flow_table
Vector mapping (flow-hash & new_connect_table_mask) to POD index.
Definition: kp.h:198
u16 target_port
Pod's port corresponding to specific service.
Definition: kp.h:236
kp_nat_protocol_t
Definition: kp.h:288
The identity of a DPO is a combination of its type and its instance number/index of objects of that t...
Definition: dpo.h:168
dpo_type_t dpo_nat4_type
DPO used to send packet from IP4/6 lookup to KP node.
Definition: kp.h:405
struct _unformat_input_t unformat_input_t
ip46_address_t prefix
A Virtual IP represents a given service delivered by a set of PODs.
Definition: kp.h:220
volatile u32 * writer_lock
Definition: kp.h:424
int kp_vip_del_pods(u32 vip_index, ip46_address_t *addresses, u32 n)
Definition: kp.c:670
dpo_type_t dpo_nat6_type
Definition: kp.h:406
ip46_address_t address
Destination address used to transfer traffic towards that POD.
Definition: kp.h:103
fib_node_index_t next_hop_fib_entry_index
The FIB entry index for the next-hop.
Definition: kp.h:134
u8 node_ip_is_ipv6
Definition: kp.h:334
u32 last_garbage_collection
last time garbage collection was run to free the PODs.
Definition: kp.h:209
ip46_address_t vip
Definition: kp.h:330
u32 vip_index
Definition: kp.h:273
A node in the FIB graph.
Definition: fib_node.h:286
u32 per_cpu_sticky_buckets
Number of buckets in the per-cpu sticky hash table.
Definition: kp.h:390
vlib_main_t * vm
Definition: buffer.c:294
u16 node_port
Definition: kp.h:337
Each VIP is configured with a set of PODs.
Definition: kp.h:92
#define foreach_kp_nat_protocol
Definition: kp.h:284
u32 fib_node_index_t
A typedef of a node index.
Definition: fib_types.h:30
void kp_garbage_collection()
Definition: kp.c:292
int kp_vip_add(ip46_address_t *prefix, u8 plen, kp_vip_type_t type, u32 new_length, u32 *vip_index, u16 port, u16 target_port, u16 node_port)
Definition: kp.c:725
u32 flow_timeout
Flow timeout in seconds.
Definition: kp.h:395
unsigned int u32
Definition: types.h:88
kp_vip_type_t type
The type of traffic for this.
Definition: kp.h:248
format_function_t format_kp_vip_type
Definition: kp.h:179
unformat_function_t unformat_kp_vip_type
Definition: kp.h:180
vlib_node_registration_t kp4_nodeport_node
(constructor) VLIB_REGISTER_NODE (kp4_nodeport_node)
Definition: kp_node.c:774
u16 port
Service port.
Definition: kp.h:231
u64 as_u64
Definition: kp.h:316
u32 next_hop_child_index
The child index on the FIB entry.
Definition: kp.h:139
u32 new_flow_table_mask
New flows table length - 1 (length MUST be a power of 2)
Definition: kp.h:204
u64 uword
Definition: types.h:112
struct _vlib_node_registration vlib_node_registration_t
u16 msg_id_base
API dynamically registered base ID.
Definition: kp.h:422
Definition: kp.h:49
unsigned short u16
Definition: types.h:57
kp_vip_counter_t
Definition: kp.h:160
kp_svr_type_t
kube-proxy supports three types of service
Definition: kp.h:73
u8 flags
Flags related to this VIP.
Definition: kp.h:257
unsigned char u8
Definition: types.h:56
dpo_id_t dpo
The next DPO in the graph to follow.
Definition: kp.h:144
ip46_address_t node_ip
Definition: kp.h:331
int kp_nat4_interface_add_del(u32 sw_if_index, int is_del)
Definition: kp.c:906
#define kp_foreach_vip_counter
Definition: kp.h:154
kp_pod_t * pods
Pool of PODs.
Definition: kp.h:364
u16 target_port
Definition: kp.h:338
vlib_node_registration_t kp4_node
format_function_t format_kp_vip
Definition: kp.h:281
enum fib_node_type_t_ fib_node_type_t
The types of nodes in a FIB graph.
kp_vip_t * vips
Pool of all Virtual IPs.
Definition: kp.h:356
uword unformat_ip46_prefix(unformat_input_t *input, va_list *args)
Definition: kp.c:49
fib_node_type_t fib_node_type
Node type for registering to fib changes.
Definition: kp.h:411
kp_nodeport_next_t
Definition: kp.h:80
u8 pod_ip_is_ipv6
Definition: kp.h:335
kp_svr_type_t svr_type
Definition: kp.h:329
u8 flags
Some per-POD flags.
Definition: kp.h:116
clib_bihash_8_8_t mapping_by_pod
Definition: kp.h:414
int kp_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index)
Definition: kp.c:454
u32 fib_index
Definition: kp.h:325
format_function_t format_kp_vip_detailed
Definition: kp.h:282
Load balancing service is provided per VIP.
Definition: kp.h:190
u16 node_port
Node's port, can access service via NodeIP:node_port.
Definition: kp.h:241
kp_hash_t * sticky_ht
Each CPU has its own sticky flow hash table.
Definition: kp.h:348