bpf_overlay.c
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright Authors of Cilium */
#include <bpf/ctx/skb.h>
#include <bpf/api.h>
#include <node_config.h>
#include <netdev_config.h>
#include "lib/mcast.h"
#define IS_BPF_OVERLAY 1
/* Controls the inclusion of the CILIUM_CALL_HANDLE_ICMP6_NS section in the
 * object file.
 */
#define SKIP_ICMPV6_NS_HANDLING
/* Controls the inclusion of the CILIUM_CALL_SRV6 section in the object file.
*/
#define SKIP_SRV6_HANDLING
#include "lib/tailcall.h"
#include "lib/common.h"
#include "lib/edt.h"
#include "lib/maps.h"
#include "lib/ipv6.h"
#include "lib/eth.h"
#include "lib/dbg.h"
#include "lib/trace.h"
#include "lib/l3.h"
#include "lib/drop.h"
#include "lib/identity.h"
#include "lib/nodeport.h"
#include "lib/clustermesh.h"
#include "lib/wireguard.h"
#include "lib/egress_gateway.h"
#ifdef ENABLE_VTEP
#include "lib/arp.h"
#include "lib/encap.h"
#include "lib/eps.h"
#endif /* ENABLE_VTEP */
#ifdef ENABLE_IPV6
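/* Handle an IPv6 packet that entered the node through the tunnel: run the
 * NodePort load balancer if enabled, resolve the source security identity
 * via the ipcache, hand ESP traffic to the stack for decryption, and
 * finally deliver the packet to a local endpoint or the local host.
 */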
static __always_inline int handle_ipv6(struct __ctx_buff *ctx,
__u32 *identity,
__s8 *ext_err __maybe_unused)
{
int ret, l3_off = ETH_HLEN;
struct remote_endpoint_info *info;
void *data_end, *data;
struct ipv6hdr *ip6;
struct endpoint_info *ep;
bool decrypted;
bool __maybe_unused is_dsr = false;
/* verifier workaround (dereference of modified ctx ptr) */
if (!revalidate_data_pull(ctx, &data, &data_end, &ip6))
return DROP_INVALID;
#ifdef ENABLE_NODEPORT
if (!ctx_skip_nodeport(ctx)) {
ret = nodeport_lb6(ctx, ip6, *identity, ext_err, &is_dsr);
/* nodeport_lb6() returns TC_ACT_REDIRECT for traffic to
 * the L7 LB. Policy enforcement needs to take place after
 * the L7 LB has processed the packet, so we return to the
 * stack immediately here with TC_ACT_REDIRECT.
 */
if (ret < 0 || ret == TC_ACT_REDIRECT)
return ret;
}
#endif
if (!revalidate_data(ctx, &data, &data_end, &ip6))
return DROP_INVALID;
/* Look up the source in the ipcache. After decryption this is the
 * inner source IP, from which we derive the source security identity.
 */
info = lookup_ip6_remote_endpoint((union v6addr *)&ip6->saddr, 0);
decrypted = ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_DECRYPT);
if (decrypted) {
if (info)
*identity = info->sec_identity;
cilium_dbg(ctx, info ? DBG_IP_ID_MAP_SUCCEED6 : DBG_IP_ID_MAP_FAILED6,
((__u32 *)&ip6->saddr)[3], *identity);
} else {
/* Maybe overwrite the REMOTE_NODE_ID with
 * KUBE_APISERVER_NODE_ID to support upgrades. After v1.12,
 * identity_is_remote_node() should be removed.
 *
 * A packet that has DSR info and comes from `world` may have a
 * specific identity when a CNP using CIDR rules is applied.
 */
if (info && (identity_is_remote_node(*identity) ||
(is_dsr && identity_is_world_ipv6(*identity))))
*identity = info->sec_identity;
}
#ifdef ENABLE_IPSEC
if (!decrypted) {
/* IPSec is not currently enforced (feature coming soon),
 * so for now just handle the packet normally.
 */
if (ip6->nexthdr != IPPROTO_ESP) {
update_metrics(ctx_full_len(ctx), METRIC_INGRESS,
REASON_PLAINTEXT);
goto not_esp;
}
/* Decrypt "key" is determined by SPI */
ctx->mark = MARK_MAGIC_DECRYPT;
/* We are going to pass this up the stack to the IPSec layer
 * on cilium_vxlan, but eth_type_trans has already labeled
 * this as an OTHERHOST type packet. To avoid being dropped
 * by the IP stack before IPSec can process it, mark it as a
 * HOST packet.
 */
ctx_change_type(ctx, PACKET_HOST);
send_trace_notify(ctx, TRACE_TO_STACK, *identity, 0, 0,
ctx->ingress_ifindex, TRACE_REASON_ENCRYPTED, 0);
return CTX_ACT_OK;
}
ctx->mark = 0;
not_esp:
#endif
/* Deliver to local (non-host) endpoint: */
ep = lookup_ip6_endpoint(ip6);
if (ep && !(ep->flags & ENDPOINT_F_HOST))
return ipv6_local_delivery(ctx, l3_off, *identity, MARK_MAGIC_IDENTITY,
ep, METRIC_INGRESS, false, true);
/* A packet entering the node from the tunnel and not going to a local
* endpoint has to be going to the local host.
*/
#ifdef HOST_IFINDEX
if (1) {
union macaddr host_mac = HOST_IFINDEX_MAC;
union macaddr router_mac = NODE_MAC;
ret = ipv6_l3(ctx, ETH_HLEN, (__u8 *)&router_mac.addr,
(__u8 *)&host_mac.addr, METRIC_INGRESS);
if (ret != CTX_ACT_OK)
return ret;
cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY, HOST_IFINDEX);
return ctx_redirect(ctx, HOST_IFINDEX, 0);
}
#else
return CTX_ACT_OK;
#endif
}
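/* Tail-call entry point: recover the source identity stashed in
 * CB_SRC_LABEL by cil_from_overlay, run handle_ipv6() and emit a drop
 * notification on error. The IPv4 tail calls below follow the same
 * wrapper pattern.
 */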
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_FROM_OVERLAY)
int tail_handle_ipv6(struct __ctx_buff *ctx)
{
__u32 src_sec_identity = ctx_load_and_clear_meta(ctx, CB_SRC_LABEL);
__s8 ext_err = 0;
int ret;
ret = handle_ipv6(ctx, &src_sec_identity, &ext_err);
if (IS_ERR(ret))
return send_drop_notify_error_ext(ctx, src_sec_identity, ret, ext_err,
CTX_ACT_DROP, METRIC_INGRESS);
return ret;
}
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
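/* Deliver a packet to the local host: rewrite the L2 header with the
 * router and host MAC addresses, then redirect to HOST_IFINDEX. Without
 * a host interface the packet is simply passed up the stack.
 */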
static __always_inline int ipv4_host_delivery(struct __ctx_buff *ctx, struct iphdr *ip4)
{
#ifdef HOST_IFINDEX
if (1) {
union macaddr host_mac = HOST_IFINDEX_MAC;
union macaddr router_mac = NODE_MAC;
int ret;
ret = ipv4_l3(ctx, ETH_HLEN, (__u8 *)&router_mac.addr,
(__u8 *)&host_mac.addr, ip4);
if (ret != CTX_ACT_OK)
return ret;
cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY, HOST_IFINDEX);
return ctx_redirect(ctx, HOST_IFINDEX, 0);
}
#else
return CTX_ACT_OK;
#endif
}
#if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)
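/* Reverse the SNAT applied to inter-cluster traffic: the cluster ID
 * extracted from the source security identity selects the NAT state,
 * and on success the packet is delivered to the local endpoint it was
 * originally addressed to.
 */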
static __always_inline int handle_inter_cluster_revsnat(struct __ctx_buff *ctx,
__u32 src_sec_identity,
__s8 *ext_err)
{
int ret;
struct iphdr *ip4;
__u32 cluster_id = 0;
void *data_end, *data;
struct endpoint_info *ep;
__u32 cluster_id_from_identity =
extract_cluster_id_from_identity(src_sec_identity);
const struct ipv4_nat_target target = {
.min_port = NODEPORT_PORT_MIN_NAT,
.max_port = NODEPORT_PORT_MAX_NAT,
.cluster_id = cluster_id_from_identity,
};
struct trace_ctx trace;
ret = snat_v4_rev_nat(ctx, &target, &trace, ext_err);
if (ret != NAT_PUNT_TO_STACK && ret != DROP_NAT_NO_MAPPING) {
if (IS_ERR(ret))
return ret;
/*
* RevSNAT succeeded. Identify the remote host using
* cluster_id in the rest of the datapath logic.
*/
cluster_id = cluster_id_from_identity;
}
/* Theoretically, we only need to revalidate data after we
 * perform revSNAT. However, we observed a mysterious
 * verifier error on kernel 4.19: when we only revalidate
 * after the revSNAT, the verifier rejects the subsequent
 * read through the ip4 pointer. To avoid that, we always
 * revalidate data here.
 */
if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
ep = lookup_ip4_endpoint(ip4);
if (ep) {
/* We don't support inter-cluster SNAT from host */
if (ep->flags & ENDPOINT_F_HOST)
return ipv4_host_delivery(ctx, ip4);
return ipv4_local_delivery(ctx, ETH_HLEN, src_sec_identity,
MARK_MAGIC_IDENTITY, ip4, ep,
METRIC_INGRESS, false, false, true,
cluster_id);
}
return DROP_UNROUTABLE;
}
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_INTER_CLUSTER_REVSNAT)
int tail_handle_inter_cluster_revsnat(struct __ctx_buff *ctx)
{
int ret;
__u32 src_sec_identity = ctx_load_and_clear_meta(ctx, CB_SRC_LABEL);
__s8 ext_err = 0;
ret = handle_inter_cluster_revsnat(ctx, src_sec_identity, &ext_err);
if (IS_ERR(ret))
return send_drop_notify_error_ext(ctx, src_sec_identity, ret, ext_err,
CTX_ACT_DROP, METRIC_INGRESS);
return ret;
}
#endif
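/* handle_ipv4 mirrors handle_ipv6 for IPv4 traffic entering through the
 * tunnel, with additional handling for fragments, multicast delivery,
 * VTEP integration, inter-cluster revSNAT and the egress gateway.
 */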
static __always_inline int handle_ipv4(struct __ctx_buff *ctx,
__u32 *identity,
__s8 *ext_err __maybe_unused)
{
struct remote_endpoint_info *info;
void *data_end, *data;
struct iphdr *ip4;
struct endpoint_info *ep;
bool decrypted;
bool __maybe_unused is_dsr = false;
/* verifier workaround (dereference of modified ctx ptr) */
if (!revalidate_data_pull(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
/* If IPv4 fragmentation is disabled
 * AND an IPv4 fragmented packet is received,
 * then drop the packet.
 */
#ifndef ENABLE_IPV4_FRAGMENTS
if (ipv4_is_fragment(ip4))
return DROP_FRAG_NOSUPPORT;
#endif
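/* Packets destined to a multicast group with local subscribers are
 * handed off to the multicast delivery tail call.
 */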
#ifdef ENABLE_MULTICAST
if (IN_MULTICAST(bpf_ntohl(ip4->daddr))) {
if (mcast_lookup_subscriber_map(&ip4->daddr)) {
ep_tail_call(ctx, CILIUM_CALL_MULTICAST_EP_DELIVERY);
return DROP_MISSED_TAIL_CALL;
}
}
#endif /* ENABLE_MULTICAST */
#ifdef ENABLE_NODEPORT
if (!ctx_skip_nodeport(ctx)) {
int ret = nodeport_lb4(ctx, ip4, ETH_HLEN, *identity, ext_err, &is_dsr);
/* nodeport_lb4() returns TC_ACT_REDIRECT for traffic to
 * the L7 LB. Policy enforcement needs to take place after
 * the L7 LB has processed the packet, so we return to the
 * stack immediately here with TC_ACT_REDIRECT.
 */
if (ret < 0 || ret == TC_ACT_REDIRECT)
return ret;
}
#endif
if (!revalidate_data(ctx, &data, &data_end, &ip4))
return DROP_INVALID;
/* Look up the source in the ipcache. After decryption this is the
 * inner source IP, from which we derive the source security identity.
 */
info = lookup_ip4_remote_endpoint(ip4->saddr, 0);
decrypted = ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_DECRYPT);
/* If packets are decrypted the key has already been pushed into metadata. */
if (decrypted) {
if (info)
*identity = info->sec_identity;
cilium_dbg(ctx, info ? DBG_IP_ID_MAP_SUCCEED4 : DBG_IP_ID_MAP_FAILED4,
ip4->saddr, *identity);
} else {
#ifdef ENABLE_VTEP
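/* Traffic whose source falls within a configured VTEP CIDR must carry
 * the world identity, i.e. the VNI that VTEPs encapsulate with (see
 * tail_handle_arp() below); any other VNI is rejected as invalid.
 */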
{
struct vtep_key vkey = {};
struct vtep_value *vtep;
vkey.vtep_ip = ip4->saddr & VTEP_MASK;
vtep = map_lookup_elem(&VTEP_MAP, &vkey);
if (!vtep)
goto skip_vtep;
if (vtep->tunnel_endpoint) {
if (!identity_is_world_ipv4(*identity))
return DROP_INVALID_VNI;
}
}
skip_vtep:
#endif
#if defined(ENABLE_CLUSTER_AWARE_ADDRESSING) && defined(ENABLE_INTER_CLUSTER_SNAT)
{
__u32 cluster_id_from_identity =
extract_cluster_id_from_identity(*identity);
/* When we see inter-cluster communication and the
 * destination is IPV4_INTER_CLUSTER_SNAT, try to
 * perform revSNAT. We tail-call from here since we
 * hit verifier complexity issues when this logic
 * was added inline.
 */
if (cluster_id_from_identity != 0 &&
cluster_id_from_identity != CLUSTER_ID &&
ip4->daddr == IPV4_INTER_CLUSTER_SNAT) {
ctx_store_meta(ctx, CB_SRC_LABEL, *identity);
ep_tail_call(ctx, CILIUM_CALL_IPV4_INTER_CLUSTER_REVSNAT);
return DROP_MISSED_TAIL_CALL;
}
}
#endif
/* See comment at equivalent code in handle_ipv6() */
if (info && (identity_is_remote_node(*identity) ||
(is_dsr && identity_is_world_ipv4(*identity))))
*identity = info->sec_identity;
}
#ifdef ENABLE_IPSEC
if (!decrypted) {
/* IPSec is not currently enforced (feature coming soon),
 * so for now just handle the packet normally.
 */
if (ip4->protocol != IPPROTO_ESP) {
update_metrics(ctx_full_len(ctx), METRIC_INGRESS,
REASON_PLAINTEXT);
goto not_esp;
}
ctx->mark = MARK_MAGIC_DECRYPT;
/* We are going to pass this up the stack to the IPSec layer
 * on cilium_vxlan, but eth_type_trans has already labeled
 * this as an OTHERHOST type packet. To avoid being dropped
 * by the IP stack before IPSec can process it, mark it as a
 * HOST packet.
 */
ctx_change_type(ctx, PACKET_HOST);
send_trace_notify(ctx, TRACE_TO_STACK, *identity, 0, 0,
ctx->ingress_ifindex, TRACE_REASON_ENCRYPTED, 0);
return CTX_ACT_OK;
}
ctx->mark = 0;
not_esp:
#endif
#if defined(ENABLE_EGRESS_GATEWAY_COMMON)
{
__be32 snat_addr, daddr;
int ret;
daddr = ip4->daddr;
if (egress_gw_snat_needed_hook(ip4->saddr, daddr, &snat_addr)) {
ret = ipv4_l3(ctx, ETH_HLEN, NULL, NULL, ip4);
if (unlikely(ret != CTX_ACT_OK))
return ret;
/* to-netdev@bpf_host handles SNAT, so no need to do it here. */
return egress_gw_fib_lookup_and_redirect(ctx, snat_addr,
daddr, ext_err);
}
}
#endif /* ENABLE_EGRESS_GATEWAY_COMMON */
/* Deliver to local (non-host) endpoint: */
ep = lookup_ip4_endpoint(ip4);
if (ep && !(ep->flags & ENDPOINT_F_HOST))
return ipv4_local_delivery(ctx, ETH_HLEN, *identity, MARK_MAGIC_IDENTITY,
ip4, ep, METRIC_INGRESS, false, false, true,
0);
/* A packet entering the node from the tunnel and not going to a local
* endpoint has to be going to the local host.
*/
return ipv4_host_delivery(ctx, ip4);
}
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_FROM_OVERLAY)
int tail_handle_ipv4(struct __ctx_buff *ctx)
{
__u32 src_sec_identity = ctx_load_and_clear_meta(ctx, CB_SRC_LABEL);
__s8 ext_err = 0;
int ret;
ret = handle_ipv4(ctx, &src_sec_identity, &ext_err);
if (IS_ERR(ret))
return send_drop_notify_error_ext(ctx, src_sec_identity, ret, ext_err,
CTX_ACT_DROP, METRIC_INGRESS);
return ret;
}
#ifdef ENABLE_VTEP
/*
* ARP responder for ARP requests from VTEP
* Respond to remote VTEP endpoint with cilium_vxlan MAC
*/
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_ARP)
int tail_handle_arp(struct __ctx_buff *ctx)
{
union macaddr mac = NODE_MAC;
union macaddr smac;
struct trace_ctx trace = {
.reason = TRACE_REASON_CT_REPLY,
.monitor = TRACE_PAYLOAD_LEN,
};
__be32 sip;
__be32 tip;
int ret;
struct bpf_tunnel_key key = {};
struct vtep_key vkey = {};
struct vtep_value *info;
__u32 key_size;
key_size = TUNNEL_KEY_WITHOUT_SRC_IP;
if (unlikely(ctx_get_tunnel_key(ctx, &key, key_size, 0) < 0))
return send_drop_notify_error(ctx, 0, DROP_NO_TUNNEL_KEY, CTX_ACT_DROP,
METRIC_INGRESS);
if (!arp_validate(ctx, &mac, &smac, &sip, &tip) || !__lookup_ip4_endpoint(tip))
goto pass_to_stack;
vkey.vtep_ip = sip & VTEP_MASK;
info = map_lookup_elem(&VTEP_MAP, &vkey);
if (!info)
goto pass_to_stack;
ret = arp_prepare_response(ctx, &mac, tip, &smac, sip);
if (unlikely(ret != 0))
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);
if (info->tunnel_endpoint) {
ret = __encap_and_redirect_with_nodeid(ctx, 0, info->tunnel_endpoint,
LOCAL_NODE_ID, WORLD_IPV4_ID,
WORLD_IPV4_ID, &trace);
if (IS_ERR(ret))
goto drop_err;
return ret;
}
ret = DROP_UNKNOWN_L3;
drop_err:
return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);
pass_to_stack:
send_trace_notify(ctx, TRACE_TO_STACK, 0, 0, 0, ctx->ingress_ifindex,
trace.reason, trace.monitor);
return CTX_ACT_OK;
}
#endif /* ENABLE_VTEP */
#endif /* ENABLE_IPV4 */
#ifdef ENABLE_IPSEC
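/* Report whether the packet carries ESP, used below to pick
 * TRACE_REASON_ENCRYPTED for trace notifications. Only the first
 * next-header field is inspected; IPv6 extension headers are not
 * walked.
 */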
static __always_inline bool is_esp(struct __ctx_buff *ctx, __u16 proto)
{
void *data, *data_end;
__u8 protocol = 0;
struct ipv6hdr *ip6 __maybe_unused;
struct iphdr *ip4 __maybe_unused;
switch (proto) {
#ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
if (!revalidate_data_pull(ctx, &data, &data_end, &ip6))
return false;
protocol = ip6->nexthdr;
break;
#endif
#ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP):
if (!revalidate_data_pull(ctx, &data, &data_end, &ip4))
return false;
protocol = ip4->protocol;
break;
#endif
default:
return false;
}
return protocol == IPPROTO_ESP;
}
#endif /* ENABLE_IPSEC */
/* Attached to the ingress of cilium_vxlan/cilium_geneve to execute on packets
* entering the node via the tunnel.
*/
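/* Overall flow: classify the packet by ethertype, recover the source
 * security identity from the tunnel key (unless the packet was already
 * decrypted by the xfrm layer), emit a trace event, and tail-call into
 * the per-protocol handler.
 */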
__section_entry
int cil_from_overlay(struct __ctx_buff *ctx)
{
__u32 src_sec_identity = 0;
bool decrypted;
__u16 proto;
int ret;
ctx_skip_nodeport_clear(ctx);
if (!validate_ethertype(ctx, &proto)) {
/* Pass unknown traffic to the stack */
ret = CTX_ACT_OK;
goto out;
}
/* We need to handle the following possible packets coming into this program:
*
* 1. ESP packets coming from overlay (encrypted and not marked)
* 2. Non-ESP packets coming from overlay (plain and not marked)
* 3. Non-ESP packets coming from stack re-inserted by xfrm (plain
* and marked with MARK_MAGIC_DECRYPT. Only in IPSec mode.)
*
* 1. will be traced with TRACE_REASON_ENCRYPTED
* 2. will be traced without TRACE_REASON_ENCRYPTED
* 3. will be traced without TRACE_REASON_ENCRYPTED
*
 * Note that case 1 also covers ESP packets generated by someone else.
 * In that case we trace them as "encrypted", but that doesn't mean
 * "encrypted by Cilium".
*
* When IPSec is disabled, we won't use TRACE_REASON_ENCRYPTED even
* if the packets are ESP, because it doesn't matter for the
* non-IPSec mode.
*/
decrypted = ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_DECRYPT);
switch (proto) {
#if defined(ENABLE_IPV4) || defined(ENABLE_IPV6)
#ifdef ENABLE_IPV6
case bpf_htons(ETH_P_IPV6):
#endif
#ifdef ENABLE_IPV4
case bpf_htons(ETH_P_IP):
#endif
/* If packets are decrypted the key has already been pushed into metadata. */
if (!decrypted) {
struct bpf_tunnel_key key = {};
#ifdef ENABLE_HIGH_SCALE_IPCACHE
/* already set by decapsulate_overlay(): */
key.tunnel_id = ctx_load_meta(ctx, CB_SRC_LABEL);
#else
__u32 key_size = TUNNEL_KEY_WITHOUT_SRC_IP;
if (unlikely(ctx_get_tunnel_key(ctx, &key, key_size, 0) < 0)) {
ret = DROP_NO_TUNNEL_KEY;
goto out;
}
#endif /* ENABLE_HIGH_SCALE_IPCACHE */
cilium_dbg(ctx, DBG_DECAP, key.tunnel_id, key.tunnel_label);
src_sec_identity = get_id_from_tunnel_id(key.tunnel_id, proto);
/* Any encapsulating node will map a HOST_ID source so that
 * it is presented as REMOTE_NODE_ID; therefore any attempt
 * to signal HOST_ID as the source from a remote node can be
 * dropped.
 */
if (src_sec_identity == HOST_ID) {
ret = DROP_INVALID_IDENTITY;
goto out;
}
ctx_store_meta(ctx, CB_SRC_LABEL, src_sec_identity);
}
break;
#endif /* ENABLE_IPV4 || ENABLE_IPV6 */
default:
break;
}
#ifdef ENABLE_IPSEC
if (is_esp(ctx, proto))
send_trace_notify(ctx, TRACE_FROM_OVERLAY, src_sec_identity, 0, 0,
ctx->ingress_ifindex, TRACE_REASON_ENCRYPTED, 0);
else
#endif
{
enum trace_point obs_point = TRACE_FROM_OVERLAY;
/* Non-ESP packet marked with MARK_MAGIC_DECRYPT is a packet
* re-inserted from the stack.
*/
if (decrypted)
obs_point = TRACE_FROM_STACK;
send_trace_notify(ctx, obs_point, src_sec_identity, 0, 0,
ctx->ingress_ifindex,
TRACE_REASON_UNKNOWN, TRACE_PAYLOAD_LEN);
}
switch (proto) {
case bpf_htons(ETH_P_IPV6):
#ifdef ENABLE_IPV6
ep_tail_call(ctx, CILIUM_CALL_IPV6_FROM_OVERLAY);
ret = DROP_MISSED_TAIL_CALL;
#else
ret = DROP_UNKNOWN_L3;
#endif
break;
case bpf_htons(ETH_P_IP):
#ifdef ENABLE_IPV4
# ifdef ENABLE_HIGH_SCALE_IPCACHE
# if defined(ENABLE_DSR) && DSR_ENCAP_MODE == DSR_ENCAP_GENEVE
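/* High-scale ipcache with Geneve DSR: DSR state stashed in
 * CB_HSIPC_ADDR_V4 / CB_HSIPC_PORT by an earlier program is re-encoded
 * here as a Geneve tunnel option, presumably so the original service
 * address and port are preserved across the next hop.
 */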
if (ctx_load_meta(ctx, CB_HSIPC_ADDR_V4)) {
struct geneve_dsr_opt4 dsr_opt;
struct bpf_tunnel_key key = {};
set_geneve_dsr_opt4((__be16)ctx_load_meta(ctx, CB_HSIPC_PORT),
ctx_load_meta(ctx, CB_HSIPC_ADDR_V4),
&dsr_opt);
/* Needed to create the metadata_dst for storing tunnel opts: */
if (ctx_set_tunnel_key(ctx, &key, sizeof(key), BPF_F_ZERO_CSUM_TX) < 0) {
ret = DROP_WRITE_ERROR;
goto out;
}
if (ctx_set_tunnel_opt(ctx, &dsr_opt, sizeof(dsr_opt)) < 0) {
ret = DROP_WRITE_ERROR;
goto out;
}
}
# endif
# endif
ep_tail_call(ctx, CILIUM_CALL_IPV4_FROM_OVERLAY);
ret = DROP_MISSED_TAIL_CALL;
#else
ret = DROP_UNKNOWN_L3;
#endif
break;
#ifdef ENABLE_VTEP
case bpf_htons(ETH_P_ARP):
ep_tail_call(ctx, CILIUM_CALL_ARP);
ret = DROP_MISSED_TAIL_CALL;
break;
#endif
default:
/* Pass unknown traffic to the stack */
ret = CTX_ACT_OK;
}
out:
if (IS_ERR(ret))
return send_drop_notify_error(ctx, src_sec_identity, ret,
CTX_ACT_DROP, METRIC_INGRESS);
return ret;
}
/* Attached to the egress of cilium_vxlan/cilium_geneve to execute on packets
* leaving the node via the tunnel.
*/
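/* The tunnel device performs the actual encapsulation in the stack;
 * this program only enforces the bandwidth manager's EDT rate limiting
 * and runs handle_nat_fwd() for NodePort NAT unless SNAT was already
 * applied.
 */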
__section_entry
int cil_to_overlay(struct __ctx_buff *ctx)
{
struct trace_ctx __maybe_unused trace;
int ret = TC_ACT_OK;
__u32 cluster_id __maybe_unused = 0;
__s8 ext_err = 0;
#ifdef ENABLE_BANDWIDTH_MANAGER
/* In tunneling mode, we should do this as close as possible to the
 * phys dev where FQ runs, but the issue is that the aggregate state
 * (in queue_mapping) is overridden on tunnel xmit. Hence set the
 * timestamp already here. The tunnel dev has a noqueue qdisc, so as
 * a tradeoff it's close enough.
 */
ret = edt_sched_departure(ctx);
/* No send_drop_notify_error() here given we're rate-limiting. */
if (ret == CTX_ACT_DROP) {
update_metrics(ctx_full_len(ctx), METRIC_EGRESS,
-DROP_EDT_HORIZON);
return CTX_ACT_DROP;
}
#endif
#ifdef ENABLE_NODEPORT
if (ctx_snat_done(ctx)) {
ret = CTX_ACT_OK;
goto out;
}
/* This must come after the ctx_snat_done() check above, since
 * MARK_MAGIC_CLUSTER_ID is a superset of MARK_MAGIC_SNAT_DONE. They
 * will never be used together, but the SNAT check should always take
 * precedence.
 */
#ifdef ENABLE_CLUSTER_AWARE_ADDRESSING
cluster_id = ctx_get_cluster_id_mark(ctx);
#endif
ret = handle_nat_fwd(ctx, cluster_id, &trace, &ext_err);
out:
#endif
if (IS_ERR(ret))
return send_drop_notify_error_ext(ctx, 0, ret, ext_err,
CTX_ACT_DROP, METRIC_EGRESS);
return ret;
}
BPF_LICENSE("Dual BSD/GPL");