/*
* Copyright (c) 2013-2017 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
/*
* THEORY OF OPERATION
*
* The socket content filter subsystem provides a way for user space agents to
* make filtering decisions based on the content of the data being sent and
* received by TCP/IP sockets.
*
* A content filter user space agent gets a copy of the data, and the data is
* also kept in a kernel buffer until the user space agent makes a pass or drop
* decision. This unidirectional flow of content avoids unnecessary data copies
* back to the kernel.
*
* A user space filter agent opens a kernel control socket with the name
* CONTENT_FILTER_CONTROL_NAME to attach to the socket content filter subsystem.
* When connected, a "struct content_filter" is created and set as the
* "unitinfo" of the corresponding kernel control socket instance.
*
* The socket content filter subsystem exchanges messages with the user space
* filter agent until an ultimate pass or drop decision is made by the
* user space filter agent.
*
* It should be noted that messages about many TCP/IP sockets can be multiplexed
* over a single kernel control socket.
*
* Notes:
* - The current implementation is limited to TCP sockets.
* - The current implementation supports up to two simultaneous content filters
* for the sake of simplicity of the implementation.
*
*
* NECP FILTER CONTROL UNIT
*
* A user space filter agent uses the Network Extension Control Policy (NECP)
* database to specify which TCP/IP sockets need to be filtered. The NECP
* criteria may be based on a variety of properties like user ID or proc UUID.
*
* The NECP "filter control unit" is used by the socket content filter subsystem
* to deliver the relevant TCP/IP content information to the appropriate
* user space filter agent via its kernel control socket instance.
* This works as follows:
*
* 1) The user space filter agent specifies an NECP filter control unit when
* it adds its filtering rules to the NECP database.
*
* 2) The user space filter agent also sets its NECP filter control unit on the
* content filter kernel control socket via the socket option
* CFIL_OPT_NECP_CONTROL_UNIT.
*
* 3) The NECP database is consulted to find out if a given TCP/IP socket
* needs to be subjected to content filtering and returns the corresponding
* NECP filter control unit -- the NECP filter control unit is actually
* stored in the TCP/IP socket structure so the NECP lookup is really simple.
*
* 4) The NECP filter control unit is then used to find the corresponding
* kernel control socket instance.
*
* Note: NECP currently supports a single filter control unit per TCP/IP socket
* but this restriction may soon be lifted.
*
*
* THE MESSAGING PROTOCOL
*
* The socket content filter subsystem and a user space filter agent
* communicate over the kernel control socket via an asynchronous
* messaging protocol (this is not a request-response protocol).
* The socket content filter subsystem sends event messages to the user
* space filter agent about the TCP/IP sockets it is interested to filter.
* The user space filter agent sends action messages to either allow
* data to pass or to disallow the data flow (and drop the connection).
*
* All messages over a content filter kernel control socket share the same
* common header of type "struct cfil_msg_hdr". The message type tells whether
* it's an event message "CFM_TYPE_EVENT" or an action message "CFM_TYPE_ACTION".
* The message header field "cfm_sock_id" identifies a given TCP/IP socket.
* Note the message header length field may be padded for alignment and can
* be larger than the actual content of the message.
* The field "cfm_op" describes the kind of event or action.
*
* Here are the kinds of content filter events:
* - CFM_OP_SOCKET_ATTACHED: a new TCP/IP socket is being filtered
* - CFM_OP_SOCKET_CLOSED: A TCP/IP socket is closed
* - CFM_OP_DATA_OUT: A span of data is being sent on a TCP/IP socket
* - CFM_OP_DATA_IN: A span of data is being received on a TCP/IP socket
*
*
* EVENT MESSAGES
*
* The CFM_OP_DATA_OUT and CFM_OP_DATA_IN event messages contain a span of
* data that is being sent or received. The position of this span of data
* in the data flow is described by a pair of start and end offsets. These
* are absolute 64-bit offsets. The first byte sent (or received) starts
* at offset 0 and ends at offset 1. The length of the content data
* is given by the difference between the end offset and the start offset.
*
* After a CFM_OP_SOCKET_ATTACHED is delivered, CFM_OP_DATA_OUT and
* CFM_OP_DATA_IN events are not delivered until a CFM_OP_DATA_UPDATE
* action message is sent by the user space filter agent.
*
* Note: absolute 64-bit offsets should be large enough for the foreseeable
* future. A 64-bit counter will wrap after 468 years at 10 Gbit/sec:
* 2^64 / ((10 * 10^9 / 8) * 60 * 60 * 24 * 365.25) = 467.63
*
* There are two kinds of primary content filter actions:
* - CFM_OP_DATA_UPDATE: to update pass or peek offsets for each direction.
* - CFM_OP_DROP: to shut down the socket and disallow further data flow.
*
* There is also an action to mark a given client flow as already filtered
* at a higher level, CFM_OP_BLESS_CLIENT.
*
*
* ACTION MESSAGES
*
* The CFM_OP_DATA_UPDATE action messages let the user space filter
* agent allow data to flow up to the specified pass offset -- there
* is a pass offset for outgoing data and a pass offset for incoming data.
* When a new TCP/IP socket is attached to the content filter, each pass offset
* is initially set to 0 so no data is allowed to pass by default.
* When the pass offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
* then the data flow becomes unrestricted.
*
* Note that pass offsets can only be incremented. A CFM_OP_DATA_UPDATE message
* with a pass offset smaller than the pass offset of a previous
* CFM_OP_DATA_UPDATE message is silently ignored.
*
* A user space filter agent also uses CFM_OP_DATA_UPDATE action messages
* to tell the kernel how much data it wants to see by using the peek offsets.
* Just like pass offsets, there is a peek offset for each direction.
* When a new TCP/IP socket is attached to the content filter, each peek offset
* is initially set to 0 so no CFM_OP_DATA_OUT and CFM_OP_DATA_IN event
* messages are dispatched by default until a CFM_OP_DATA_UPDATE action message
* with a peek offset greater than 0 is sent by the user space filter agent.
* When the peek offset is set to CFM_MAX_OFFSET via a CFM_OP_DATA_UPDATE
* then the flow of data events becomes unrestricted.
*
* Note that peek offsets cannot be smaller than the corresponding pass offset.
* Also, a peek offset cannot be smaller than the corresponding end offset
* of the last CFM_OP_DATA_OUT/CFM_OP_DATA_IN message dispatched. Attempts
* to set a smaller peek value are silently ignored.
*
* (Minimal user space sketches of the attach sequence and of this messaging
* protocol follow this comment block.)
*
*
* PER SOCKET "struct cfil_info"
*
* As soon as a TCP/IP socket gets attached to a content filter, a
* "struct cfil_info" is created to hold the content filtering state for this
* socket.
*
* The content filtering state is made of the following information
* for each direction:
* - The current pass offset;
* - The first and last offsets of the data pending, waiting for a filtering
* decision;
* - The inject queue for data that passed the filters and that needs
* to be re-injected;
* - A content filter specific state in a set of "struct cfil_entry"
*
*
* CONTENT FILTER STATE "struct cfil_entry"
*
* The "struct cfil_entry" maintains the information most relevant to the
* message handling over a kernel control socket with a user space filter agent.
*
* The "struct cfil_entry" holds the NECP filter control unit that corresponds
* to the kernel control socket unit it corresponds to and also has a pointer
* to the corresponding "struct content_filter".
*
* For each direction, "struct cfil_entry" maintains the following information:
* - The pass offset
* - The peek offset
* - The offset of the last data peeked at by the filter
* - A queue of data that's waiting to be delivered to the user space filter
* agent on the kernel control socket
* - A queue of data for which event messages have been sent on the kernel
* control socket and are pending for a filtering decision.
*
*
* CONTENT FILTER QUEUES
*
* Data that is being filtered is steered away from the TCP/IP socket buffer
* and instead will sit in one of three content filter queues until the data
* can be re-injected into the TCP/IP socket buffer.
*
* A content filter queue is represented by "struct cfil_queue" that contains
* a list of mbufs and the start and end offset of the data span of
* the list of mbufs.
*
* The data moves into the three content filter queues according to this
* sequence:
* a) The "cfe_ctl_q" of "struct cfil_entry"
* b) The "cfe_pending_q" of "struct cfil_entry"
* c) The "cfi_inject_q" of "struct cfil_info"
*
* Note: The sequence (a),(b) may be repeated several times if there is more
* than one content filter attached to the TCP/IP socket.
*
* The "cfe_ctl_q" queue holds data than cannot be delivered to the
* kernel conntrol socket for two reasons:
* - The peek offset is less that the end offset of the mbuf data
* - The kernel control socket is flow controlled
*
* The "cfe_pending_q" queue holds data for which CFM_OP_DATA_OUT or
* CFM_OP_DATA_IN have been successfully dispatched to the kernel control
* socket and are waiting for a pass action message from the user space
* filter agent. An mbuf must be fully allowed to pass before it is removed
* from the cfe_pending_q.
*
* The "cfi_inject_q" queue holds data that has been fully allowed to pass
* by the user space filter agent and that needs to be re-injected into the
* TCP/IP socket.
*
*
* IMPACT ON FLOW CONTROL
*
* An essential aspect of the content filter subsystem is to minimize the
* impact on flow control of the TCP/IP sockets being filtered.
*
* The processing overhead of the content filtering may have an effect on
* flow control by adding noticeable delays and cannot be eliminated --
* care must be taken by the user space filter agent to minimize the
* processing delays.
*
* The amount of data being filtered is kept in buffers while waiting for
* a decision by the user space filter agent. This amount of data pending
* needs to be subtracted from the amount of data available in the
* corresponding TCP/IP socket buffer. This is done by modifying
* sbspace() and tcp_sbspace() to account for the amount of data pending
* in the content filter.
*
*
* LOCKING STRATEGY
*
* The global state of content filter subsystem is protected by a single
* read-write lock "cfil_lck_rw". The data flow can be done with the
* cfil read-write lock held as shared so it can be re-entered from multiple
* threads.
*
* The per TCP/IP socket content filter state -- "struct cfil_info" -- is
* protected by the socket lock.
*
* A TCP/IP socket lock cannot be taken while the cfil read-write lock
* is held. That's why we have some sequences where we drop the cfil read-write
* lock before taking the TCP/IP lock.
*
* It is also important to lock the TCP/IP socket buffer while the content
* filter is modifying the amount of pending data. Otherwise the calculations
* in sbspace() and tcp_sbspace() could be wrong.
*
* The "cfil_lck_rw" protects "struct content_filter" and also the fields
* "cfe_link" and "cfe_filter" of "struct cfil_entry".
*
* In fact, "cfe_link" and "cfe_filter" are protected both by
* "cfil_lck_rw" and the socket lock: they may be modified only when
* "cfil_lck_rw" is exclusive and the socket is locked.
*
* To read the other fields of "struct content_filter" we have to take
* "cfil_lck_rw" in shared mode.
*
*
* LIMITATIONS
*
* - For TCP sockets only
*
* - Does not support TCP unordered messages
*/
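/*
 * For illustration, a minimal user space sketch of the attach sequence
 * described above: open a kernel control socket to
 * CONTENT_FILTER_CONTROL_NAME and bind it to an NECP filter control unit
 * via CFIL_OPT_NECP_CONTROL_UNIT. This is a sketch only, hence compiled
 * out: it assumes the declarations of <net/content_filter.h> are visible
 * to user space, that the agent used NECP filter control unit 1 in its
 * filtering rules, and elides entitlement and error reporting details.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sys_domain.h>
#include <sys/kern_control.h>
#include <net/content_filter.h>
#include <string.h>
#include <unistd.h>

static int
cfil_agent_attach(void)
{
	struct ctl_info info;
	struct sockaddr_ctl addr;
	uint32_t necp_control_unit = 1;	/* must match the NECP rules */
	int fd;

	fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
	if (fd == -1)
		return (-1);

	/* Resolve the content filter control name to a control id */
	memset(&info, 0, sizeof(info));
	strlcpy(info.ctl_name, CONTENT_FILTER_CONTROL_NAME,
	    sizeof(info.ctl_name));
	if (ioctl(fd, CTLIOCGINFO, &info) == -1)
		goto fail;

	/* Attach as one of the MAX_CONTENT_FILTER units; kcunits start at 1 */
	memset(&addr, 0, sizeof(addr));
	addr.sc_len = sizeof(addr);
	addr.sc_family = AF_SYSTEM;
	addr.ss_sysaddr = AF_SYS_CONTROL;
	addr.sc_id = info.ctl_id;
	addr.sc_unit = 1;
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == -1)
		goto fail;

	/* Deliver only the flows tagged with our NECP filter control unit */
	if (setsockopt(fd, SYSPROTO_CONTROL, CFIL_OPT_NECP_CONTROL_UNIT,
	    &necp_control_unit, sizeof(necp_control_unit)) == -1)
		goto fail;

	return (fd);
fail:
	close(fd);
	return (-1);
}
#endif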
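/*
 * Continuing the sketch: the asynchronous message loop. The agent reads
 * event messages and answers with CFM_OP_DATA_UPDATE action messages that
 * raise the pass and peek offsets. The layouts are assumed to match
 * "struct cfil_msg_hdr" and "struct cfil_msg_action" of
 * <net/content_filter.h>; a real agent would inspect the peeked data
 * before raising the pass offset, or send CFM_OP_DROP instead.
 */
#if 0
static void
cfil_agent_pass_all(int fd, cfil_sock_id_t sock_id)
{
	struct cfil_msg_action action;

	memset(&action, 0, sizeof(action));
	action.cfa_msghdr.cfm_len = sizeof(action);
	action.cfa_msghdr.cfm_version = CFM_VERSION_CURRENT;
	action.cfa_msghdr.cfm_type = CFM_TYPE_ACTION;
	action.cfa_msghdr.cfm_op = CFM_OP_DATA_UPDATE;
	action.cfa_msghdr.cfm_sock_id = sock_id;
	/* CFM_MAX_OFFSET makes the data flow and data events unrestricted */
	action.cfa_in_pass_offset = CFM_MAX_OFFSET;
	action.cfa_in_peek_offset = CFM_MAX_OFFSET;
	action.cfa_out_pass_offset = CFM_MAX_OFFSET;
	action.cfa_out_peek_offset = CFM_MAX_OFFSET;

	(void)send(fd, &action, sizeof(action), 0);
}

static void
cfil_agent_loop(int fd)
{
	union {
		struct cfil_msg_hdr hdr;
		uint8_t bytes[65536];
	} buf;
	ssize_t n;

	while ((n = recv(fd, &buf, sizeof(buf), 0)) > 0) {
		if ((size_t)n < sizeof(buf.hdr) ||
		    buf.hdr.cfm_type != CFM_TYPE_EVENT)
			continue;
		switch (buf.hdr.cfm_op) {
		case CFM_OP_SOCKET_ATTACHED:
			/* Nothing passes until the offsets are raised */
			cfil_agent_pass_all(fd, buf.hdr.cfm_sock_id);
			break;
		case CFM_OP_DATA_OUT:
		case CFM_OP_DATA_IN:
		case CFM_OP_SOCKET_CLOSED:
		default:
			/* A real agent would inspect the data here */
			break;
		}
	}
}
#endif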
/*
* TO DO LIST
*
* SOONER:
*
* Deal with OOB
*
* LATER:
*
* If datagram support is added, enqueue control and address mbufs as well
*/
#include <sys/types.h>
#include <sys/kern_control.h>
#include <sys/queue.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/syslog.h>
#include <kern/locks.h>
#include <kern/zalloc.h>
#include <kern/debug.h>
#include <net/content_filter.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <string.h>
#include <libkern/libkern.h>
#define MAX_CONTENT_FILTER 2
struct cfil_entry;
/*
* The structure content_filter represents a user space content filter.
* It is created and associated with a kernel control socket instance.
*/
struct content_filter {
kern_ctl_ref cf_kcref;
u_int32_t cf_kcunit;
u_int32_t cf_flags;
uint32_t cf_necp_control_unit;
uint32_t cf_sock_count;
TAILQ_HEAD(, cfil_entry) cf_sock_entries;
};
#define CFF_ACTIVE 0x01
#define CFF_DETACHING 0x02
#define CFF_FLOW_CONTROLLED 0x04
struct content_filter **content_filters = NULL;
uint32_t cfil_active_count = 0; /* Number of active content filters */
uint32_t cfil_sock_attached_count = 0; /* Number of socket attachments */
uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */
static kern_ctl_ref cfil_kctlref = NULL;
static lck_grp_attr_t *cfil_lck_grp_attr = NULL;
static lck_attr_t *cfil_lck_attr = NULL;
static lck_grp_t *cfil_lck_grp = NULL;
decl_lck_rw_data(static, cfil_lck_rw);
#define CFIL_RW_LCK_MAX 8
int cfil_rw_nxt_lck = 0;
void* cfil_rw_lock_history[CFIL_RW_LCK_MAX];
int cfil_rw_nxt_unlck = 0;
void* cfil_rw_unlock_history[CFIL_RW_LCK_MAX];
#define CONTENT_FILTER_ZONE_NAME "content_filter"
#define CONTENT_FILTER_ZONE_MAX 10
static struct zone *content_filter_zone = NULL; /* zone for content_filter */
#define CFIL_INFO_ZONE_NAME "cfil_info"
#define CFIL_INFO_ZONE_MAX 1024
static struct zone *cfil_info_zone = NULL; /* zone for cfil_info */
MBUFQ_HEAD(cfil_mqhead);
struct cfil_queue {
uint64_t q_start; /* offset of first byte in queue */
uint64_t q_end; /* offset of last byte in queue */
struct cfil_mqhead q_mq;
};
/*
* struct cfil_entry
*
* There is one entry per content filter
*/
struct cfil_entry {
TAILQ_ENTRY(cfil_entry) cfe_link;
struct content_filter *cfe_filter;
struct cfil_info *cfe_cfil_info;
uint32_t cfe_flags;
uint32_t cfe_necp_control_unit;
struct timeval cfe_last_event; /* To user space */
struct timeval cfe_last_action; /* From user space */
struct cfe_buf {
/*
* cfe_pending_q holds data that has been delivered to
* the filter and for which we are waiting for an action
*/
struct cfil_queue cfe_pending_q;
/*
* This queue is for data that has not been delivered to
* the content filter (new data, data past the peek offset, or flow control)
*/
struct cfil_queue cfe_ctl_q;
uint64_t cfe_pass_offset;
uint64_t cfe_peek_offset;
uint64_t cfe_peeked;
} cfe_snd, cfe_rcv;
};
#define CFEF_CFIL_ATTACHED 0x0001 /* was attached to filter */
#define CFEF_SENT_SOCK_ATTACHED 0x0002 /* sock attach event was sent */
#define CFEF_DATA_START 0x0004 /* can send data event */
#define CFEF_FLOW_CONTROLLED 0x0008 /* wait for flow control lift */
#define CFEF_SENT_DISCONNECT_IN 0x0010 /* event was sent */
#define CFEF_SENT_DISCONNECT_OUT 0x0020 /* event was sent */
#define CFEF_SENT_SOCK_CLOSED 0x0040 /* closed event was sent */
#define CFEF_CFIL_DETACHED 0x0080 /* filter was detached */
#define CFI_ADD_TIME_LOG(cfil, t1, t0, op) do { \
struct timeval _tdiff; \
if ((cfil)->cfi_op_list_ctr < CFI_MAX_TIME_LOG_ENTRY) { \
timersub(t1, t0, &_tdiff); \
(cfil)->cfi_op_time[(cfil)->cfi_op_list_ctr] = (uint32_t)(_tdiff.tv_sec * 1000 + _tdiff.tv_usec / 1000);\
(cfil)->cfi_op_list[(cfil)->cfi_op_list_ctr] = (unsigned char)op; \
(cfil)->cfi_op_list_ctr++; \
} \
} while (0)
/*
* struct cfil_info
*
* There is a struct cfil_info per socket
*/
struct cfil_info {
TAILQ_ENTRY(cfil_info) cfi_link;
struct socket *cfi_so;
uint64_t cfi_flags;
uint64_t cfi_sock_id;
struct timeval64 cfi_first_event;
uint32_t cfi_op_list_ctr;
uint32_t cfi_op_time[CFI_MAX_TIME_LOG_ENTRY]; /* time interval in milliseconds since first event */
unsigned char cfi_op_list[CFI_MAX_TIME_LOG_ENTRY];
struct cfi_buf {
/*
* cfi_pending_first and cfi_pending_last describe the total
* amount of data outstanding for all the filters on
* this socket and data in the flow queue
* cfi_pending_mbcnt counts in sballoc() "chars of mbufs used"
*/
uint64_t cfi_pending_first;
uint64_t cfi_pending_last;
int cfi_pending_mbcnt;
/*
* cfi_pass_offset is the minimum pass offset of all the filters
*/
uint64_t cfi_pass_offset;
/*
* cfi_inject_q holds data that needs to be re-injected
* into the socket after filtering and that can
* be queued because of flow control
*/
struct cfil_queue cfi_inject_q;
} cfi_snd, cfi_rcv;
struct cfil_entry cfi_entries[MAX_CONTENT_FILTER];
} __attribute__((aligned(8)));
#define CFIF_DROP 0x0001 /* drop action applied */
#define CFIF_CLOSE_WAIT 0x0002 /* waiting for filter to close */
#define CFIF_SOCK_CLOSED 0x0004 /* socket is closed */
#define CFIF_RETRY_INJECT_IN 0x0010 /* inject in failed */
#define CFIF_RETRY_INJECT_OUT 0x0020 /* inject out failed */
#define CFIF_SHUT_WR 0x0040 /* shutdown write */
#define CFIF_SHUT_RD 0x0080 /* shutdown read */
#define CFI_MASK_GENCNT 0xFFFFFFFF00000000 /* upper 32 bits */
#define CFI_SHIFT_GENCNT 32
#define CFI_MASK_FLOWHASH 0x00000000FFFFFFFF /* lower 32 bits */
#define CFI_SHIFT_FLOWHASH 0
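/*
 * For illustration: how a cfil_sock_id_t decomposes under the masks above.
 * The upper 32 bits carry the socket generation count, the lower 32 bits
 * the flow hash. A sketch only; the code below open codes these shifts
 * where the sock id is built and looked up.
 */
static inline uint32_t
cfil_sock_id_gencnt(cfil_sock_id_t sock_id)
{
	return ((uint32_t)((sock_id & CFI_MASK_GENCNT) >> CFI_SHIFT_GENCNT));
}

static inline uint32_t
cfil_sock_id_flowhash(cfil_sock_id_t sock_id)
{
	return ((uint32_t)((sock_id & CFI_MASK_FLOWHASH) >> CFI_SHIFT_FLOWHASH));
}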
TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
#define CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
#define CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x)
/*
* Statistics
*/
struct cfil_stats cfil_stats;
/*
* For troubleshooting
*/
int cfil_log_level = LOG_ERR;
int cfil_debug = 1;
/*
* Sysctls for logs and statistics
*/
static int sysctl_cfil_filter_list(struct sysctl_oid *, void *, int,
struct sysctl_req *);
static int sysctl_cfil_sock_list(struct sysctl_oid *, void *, int,
struct sysctl_req *);
SYSCTL_NODE(_net, OID_AUTO, cfil, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "cfil");
SYSCTL_INT(_net_cfil, OID_AUTO, log, CTLFLAG_RW|CTLFLAG_LOCKED,
&cfil_log_level, 0, "");
SYSCTL_INT(_net_cfil, OID_AUTO, debug, CTLFLAG_RW|CTLFLAG_LOCKED,
&cfil_debug, 0, "");
SYSCTL_UINT(_net_cfil, OID_AUTO, sock_attached_count, CTLFLAG_RD|CTLFLAG_LOCKED,
&cfil_sock_attached_count, 0, "");
SYSCTL_UINT(_net_cfil, OID_AUTO, active_count, CTLFLAG_RD|CTLFLAG_LOCKED,
&cfil_active_count, 0, "");
SYSCTL_UINT(_net_cfil, OID_AUTO, close_wait_timeout, CTLFLAG_RW|CTLFLAG_LOCKED,
&cfil_close_wait_timeout, 0, "");
static int cfil_sbtrim = 1;
SYSCTL_UINT(_net_cfil, OID_AUTO, sbtrim, CTLFLAG_RW|CTLFLAG_LOCKED,
&cfil_sbtrim, 0, "");
SYSCTL_PROC(_net_cfil, OID_AUTO, filter_list, CTLFLAG_RD|CTLFLAG_LOCKED,
0, 0, sysctl_cfil_filter_list, "S,cfil_filter_stat", "");
SYSCTL_PROC(_net_cfil, OID_AUTO, sock_list, CTLFLAG_RD|CTLFLAG_LOCKED,
0, 0, sysctl_cfil_sock_list, "S,cfil_sock_stat", "");
SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD|CTLFLAG_LOCKED,
&cfil_stats, cfil_stats, "");
/*
* Forward declarations to appease the compiler
*/
static int cfil_action_data_pass(struct socket *, uint32_t, int,
uint64_t, uint64_t);
static int cfil_action_drop(struct socket *, uint32_t);
static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *);
static int cfil_dispatch_closed_event(struct socket *, int);
static int cfil_data_common(struct socket *, int, struct sockaddr *,
struct mbuf *, struct mbuf *, uint32_t);
static int cfil_data_filter(struct socket *, uint32_t, int,
struct mbuf *, uint64_t);
static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
struct in_addr, u_int16_t);
static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
struct in6_addr *, u_int16_t);
static int cfil_dispatch_attach_event(struct socket *, uint32_t);
static void cfil_info_free(struct socket *, struct cfil_info *);
static struct cfil_info * cfil_info_alloc(struct socket *);
static int cfil_info_attach_unit(struct socket *, uint32_t);
static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t);
static struct socket *cfil_socket_from_client_uuid(uuid_t, bool *);
static int cfil_service_pending_queue(struct socket *, uint32_t, int);
static int cfil_data_service_ctl_q(struct socket *, uint32_t, int);
static void cfil_info_verify(struct cfil_info *);
static int cfil_update_data_offsets(struct socket *, uint32_t, int,
uint64_t, uint64_t);
static int cfil_acquire_sockbuf(struct socket *, int);
static void cfil_release_sockbuf(struct socket *, int);
static int cfil_filters_attached(struct socket *);
static void cfil_rw_lock_exclusive(lck_rw_t *);
static void cfil_rw_unlock_exclusive(lck_rw_t *);
static void cfil_rw_lock_shared(lck_rw_t *);
static void cfil_rw_unlock_shared(lck_rw_t *);
static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *);
static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
static unsigned int cfil_data_length(struct mbuf *, int *);
/*
* Content filter global read write lock
*/
static void
cfil_rw_lock_exclusive(lck_rw_t *lck)
{
void *lr_saved;
lr_saved = __builtin_return_address(0);
lck_rw_lock_exclusive(lck);
cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
}
static void
cfil_rw_unlock_exclusive(lck_rw_t *lck)
{
void *lr_saved;
lr_saved = __builtin_return_address(0);
lck_rw_unlock_exclusive(lck);
cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
}
static void
cfil_rw_lock_shared(lck_rw_t *lck)
{
void *lr_saved;
lr_saved = __builtin_return_address(0);
lck_rw_lock_shared(lck);
cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
}
static void
cfil_rw_unlock_shared(lck_rw_t *lck)
{
void *lr_saved;
lr_saved = __builtin_return_address(0);
lck_rw_unlock_shared(lck);
cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
}
static boolean_t
cfil_rw_lock_shared_to_exclusive(lck_rw_t *lck)
{
void *lr_saved;
boolean_t upgraded;
lr_saved = __builtin_return_address(0);
upgraded = lck_rw_lock_shared_to_exclusive(lck);
if (upgraded) {
cfil_rw_unlock_history[cfil_rw_nxt_unlck] = lr_saved;
cfil_rw_nxt_unlck = (cfil_rw_nxt_unlck + 1) % CFIL_RW_LCK_MAX;
}
return (upgraded);
}
static void
cfil_rw_lock_exclusive_to_shared(lck_rw_t *lck)
{
void *lr_saved;
lr_saved = __builtin_return_address(0);
lck_rw_lock_exclusive_to_shared(lck);
cfil_rw_lock_history[cfil_rw_nxt_lck] = lr_saved;
cfil_rw_nxt_lck = (cfil_rw_nxt_lck + 1) % CFIL_RW_LCK_MAX;
}
static void
cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive)
{
#if !MACH_ASSERT
#pragma unused(lck, exclusive)
#endif
LCK_RW_ASSERT(lck,
exclusive ? LCK_RW_ASSERT_EXCLUSIVE : LCK_RW_ASSERT_HELD);
}
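/*
 * For illustration: the upgrade pattern the wrappers above support. Note
 * that lck_rw_lock_shared_to_exclusive() drops the lock entirely when the
 * upgrade fails, so the caller must retake it exclusively and revalidate
 * whatever it read under the shared lock. A sketch, not called anywhere:
 */
static inline void
cfil_rw_upgrade_example(void)
{
	cfil_rw_lock_shared(&cfil_lck_rw);
	/* ... read-side work under the shared lock ... */
	if (!cfil_rw_lock_shared_to_exclusive(&cfil_lck_rw)) {
		/* Upgrade failed: the shared lock was dropped, retake it */
		cfil_rw_lock_exclusive(&cfil_lck_rw);
		/* ... revalidate state read under the shared lock ... */
	}
	/* ... write-side work under the exclusive lock ... */
	cfil_rw_unlock_exclusive(&cfil_lck_rw);
}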
/*
* Return the number of bytes in the mbuf chain using the same
* method as m_length() or sballoc()
*/
static unsigned int
cfil_data_length(struct mbuf *m, int *retmbcnt)
{
struct mbuf *m0;
unsigned int pktlen;
int mbcnt;
if (retmbcnt == NULL)
return (m_length(m));
pktlen = 0;
mbcnt = 0;
for (m0 = m; m0 != NULL; m0 = m0->m_next) {
pktlen += m0->m_len;
mbcnt += MSIZE;
if (m0->m_flags & M_EXT)
mbcnt += m0->m_ext.ext_size;
}
*retmbcnt = mbcnt;
return (pktlen);
}
/*
* Common mbuf queue utilities
*/
static inline void
cfil_queue_init(struct cfil_queue *cfq)
{
cfq->q_start = 0;
cfq->q_end = 0;
MBUFQ_INIT(&cfq->q_mq);
}
static inline uint64_t
cfil_queue_drain(struct cfil_queue *cfq)
{
uint64_t drained = cfq->q_end - cfq->q_start; /* bytes in queue: end - start */
cfq->q_start = 0;
cfq->q_end = 0;
MBUFQ_DRAIN(&cfq->q_mq);
return (drained);
}
/* Return 1 when empty, 0 otherwise */
static inline int
cfil_queue_empty(struct cfil_queue *cfq)
{
return (MBUFQ_EMPTY(&cfq->q_mq));
}
static inline uint64_t
cfil_queue_offset_first(struct cfil_queue *cfq)
{
return (cfq->q_start);
}
static inline uint64_t
cfil_queue_offset_last(struct cfil_queue *cfq)
{
return (cfq->q_end);
}
static inline uint64_t
cfil_queue_len(struct cfil_queue *cfq)
{
return (cfq->q_end - cfq->q_start);
}
/*
* Routines to verify some fundamental assumptions
*/
static void
cfil_queue_verify(struct cfil_queue *cfq)
{
mbuf_t m;
mbuf_t n;
uint64_t queuesize = 0;
/* Verify offsets are ordered */
VERIFY(cfq->q_start <= cfq->q_end);
/*
* When the queue is empty the offsets are equal; otherwise they
* differ
*/
VERIFY((MBUFQ_EMPTY(&cfq->q_mq) && cfq->q_start == cfq->q_end) ||
(!MBUFQ_EMPTY(&cfq->q_mq) &&
cfq->q_start != cfq->q_end));
MBUFQ_FOREACH(m, &cfq->q_mq) {
size_t chainsize = 0;
unsigned int mlen = m_length(m);
if (m == (void *)M_TAG_FREE_PATTERN ||
m->m_next == (void *)M_TAG_FREE_PATTERN ||
m->m_nextpkt == (void *)M_TAG_FREE_PATTERN)
panic("%s - mq %p is free at %p", __func__,
&cfq->q_mq, m);
for (n = m; n != NULL; n = n->m_next) {
if (n->m_type != MT_DATA &&
n->m_type != MT_HEADER &&
n->m_type != MT_OOBDATA)
panic("%s - %p unsupported type %u", __func__,
n, n->m_type);
chainsize += n->m_len;
}
if (mlen != chainsize)
panic("%s - %p m_length() %u != chainsize %lu",
__func__, m, mlen, chainsize);
queuesize += chainsize;
}
if (queuesize != cfq->q_end - cfq->q_start)
panic("%s - %p queuesize %llu != offsetdiffs %llu", __func__,
&cfq->q_mq, queuesize, cfq->q_end - cfq->q_start);
}
static void
cfil_queue_enqueue(struct cfil_queue *cfq, mbuf_t m, size_t len)
{
CFIL_QUEUE_VERIFY(cfq);
MBUFQ_ENQUEUE(&cfq->q_mq, m);
cfq->q_end += len;
CFIL_QUEUE_VERIFY(cfq);
}
static void
cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len)
{
CFIL_QUEUE_VERIFY(cfq);
VERIFY(m_length(m) == len);
MBUFQ_REMOVE(&cfq->q_mq, m);
MBUFQ_NEXT(m) = NULL;
cfq->q_start += len;
CFIL_QUEUE_VERIFY(cfq);
}
static mbuf_t
cfil_queue_first(struct cfil_queue *cfq)
{
return (MBUFQ_FIRST(&cfq->q_mq));
}
static mbuf_t
cfil_queue_next(struct cfil_queue *cfq, mbuf_t m)
{
#pragma unused(cfq)
return (MBUFQ_NEXT(m));
}
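/*
 * For illustration: walking a content filter queue with the accessors
 * above. This sketch recomputes the queue length from the mbuf chains and
 * should always agree with cfil_queue_len() when the queue invariants
 * hold. Not called anywhere.
 */
static inline uint64_t
cfil_queue_count_bytes(struct cfil_queue *cfq)
{
	mbuf_t m;
	uint64_t total = 0;

	for (m = cfil_queue_first(cfq); m != NULL;
	    m = cfil_queue_next(cfq, m))
		total += m_length(m);
	return (total);
}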
static void
cfil_entry_buf_verify(struct cfe_buf *cfe_buf)
{
CFIL_QUEUE_VERIFY(&cfe_buf->cfe_ctl_q);
CFIL_QUEUE_VERIFY(&cfe_buf->cfe_pending_q);
/* Verify the queues are ordered so that pending is before ctl */
VERIFY(cfe_buf->cfe_ctl_q.q_start >= cfe_buf->cfe_pending_q.q_end);
/* The peek offset cannot be less than the pass offset */
VERIFY(cfe_buf->cfe_peek_offset >= cfe_buf->cfe_pass_offset);
/* Make sure we've updated the offset we peeked at */
VERIFY(cfe_buf->cfe_ctl_q.q_start <= cfe_buf->cfe_peeked);
}
static void
cfil_entry_verify(struct cfil_entry *entry)
{
cfil_entry_buf_verify(&entry->cfe_snd);
cfil_entry_buf_verify(&entry->cfe_rcv);
}
static void
cfil_info_buf_verify(struct cfi_buf *cfi_buf)
{
CFIL_QUEUE_VERIFY(&cfi_buf->cfi_inject_q);
VERIFY(cfi_buf->cfi_pending_first <= cfi_buf->cfi_pending_last);
VERIFY(cfi_buf->cfi_pending_mbcnt >= 0);
}
static void
cfil_info_verify(struct cfil_info *cfil_info)
{
int i;
if (cfil_info == NULL)
return;
cfil_info_buf_verify(&cfil_info->cfi_snd);
cfil_info_buf_verify(&cfil_info->cfi_rcv);
for (i = 0; i < MAX_CONTENT_FILTER; i++)
cfil_entry_verify(&cfil_info->cfi_entries[i]);
}
static void
verify_content_filter(struct content_filter *cfc)
{
struct cfil_entry *entry;
uint32_t count = 0;
VERIFY(cfc->cf_sock_count >= 0);
TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
count++;
VERIFY(cfc == entry->cfe_filter);
}
VERIFY(count == cfc->cf_sock_count);
}
/*
* Kernel control socket callbacks
*/
static errno_t
cfil_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
void **unitinfo)
{
errno_t error = 0;
struct content_filter *cfc = NULL;
CFIL_LOG(LOG_NOTICE, "");
cfc = zalloc(content_filter_zone);
if (cfc == NULL) {
CFIL_LOG(LOG_ERR, "zalloc failed");
error = ENOMEM;
goto done;
}
bzero(cfc, sizeof(struct content_filter));
cfil_rw_lock_exclusive(&cfil_lck_rw);
if (content_filters == NULL) {
struct content_filter **tmp;
cfil_rw_unlock_exclusive(&cfil_lck_rw);
MALLOC(tmp,
struct content_filter **,
MAX_CONTENT_FILTER * sizeof(struct content_filter *),
M_TEMP,
M_WAITOK | M_ZERO);
cfil_rw_lock_exclusive(&cfil_lck_rw);
if (tmp == NULL && content_filters == NULL) {
error = ENOMEM;
cfil_rw_unlock_exclusive(&cfil_lck_rw);
goto done;
}
/* Another thread may have won the race */
if (content_filters != NULL)
FREE(tmp, M_TEMP);
else
content_filters = tmp;
}
if (sac->sc_unit == 0 || sac->sc_unit > MAX_CONTENT_FILTER) {
CFIL_LOG(LOG_ERR, "bad sc_unit %u", sac->sc_unit);
error = EINVAL;
} else if (content_filters[sac->sc_unit - 1] != NULL) {
CFIL_LOG(LOG_ERR, "sc_unit %u in use", sac->sc_unit);
error = EADDRINUSE;
} else {
/*
* kernel control socket kcunit numbers start at 1
*/
content_filters[sac->sc_unit - 1] = cfc;
cfc->cf_kcref = kctlref;
cfc->cf_kcunit = sac->sc_unit;
TAILQ_INIT(&cfc->cf_sock_entries);
*unitinfo = cfc;
cfil_active_count++;
}
cfil_rw_unlock_exclusive(&cfil_lck_rw);
done:
if (error != 0 && cfc != NULL)
zfree(content_filter_zone, cfc);
if (error == 0)
OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_ok);
else
OSIncrementAtomic(&cfil_stats.cfs_ctl_connect_fail);
CFIL_LOG(LOG_INFO, "return %d cfil_active_count %u kcunit %u",
error, cfil_active_count, sac->sc_unit);
return (error);
}
static errno_t
cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
{
#pragma unused(kctlref)
errno_t error = 0;
struct content_filter *cfc;
struct cfil_entry *entry;
CFIL_LOG(LOG_NOTICE, "");
if (content_filters == NULL) {
CFIL_LOG(LOG_ERR, "no content filter");
error = EINVAL;
goto done;
}
if (kcunit > MAX_CONTENT_FILTER) {
CFIL_LOG(LOG_ERR, "kcunit %u > MAX_CONTENT_FILTER (%d)",
kcunit, MAX_CONTENT_FILTER);
error = EINVAL;
goto done;
}