Browse Source

Introduce seccomp-assisted syscall filtering

With this patch, strace can rely on seccomp to only be stopped at syscalls
of interest, instead of stopping at all syscalls.  The seccomp filtering
of syscalls is opt-in only; it must be enabled with the --seccomp-bpf
option.  Kernel support is first checked with check_seccomp_filter(),
which also ensures the BPF program derived from the syscalls to filter
is not larger than the kernel's limit.

The --seccomp-bpf option implies -f, but a warning is emitted if -f is not
explicitly specified.  Since a task's children inherit its seccomp
filters, we want to ensure all children are also traced to avoid their
syscalls failing with ENOSYS (cf. SECCOMP_RET_TRACE in seccomp man page).

Fork/vfork/clone children of traced processes are marked as not having a
seccomp filter until we receive a first seccomp-stop.  They are therefore
stopped at every syscall entry and exit until that first seccomp-stop.

The current BPF program implements a simple linear match of the syscall
numbers.  Contiguous sequences of syscall numbers are however matched as
an interval, with two instructions only.  The algorithm can be improved
or replaced in the future without impacting user-observed behavior.

The behavior of SECCOMP_RET_TRACE changed between Linux 4.7 and 4.8
(cf. PTRACE_EVENT_SECCOMP in ptrace man page).  This patch supports both
behaviors by checking the kernel's actual behavior before installing the
seccomp filter.

* filter_seccomp.c: New file.
* filter_seccomp.h: New file.
* Makefile.am (strace_SOURCES): Add filter_seccomp.c and
filter_seccomp.h.
* linux/aarch64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Define for aarch64.
* linux/powerpc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for powerpc64.
* linux/s390x/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for s390x.
* linux/sparc64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for sparc64.
* linux/tile/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for tile.
* linux/x32/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH): Likewise for x32.
* linux/x86_64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH,
PERSONALITY1_AUDIT_ARCH, PERSONALITY2_AUDIT_ARCH): Likewise for x86_64.
* linux/ia64/arch_defs_.h (PERSONALITY0_AUDIT_ARCH): Likewise for IA64.
* strace.c (usage): Document --seccomp-bpf option.
(startup_child): Mark process as having seccomp filter.
(exec_or_die): Initialize seccomp filtering if requested.
(init): Handle --seccomp-bpf option and check that seccomp can be
enabled.
(print_debug_info): Handle PTRACE_EVENT_SECCOMP.
(next_event): Capture PTRACE_EVENT_SECCOMP event.
(dispatch_event): Handle PTRACE_EVENT_SECCOMP event.
* trace_event.h (trace_event): New enumeration entity.
* strace.1.in: Document new --seccomp-bpf option.
* NEWS: Mention this change.

Co-authored-by: Paul Chaignon <paul.chaignon@gmail.com>
Co-authored-by: Dmitry V. Levin <ldv@altlinux.org>
Chen Jingpiao 1 year ago
parent
commit
5d64f3d7fa

+ 2
- 0
Makefile.am View File

@@ -129,6 +129,8 @@ strace_SOURCES =	\
129 129
 	file_ioctl.c	\
130 130
 	filter.h	\
131 131
 	filter_qualify.c \
132
+	filter_seccomp.c \
133
+	filter_seccomp.h \
132 134
 	flock.c		\
133 135
 	flock.h		\
134 136
 	fs_x_ioctl.c	\

+ 2
- 0
NEWS View File

@@ -2,6 +2,8 @@ Noteworthy changes in release ?.? (????-??-??)
2 2
 ==============================================
3 3
 
4 4
 * Improvements
5
+  * Implemented usage of seccomp-bpf for stopping tracees only for filtered
6
+    syscalls.  Use --seccomp-bpf option to enable.
5 7
   * Implemented decoding of pidfd_open and clone3 syscalls.
6 8
   * Enhanced decoding of NETLINK_ROUTE protocol.
7 9
   * Implemented decoding of UNIX_DIAG_UID netlink attribute.

+ 4
- 0
defs.h View File

@@ -332,6 +332,9 @@ struct tcb {
332 332
 # define TCB_DELAYED	0x2000	/* Current syscall has been delayed */
333 333
 # define TCB_TAMPERED_NO_FAIL 0x4000	/* We tamper tcb with syscall
334 334
 					   that should not fail. */
335
+# define TCB_SECCOMP_FILTER	0x8000	/* This process has a seccomp filter
336
+					 * attached.
337
+					 */
335 338
 
336 339
 /* qualifier flags */
337 340
 # define QUAL_TRACE	0x001	/* this system call should be traced */
@@ -358,6 +361,7 @@ struct tcb {
358 361
 # define inject_delay_exit(tcp)	((tcp)->flags & TCB_INJECT_DELAY_EXIT)
359 362
 # define syscall_delayed(tcp)	((tcp)->flags & TCB_DELAYED)
360 363
 # define syscall_tampered_nofail(tcp) ((tcp)->flags & TCB_TAMPERED_NO_FAIL)
364
+# define has_seccomp_filter(tcp)	((tcp)->flags & TCB_SECCOMP_FILTER)
361 365
 
362 366
 extern const struct_sysent stub_sysent;
363 367
 # define tcp_sysent(tcp) (tcp->s_ent ?: &stub_sysent)

+ 617
- 0
filter_seccomp.c View File

@@ -0,0 +1,617 @@
1
+/*
2
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4
+ * Copyright (c) 2019 The strace developers.
5
+ * All rights reserved.
6
+ *
7
+ * SPDX-License-Identifier: LGPL-2.1-or-later
8
+ */
9
+
10
+#include "defs.h"
11
+
12
+#include "ptrace.h"
13
+#include <signal.h>
14
+#include <sys/prctl.h>
15
+#include <sys/wait.h>
16
+#include <linux/audit.h>
17
+#include <linux/filter.h>
18
+
19
+#include "filter_seccomp.h"
20
+#include "number_set.h"
21
+#include "syscall.h"
22
+#include "scno.h"
23
+
24
+bool seccomp_filtering;
25
+bool seccomp_before_sysentry;
26
+
27
+#ifdef HAVE_LINUX_SECCOMP_H
28
+
29
+# include <linux/seccomp.h>
30
+
31
+# ifndef BPF_MAXINSNS
32
+#  define BPF_MAXINSNS 4096
33
+# endif
34
+
35
+# define JMP_PLACEHOLDER_NEXT  ((unsigned char) -1)
36
+# define JMP_PLACEHOLDER_TRACE ((unsigned char) -2)
37
+
38
+# define SET_BPF(filter, code, jt, jf, k) \
39
+	(*(filter) = (struct sock_filter) { code, jt, jf, k })
40
+
41
+# define SET_BPF_STMT(filter, code, k) \
42
+	SET_BPF(filter, code, 0, 0, k)
43
+
44
+# define SET_BPF_JUMP(filter, code, k, jt, jf) \
45
+	SET_BPF(filter, BPF_JMP | code, jt, jf, k)
46
+
47
+struct audit_arch_t {
48
+	unsigned int arch;
49
+	unsigned int flag;
50
+};
51
+
52
+static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
53
+# if SUPPORTED_PERSONALITIES > 1
54
+	PERSONALITY0_AUDIT_ARCH,
55
+	PERSONALITY1_AUDIT_ARCH,
56
+#  if SUPPORTED_PERSONALITIES > 2
57
+	PERSONALITY2_AUDIT_ARCH,
58
+#  endif
59
+# endif
60
+};
61
+
62
+# ifdef ENABLE_COVERAGE_GCOV
63
+extern void __gcov_flush(void);
64
+# endif
65
+
66
+static void ATTRIBUTE_NORETURN
67
+check_seccomp_order_do_child(void)
68
+{
69
+	static const struct sock_filter filter[] = {
70
+		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
71
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
72
+			 offsetof(struct seccomp_data, nr)),
73
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
74
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
75
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
76
+	};
77
+	static const struct sock_fprog prog = {
78
+		.len = ARRAY_SIZE(filter),
79
+		.filter = (struct sock_filter *) filter
80
+	};
81
+
82
+	/* Get everything ready before PTRACE_TRACEME.  */
83
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
84
+		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS, 1");
85
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
86
+		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");
87
+	int pid = getpid();
88
+
89
+	if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
90
+		/* Exit with a nonzero exit status.  */
91
+		perror_func_msg_and_die("PTRACE_TRACEME");
92
+	}
93
+
94
+# ifdef ENABLE_COVERAGE_GCOV
95
+	__gcov_flush();
96
+# endif
97
+
98
+	kill(pid, SIGSTOP);
99
+	syscall(__NR_gettid);
100
+	_exit(0);
101
+}
102
+
103
+static int
104
+check_seccomp_order_tracer(int pid)
105
+{
106
+	unsigned int step;
107
+
108
+	for (step = 0; ; ++step) {
109
+		int status;
110
+
111
+		for (;;) {
112
+			long rc = waitpid(pid, &status, 0);
113
+			if (rc < 0 && errno == EINTR)
114
+				continue;
115
+			if (rc == pid)
116
+				break;
117
+			/* Cannot happen.  */
118
+			perror_func_msg("#%d: unexpected wait result %ld",
119
+					step, rc);
120
+			return pid;
121
+		}
122
+
123
+		if (WIFEXITED(status)) {
124
+			/* The tracee is no more.  */
125
+			pid = 0;
126
+
127
+			int exitstatus = WEXITSTATUS(status);
128
+			if (step == 5 && exitstatus == 0) {
129
+				seccomp_filtering = true;
130
+			} else {
131
+				error_func_msg("#%d: unexpected exit status %u",
132
+					       step, exitstatus);
133
+			}
134
+			break;
135
+		}
136
+
137
+		if (WIFSIGNALED(status)) {
138
+			/* The tracee is no more.  */
139
+			pid = 0;
140
+
141
+			error_func_msg("#%d: unexpected signal %u",
142
+				       step, WTERMSIG(status));
143
+			break;
144
+		}
145
+
146
+		if (!WIFSTOPPED(status)) {
147
+			/* Cannot happen.  */
148
+			error_func_msg("#%d: unexpected wait status %#x",
149
+				       step, status);
150
+			break;
151
+		}
152
+
153
+		unsigned int event = (unsigned int) status >> 16;
154
+
155
+		switch (WSTOPSIG(status)) {
156
+		case SIGSTOP:
157
+			if (step != 0) {
158
+				error_func_msg("#%d: unexpected signal stop",
159
+					       step);
160
+				return pid;
161
+			}
162
+			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
163
+				   PTRACE_O_TRACESYSGOOD|
164
+				   PTRACE_O_TRACESECCOMP) < 0) {
165
+				perror_func_msg("PTRACE_SETOPTIONS");
166
+				return pid;
167
+			}
168
+			break;
169
+
170
+		case SIGTRAP:
171
+			if (event != PTRACE_EVENT_SECCOMP) {
172
+				error_func_msg("#%d: unexpected trap %#x",
173
+					       step, event);
174
+				return pid;
175
+			}
176
+
177
+			switch (step) {
178
+			case 1: /* Seccomp stop before entering gettid.  */
179
+				seccomp_before_sysentry = true;
180
+				break;
181
+			case 2: /* Seccomp stop after entering gettid.  */
182
+				if (!seccomp_before_sysentry)
183
+					break;
184
+				ATTRIBUTE_FALLTHROUGH;
185
+			default:
186
+				error_func_msg("#%d: unexpected seccomp stop",
187
+					       step);
188
+				return pid;
189
+			}
190
+			break;
191
+
192
+		case SIGTRAP | 0x80:
193
+			switch (step) {
194
+			case 3: /* Exiting gettid.  */
195
+			case 4: /* Entering exit_group.  */
196
+				break;
197
+			case 1: /* Entering gettid before seccomp stop.  */
198
+				seccomp_before_sysentry = false;
199
+				break;
200
+			case 2: /* Entering gettid after seccomp stop.  */
201
+				if (seccomp_before_sysentry)
202
+					break;
203
+				ATTRIBUTE_FALLTHROUGH;
204
+			default:
205
+				error_func_msg("#%d: unexpected syscall stop",
206
+					       step);
207
+				return pid;
208
+			}
209
+			break;
210
+
211
+		default:
212
+			error_func_msg("#%d: unexpected stop signal %#x",
213
+				       step, WSTOPSIG(status));
214
+			return pid;
215
+		}
216
+
217
+		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
218
+			/* Cannot happen.  */
219
+			perror_func_msg("#%d: PTRACE_SYSCALL", step);
220
+			break;
221
+		}
222
+	}
223
+
224
+	return pid;
225
+}
226
+
227
+static void
228
+check_seccomp_order(void)
229
+{
230
+	seccomp_filtering = false;
231
+
232
+	int pid = fork();
233
+	if (pid < 0) {
234
+		perror_func_msg("fork");
235
+		return;
236
+	}
237
+
238
+	if (pid == 0)
239
+		check_seccomp_order_do_child();
240
+
241
+	pid = check_seccomp_order_tracer(pid);
242
+	if (pid) {
243
+		kill(pid, SIGKILL);
244
+		for (;;) {
245
+			long rc = waitpid(pid, NULL, 0);
246
+			if (rc < 0 && errno == EINTR)
247
+				continue;
248
+			break;
249
+		}
250
+	}
251
+}
252
+
253
+static bool
254
+traced_by_seccomp(unsigned int scno, unsigned int p)
255
+{
256
+	if (is_number_in_set_array(scno, trace_set, p)
257
+	    || sysent_vec[p][scno].sys_flags
258
+	    & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
259
+		return true;
260
+	return false;
261
+}
262
+
263
+static void
264
+check_bpf_program_size(void)
265
+{
266
+	unsigned int nb_insns = SUPPORTED_PERSONALITIES > 1 ? 1 : 0;
267
+
268
+	/*
269
+	 * Implements a simplified form of init_sock_filter()'s bytecode
270
+	 * generation algorithm, to count the number of instructions that will
271
+	 * be generated.
272
+	 */
273
+	for (int p = SUPPORTED_PERSONALITIES - 1;
274
+	     p >= 0 && nb_insns < BPF_MAXINSNS; --p) {
275
+		unsigned int nb_insns_personality = 0;
276
+		unsigned int lower = UINT_MAX;
277
+
278
+		nb_insns_personality++;
279
+# if SUPPORTED_PERSONALITIES > 1
280
+		nb_insns_personality++;
281
+		if (audit_arch_vec[p].flag)
282
+			nb_insns_personality += 3;
283
+# endif
284
+
285
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
286
+			if (traced_by_seccomp(i, p)) {
287
+				if (lower == UINT_MAX)
288
+					lower = i;
289
+				continue;
290
+			}
291
+			if (lower == UINT_MAX)
292
+				continue;
293
+			if (lower + 1 == i)
294
+				nb_insns_personality++;
295
+			else
296
+				nb_insns_personality += 2;
297
+			lower = UINT_MAX;
298
+		}
299
+		if (lower != UINT_MAX) {
300
+			if (lower + 1 == nsyscall_vec[p])
301
+				nb_insns_personality++;
302
+			else
303
+				nb_insns_personality += 2;
304
+		}
305
+
306
+		nb_insns_personality += 3;
307
+
308
+		/*
309
+		 * Within generated BPF programs, the origin and destination of
310
+		 * jumps are always in the same personality section.  The
311
+		 * largest jump is therefore the jump from the first
312
+		 * instruction of the section to the last, to skip the
313
+		 * personality and try to compare .arch to the next
314
+		 * personality.
315
+		 * If we have a personality section with more than 255
316
+		 * instructions, the jump offset will overflow.  Such program
317
+		 * is unlikely to happen, so we simply disable seccomp filter
318
+		 * is such a case.
319
+		 */
320
+		if (nb_insns_personality > UCHAR_MAX) {
321
+			debug_msg("seccomp filter disabled due to "
322
+				  "possibility of overflow");
323
+			seccomp_filtering = false;
324
+			return;
325
+		}
326
+		nb_insns += nb_insns_personality;
327
+	}
328
+
329
+# if SUPPORTED_PERSONALITIES > 1
330
+	nb_insns++;
331
+# endif
332
+
333
+	if (nb_insns > BPF_MAXINSNS) {
334
+		debug_msg("seccomp filter disabled due to BPF program being "
335
+			  "oversized (%u > %d)", nb_insns, BPF_MAXINSNS);
336
+		seccomp_filtering = false;
337
+	}
338
+}
339
+
340
+static void
341
+check_seccomp_filter_properties(void)
342
+{
343
+	if (NOMMU_SYSTEM) {
344
+		seccomp_filtering = false;
345
+		return;
346
+	}
347
+
348
+	int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
349
+	seccomp_filtering = rc < 0 && errno != EINVAL;
350
+	if (!seccomp_filtering)
351
+		debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
352
+
353
+	if (seccomp_filtering)
354
+		check_bpf_program_size();
355
+	if (seccomp_filtering)
356
+		check_seccomp_order();
357
+}
358
+
359
+static void
360
+dump_seccomp_bpf(const struct sock_filter *filter, unsigned short len)
361
+{
362
+	for (unsigned int i = 0; i < len; ++i) {
363
+		switch (filter[i].code) {
364
+		case BPF_LD | BPF_W | BPF_ABS:
365
+			switch (filter[i].k) {
366
+			case offsetof(struct seccomp_data, arch):
367
+				error_msg("STMT(BPF_LDWABS, data->arch)");
368
+				break;
369
+			case offsetof(struct seccomp_data, nr):
370
+				error_msg("STMT(BPF_LDWABS, data->nr)");
371
+				break;
372
+			default:
373
+				error_msg("STMT(BPF_LDWABS, 0x%x)",
374
+					  filter[i].k);
375
+			}
376
+			break;
377
+		case BPF_RET | BPF_K:
378
+			switch (filter[i].k) {
379
+			case SECCOMP_RET_TRACE:
380
+				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
381
+				break;
382
+			case SECCOMP_RET_ALLOW:
383
+				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
384
+				break;
385
+			default:
386
+				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
387
+			}
388
+			break;
389
+		case BPF_JMP | BPF_JEQ | BPF_K:
390
+			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
391
+				  filter[i].jt, filter[i].jf,
392
+				  filter[i].k);
393
+			break;
394
+		case BPF_JMP | BPF_JGE | BPF_K:
395
+			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
396
+				  filter[i].jt, filter[i].jf,
397
+				  filter[i].k);
398
+			break;
399
+		case BPF_JMP | BPF_JA:
400
+			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
401
+			break;
402
+		default:
403
+			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
404
+				  filter[i].jt, filter[i].jf, filter[i].k);
405
+		}
406
+	}
407
+}
408
+
409
+static void
410
+replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
411
+			 unsigned char jmp_trace)
412
+{
413
+	switch (*jmp_offset) {
414
+	case JMP_PLACEHOLDER_NEXT:
415
+		*jmp_offset = jmp_next;
416
+		break;
417
+	case JMP_PLACEHOLDER_TRACE:
418
+		*jmp_offset = jmp_trace;
419
+		break;
420
+	default:
421
+		break;
422
+	}
423
+}
424
+
425
+static unsigned short
426
+bpf_syscalls_cmp(struct sock_filter *filter,
427
+		 unsigned int lower, unsigned int upper)
428
+{
429
+	if (lower + 1 == upper) {
430
+		/* if (nr == lower) return RET_TRACE; */
431
+		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
432
+			     JMP_PLACEHOLDER_TRACE, 0);
433
+		return 1;
434
+	} else {
435
+		/* if (nr >= lower && nr < upper) return RET_TRACE; */
436
+		SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
437
+		SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
438
+			     JMP_PLACEHOLDER_TRACE);
439
+		return 2;
440
+	}
441
+}
442
+
443
+static unsigned short
444
+init_sock_filter(struct sock_filter *filter)
445
+{
446
+	/*
447
+	 * Generated program looks like:
448
+	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
449
+	 *	if (nr == 59)
450
+	 *		return SECCOMP_RET_TRACE;
451
+	 *	if (nr >= 321 && nr <= 323)
452
+	 *		return SECCOMP_RET_TRACE;
453
+	 *	...
454
+	 *	return SECCOMP_RET_ALLOW;
455
+	 * }
456
+	 * if (arch == AUDIT_ARCH_A) {
457
+	 *	...
458
+	 * }
459
+	 * if (arch == AUDIT_ARCH_B) {
460
+	 *	...
461
+	 * }
462
+	 * return SECCOMP_RET_TRACE;
463
+	 */
464
+	unsigned short pos = 0;
465
+
466
+# if SUPPORTED_PERSONALITIES > 1
467
+	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
468
+		     offsetof(struct seccomp_data, arch));
469
+# endif
470
+
471
+	/*
472
+	 * Personalities are iterated in reverse-order in the BPF program so
473
+	 * that the x86 case is naturally handled.  On x86, the first and third
474
+	 * personalities have the same arch identifier.  The third can be
475
+	 * distinguished based on its associated syscall flag, so we check it
476
+	 * first.  The only drawback here is that the first personality is more
477
+	 * common, which may make the BPF program slower to match syscalls on
478
+	 * average.
479
+	 */
480
+	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
481
+		unsigned int lower = UINT_MAX;
482
+		unsigned short start = pos, end;
483
+
484
+# if SUPPORTED_PERSONALITIES > 1
485
+		/* if (arch != audit_arch_vec[p].arch) goto next; */
486
+		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
487
+			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
488
+# endif
489
+		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
490
+			     offsetof(struct seccomp_data, nr));
491
+
492
+# if SUPPORTED_PERSONALITIES > 1
493
+		if (audit_arch_vec[p].flag) {
494
+			/* if (nr < audit_arch_vec[p].flag) goto next; */
495
+			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
496
+				     audit_arch_vec[p].flag, 2, 0);
497
+			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
498
+				     offsetof(struct seccomp_data, arch));
499
+			SET_BPF_JUMP(&filter[pos++], BPF_JA,
500
+				     JMP_PLACEHOLDER_NEXT, 0, 0);
501
+		}
502
+# endif
503
+
504
+		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
505
+			if (traced_by_seccomp(i, p)) {
506
+				if (lower == UINT_MAX)
507
+					lower = i;
508
+				continue;
509
+			}
510
+			if (lower == UINT_MAX)
511
+				continue;
512
+			pos += bpf_syscalls_cmp(filter + pos,
513
+						lower | audit_arch_vec[p].flag,
514
+						i | audit_arch_vec[p].flag);
515
+			lower = UINT_MAX;
516
+		}
517
+		if (lower != UINT_MAX)
518
+			pos += bpf_syscalls_cmp(filter + pos,
519
+						lower | audit_arch_vec[p].flag,
520
+						nsyscall_vec[p]
521
+						| audit_arch_vec[p].flag);
522
+		end = pos;
523
+
524
+		/* if (nr >= max_nr) return RET_TRACE; */
525
+		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
526
+			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);
527
+
528
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
529
+			     SECCOMP_RET_ALLOW);
530
+		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
531
+			     SECCOMP_RET_TRACE);
532
+
533
+		for (unsigned int i = start; i < end; ++i) {
534
+			if (BPF_CLASS(filter[i].code) != BPF_JMP)
535
+				continue;
536
+			unsigned char jmp_next = pos - i - 1;
537
+			unsigned char jmp_trace = pos - i - 2;
538
+			replace_jmp_placeholders(&filter[i].jt, jmp_next,
539
+						 jmp_trace);
540
+			replace_jmp_placeholders(&filter[i].jf, jmp_next,
541
+						 jmp_trace);
542
+			if (BPF_OP(filter[i].code) == BPF_JA)
543
+				filter[i].k = (unsigned int) jmp_next;
544
+		}
545
+	}
546
+
547
+# if SUPPORTED_PERSONALITIES > 1
548
+	/* Jumps conditioned on .arch default to this RET_TRACE. */
549
+	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
550
+# endif
551
+
552
+	if (debug_flag)
553
+		dump_seccomp_bpf(filter, pos);
554
+
555
+	return pos;
556
+}
557
+
558
+void
559
+init_seccomp_filter(void)
560
+{
561
+	struct sock_filter filter[BPF_MAXINSNS];
562
+	unsigned short len;
563
+
564
+	len = init_sock_filter(filter);
565
+
566
+	struct sock_fprog prog = {
567
+		.len = len,
568
+		.filter = filter
569
+	};
570
+
571
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
572
+		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
573
+
574
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
575
+		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
576
+}
577
+
578
+int
579
+seccomp_filter_restart_operator(const struct tcb *tcp)
580
+{
581
+	if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
582
+	    && traced_by_seccomp(tcp->scno, current_personality))
583
+		return PTRACE_SYSCALL;
584
+	return PTRACE_CONT;
585
+}
586
+
587
+#else /* !HAVE_LINUX_SECCOMP_H */
588
+
589
+# warning <linux/seccomp.h> is not available, seccomp filtering is not supported
590
+
591
+static void
592
+check_seccomp_filter_properties(void)
593
+{
594
+	seccomp_filtering = false;
595
+}
596
+
597
+void
598
+init_seccomp_filter(void)
599
+{
600
+}
601
+
602
+int
603
+seccomp_filter_restart_operator(const struct tcb *tcp)
604
+{
605
+	return PTRACE_SYSCALL;
606
+}
607
+
608
+#endif
609
+
610
+void
611
+check_seccomp_filter(void)
612
+{
613
+	check_seccomp_filter_properties();
614
+
615
+	if (!seccomp_filtering)
616
+		error_msg("seccomp filter is requested but unavailable");
617
+}

+ 21
- 0
filter_seccomp.h View File

@@ -0,0 +1,21 @@
1
+/*
2
+ * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
3
+ * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
4
+ * All rights reserved.
5
+ *
6
+ * SPDX-License-Identifier: LGPL-2.1-or-later
7
+ */
8
+
9
+#ifndef STRACE_SECCOMP_FILTER_H
10
+# define STRACE_SECCOMP_FILTER_H
11
+
12
+# include "defs.h"
13
+
14
+extern bool seccomp_filtering;
15
+extern bool seccomp_before_sysentry;
16
+
17
+extern void check_seccomp_filter(void);
18
+extern void init_seccomp_filter(void);
19
+extern int seccomp_filter_restart_operator(const struct tcb *);
20
+
21
+#endif /* !STRACE_SECCOMP_FILTER_H */

+ 2
- 0
linux/aarch64/arch_defs_.h View File

@@ -9,3 +9,5 @@
9 9
 #define HAVE_ARCH_OLD_SELECT 1
10 10
 #define HAVE_ARCH_UID16_SYSCALLS 1
11 11
 #define SUPPORTED_PERSONALITIES 2
12
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_AARCH64, 0 }
13
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_ARM,     0 }

+ 1
- 0
linux/ia64/arch_defs_.h View File

@@ -9,3 +9,4 @@
9 9
 #define HAVE_ARCH_UID16_SYSCALLS 1
10 10
 #define HAVE_ARCH_SA_RESTORER 0
11 11
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
12
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_IA64, SYSCALLENT_BASE_NR }

+ 2
- 0
linux/powerpc64/arch_defs_.h View File

@@ -8,3 +8,5 @@
8 8
 #define HAVE_ARCH_OLD_SELECT 1
9 9
 #define SUPPORTED_PERSONALITIES 2
10 10
 #define HAVE_ARCH_DEDICATED_ERR_REG 1
11
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_PPC64, 0 }
12
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_PPC,   0 }

+ 2
- 0
linux/s390x/arch_defs_.h View File

@@ -9,3 +9,5 @@
9 9
 #define HAVE_ARCH_OLD_MMAP_PGOFF 1
10 10
 #define HAVE_ARCH_UID16_SYSCALLS 1
11 11
 #define SUPPORTED_PERSONALITIES 2
12
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_S390X, 0 }
13
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_S390,  0 }

+ 2
- 0
linux/sparc64/arch_defs_.h View File

@@ -9,4 +9,6 @@
9 9
 #define HAVE_ARCH_UID16_SYSCALLS 1
10 10
 #define HAVE_ARCH_SA_RESTORER 1
11 11
 #define SUPPORTED_PERSONALITIES 2
12
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_SPARC64, 0 }
13
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_SPARC,   0 }
12 14
 #define HAVE_ARCH_DEDICATED_ERR_REG 1

+ 2
- 0
linux/tile/arch_defs_.h View File

@@ -6,6 +6,8 @@
6 6
  */
7 7
 
8 8
 #define SUPPORTED_PERSONALITIES 2
9
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_TILEGX,   0 }
10
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_TILEGX32, 0 }
9 11
 #define CAN_ARCH_BE_COMPAT_ON_64BIT_KERNEL 1
10 12
 
11 13
 #ifdef __tilepro__

+ 2
- 0
linux/x32/arch_defs_.h View File

@@ -11,3 +11,5 @@
11 11
 #define HAVE_ARCH_UID16_SYSCALLS 1
12 12
 #define HAVE_ARCH_OLD_TIME64_SYSCALLS 1
13 13
 #define SUPPORTED_PERSONALITIES 2
14
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }
15
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }

+ 3
- 0
linux/x86_64/arch_defs_.h View File

@@ -9,3 +9,6 @@
9 9
 #define HAVE_ARCH_OLD_SELECT 1
10 10
 #define HAVE_ARCH_UID16_SYSCALLS 1
11 11
 #define SUPPORTED_PERSONALITIES 3
12
+#define PERSONALITY0_AUDIT_ARCH { AUDIT_ARCH_X86_64, 0 }
13
+#define PERSONALITY1_AUDIT_ARCH { AUDIT_ARCH_I386,   0 }
14
+#define PERSONALITY2_AUDIT_ARCH { AUDIT_ARCH_X86_64, __X32_SYSCALL_BIT }

+ 19
- 0
strace.1.in View File

@@ -49,6 +49,7 @@ strace \- trace system calls and signals
49 49
 .OP \-X format
50 50
 .OM \-P path
51 51
 .OM \-p pid
52
+.OP \-\-seccomp\-bpf
52 53
 .BR "" {
53 54
 .OR \-p pid
54 55
 .BR "" |
@@ -68,6 +69,7 @@ strace \- trace system calls and signals
68 69
 .OP \-S sortby
69 70
 .OM \-P path
70 71
 .OM \-p pid
72
+.OP \-\-seccomp\-bpf
71 73
 .BR "" {
72 74
 .OR \-p pid
73 75
 .BR "" |
@@ -970,6 +972,23 @@ Show some debugging output of
970 972
 .B strace
971 973
 itself on the standard error.
972 974
 .TP
975
+.B \-\-seccomp\-bpf
976
+Enable (experimental) usage of seccomp-bpf to have ptrace(2)-stops only when
977
+system calls that are being traced occur in the traced processes.  Implies the
978
+.B \-f
979
+option.
980
+An attempt to rely on seccomp-bpf to filter system calls may fail for various
981
+reasons, e.g. there are too many system calls to filter, the seccomp API is not
982
+available, or
983
+.B strace
984
+itself is being traced.
985
+.B \-\-seccomp\-bpf
986
+is also ineffective on processes attached using
987
+.BR \-p .
988
+In cases when seccomp-bpf filter setup failed,
989
+.B strace
990
+proceeds as usual and stops traced processes on every system call.
991
+.TP
973 992
 .B \-F
974 993
 This option is deprecated.  It is retained for backward compatibility only
975 994
 and may be removed in future releases.

+ 103
- 3
strace.c View File

@@ -31,6 +31,7 @@
31 31
 #endif
32 32
 
33 33
 #include "kill_save_errno.h"
34
+#include "filter_seccomp.h"
34 35
 #include "largefile_wrappers.h"
35 36
 #include "mmap_cache.h"
36 37
 #include "number_set.h"
@@ -239,10 +240,10 @@ usage(void)
239 240
 	printf("\
240 241
 usage: strace [-ACdffhi" K_OPT "qqrtttTvVwxxyyzZ] [-I n] [-b execve] [-e expr]...\n\
241 242
               [-a column] [-o file] [-s strsize] [-X format] [-P path]...\n\
242
-              [-p pid]...\n\
243
+              [-p pid]... [--seccomp-bpf]\n\
243 244
 	      { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
244 245
    or: strace -c[dfwzZ] [-I n] [-b execve] [-e expr]... [-O overhead]\n\
245
-              [-S sortby] [-P path]... [-p pid]...\n\
246
+              [-S sortby] [-P path]... [-p pid]... [--seccomp-bpf]\n\
246 247
               { -p pid | [-D] [-E var=val]... [-u username] PROG [ARGS] }\n\
247 248
 \n\
248 249
 Output format:\n\
@@ -308,6 +309,7 @@ Startup:\n\
308 309
   -u username    run command as username handling setuid and/or setgid\n\
309 310
 \n\
310 311
 Miscellaneous:\n\
312
+  --seccomp-bpf  enable seccomp-bpf filtering\n\
311 313
   -d             enable debug output to stderr\n\
312 314
   -h, --help     print help message\n\
313 315
   -V, --version  print version\n\
@@ -1232,6 +1234,10 @@ exec_or_die(void)
1232 1234
 	if (params_for_tracee.child_sa.sa_handler != SIG_DFL)
1233 1235
 		sigaction(SIGCHLD, &params_for_tracee.child_sa, NULL);
1234 1236
 
1237
+	debug_msg("seccomp filter %s",
1238
+		  seccomp_filtering ? "enabled" : "disabled");
1239
+	if (seccomp_filtering)
1240
+		init_seccomp_filter();
1235 1241
 	execv(params->pathname, params->argv);
1236 1242
 	perror_msg_and_die("exec");
1237 1243
 }
@@ -1470,6 +1476,10 @@ startup_child(char **argv)
1470 1476
 		 * to create a genuine separate stack and execute on it.
1471 1477
 		 */
1472 1478
 	}
1479
+
1480
+	if (seccomp_filtering)
1481
+		tcp->flags |= TCB_SECCOMP_FILTER;
1482
+
1473 1483
 	/*
1474 1484
 	 * A case where straced process is part of a pipe:
1475 1485
 	 * { sleep 1; yes | head -n99999; } | strace -o/dev/null sh -c 'exec <&-; sleep 9'
@@ -1609,7 +1619,12 @@ init(int argc, char *argv[])
1609 1619
 	    "k"
1610 1620
 #endif
1611 1621
 	    "a:Ab:cCdDe:E:fFhiI:o:O:p:P:qrs:S:tTu:vVwxX:yzZ";
1622
+
1623
+	enum {
1624
+		SECCOMP_OPTION = 0x100
1625
+	};
1612 1626
 	static const struct option longopts[] = {
1627
+		{ "seccomp-bpf", no_argument, 0, SECCOMP_OPTION },
1613 1628
 		{ "help", no_argument, 0, 'h' },
1614 1629
 		{ "version", no_argument, 0, 'V' },
1615 1630
 		{ 0, 0, 0, 0 }
@@ -1751,6 +1766,9 @@ init(int argc, char *argv[])
1751 1766
 			add_number_to_set(STATUS_FAILED, status_set);
1752 1767
 			zflags++;
1753 1768
 			break;
1769
+		case SECCOMP_OPTION:
1770
+			seccomp_filtering = true;
1771
+			break;
1754 1772
 		default:
1755 1773
 			error_msg_and_help(NULL);
1756 1774
 			break;
@@ -1768,6 +1786,16 @@ init(int argc, char *argv[])
1768 1786
 		error_msg_and_help("PROG [ARGS] must be specified with -D");
1769 1787
 	}
1770 1788
 
1789
+	if (seccomp_filtering) {
1790
+		if (nprocs && (!argc || debug_flag))
1791
+			error_msg("--seccomp-bpf is not enabled for processes"
1792
+				  " attached with -p");
1793
+		if (!followfork) {
1794
+			error_msg("--seccomp-bpf implies -f");
1795
+			followfork = 1;
1796
+		}
1797
+	}
1798
+
1771 1799
 	if (optF) {
1772 1800
 		if (followfork) {
1773 1801
 			error_msg("deprecated option -F ignored");
@@ -1843,6 +1871,12 @@ init(int argc, char *argv[])
1843 1871
 		ptrace_setoptions |= PTRACE_O_TRACECLONE |
1844 1872
 				     PTRACE_O_TRACEFORK |
1845 1873
 				     PTRACE_O_TRACEVFORK;
1874
+
1875
+	if (seccomp_filtering)
1876
+		check_seccomp_filter();
1877
+	if (seccomp_filtering)
1878
+		ptrace_setoptions |= PTRACE_O_TRACESECCOMP;
1879
+
1846 1880
 	debug_msg("ptrace_setoptions = %#x", ptrace_setoptions);
1847 1881
 	test_ptrace_seize();
1848 1882
 	test_ptrace_get_syscall_info();
@@ -2030,6 +2064,7 @@ print_debug_info(const int pid, int status)
2030 2064
 			[PTRACE_EVENT_VFORK_DONE] = "VFORK_DONE",
2031 2065
 			[PTRACE_EVENT_EXEC]  = "EXEC",
2032 2066
 			[PTRACE_EVENT_EXIT]  = "EXIT",
2067
+			[PTRACE_EVENT_SECCOMP]  = "SECCOMP",
2033 2068
 			/* [PTRACE_EVENT_STOP (=128)] would make biggish array */
2034 2069
 		};
2035 2070
 		const char *e = "??";
@@ -2555,6 +2590,9 @@ next_event(void)
2555 2590
 			case PTRACE_EVENT_EXIT:
2556 2591
 				wd->te = TE_STOP_BEFORE_EXIT;
2557 2592
 				break;
2593
+			case PTRACE_EVENT_SECCOMP:
2594
+				wd->te = TE_SECCOMP;
2595
+				break;
2558 2596
 			default:
2559 2597
 				wd->te = TE_RESTART;
2560 2598
 			}
@@ -2640,7 +2678,7 @@ trace_syscall(struct tcb *tcp, unsigned int *sig)
2640 2678
 static bool
2641 2679
 dispatch_event(const struct tcb_wait_data *wd)
2642 2680
 {
2643
-	unsigned int restart_op = PTRACE_SYSCALL;
2681
+	unsigned int restart_op;
2644 2682
 	unsigned int restart_sig = 0;
2645 2683
 	enum trace_event te = wd ? wd->te : TE_BREAK;
2646 2684
 	/*
@@ -2649,6 +2687,11 @@ dispatch_event(const struct tcb_wait_data *wd)
2649 2687
 	 */
2650 2688
 	int status = wd ? wd->status : 0;
2651 2689
 
2690
+	if (current_tcp && has_seccomp_filter(current_tcp))
2691
+		restart_op = seccomp_filter_restart_operator(current_tcp);
2692
+	else
2693
+		restart_op = PTRACE_SYSCALL;
2694
+
2652 2695
 	switch (te) {
2653 2696
 	case TE_BREAK:
2654 2697
 		return false;
@@ -2659,6 +2702,27 @@ dispatch_event(const struct tcb_wait_data *wd)
2659 2702
 	case TE_RESTART:
2660 2703
 		break;
2661 2704
 
2705
+	case TE_SECCOMP:
2706
+		if (!has_seccomp_filter(current_tcp)) {
2707
+			/*
2708
+			 * We don't know if forks/clones have a seccomp filter
2709
+			 * when they are created, but we can detect it when we
2710
+			 * have a seccomp-stop.
2711
+			 * In such a case, if !seccomp_before_sysentry, we have
2712
+			 * already processed the syscall entry, so we avoid
2713
+			 * processing it a second time.
2714
+			 */
2715
+			current_tcp->flags |= TCB_SECCOMP_FILTER;
2716
+			restart_op = PTRACE_SYSCALL;
2717
+			break;
2718
+		}
2719
+
2720
+		if (seccomp_before_sysentry) {
2721
+			restart_op = PTRACE_SYSCALL;
2722
+			break;
2723
+		}
2724
+		ATTRIBUTE_FALLTHROUGH;
2725
+
2662 2726
 	case TE_SYSCALL_STOP:
2663 2727
 		if (trace_syscall(current_tcp, &restart_sig) < 0) {
2664 2728
 			/*
@@ -2674,6 +2738,42 @@ dispatch_event(const struct tcb_wait_data *wd)
2674 2738
 			 */
2675 2739
 			return true;
2676 2740
 		}
2741
+		if (has_seccomp_filter(current_tcp)) {
2742
+			/*
2743
+			 * Syscall and seccomp stops can happen in different
2744
+			 * orders depending on kernel.  strace tests this in
2745
+			 * check_seccomp_order_tracer().
2746
+			 *
2747
+			 * Linux 3.5--4.7:
2748
+			 * (seccomp-stop before syscall-entry-stop)
2749
+			 *         +--> seccomp-stop ->-PTRACE_SYSCALL->-+
2750
+			 *         |                                     |
2751
+			 *     PTRACE_CONT                   syscall-entry-stop
2752
+			 *         |                                     |
2753
+			 * syscall-exit-stop <---PTRACE_SYSCALL-----<----+
2754
+			 *
2755
+			 * Linux 4.8+:
2756
+			 * (seccomp-stop after syscall-entry-stop)
2757
+			 *                 syscall-entry-stop
2758
+			 *
2759
+			 *         +---->-----PTRACE_CONT---->----+
2760
+			 *         |                              |
2761
+			 *  syscall-exit-stop               seccomp-stop
2762
+			 *         |                              |
2763
+			 *         +----<----PTRACE_SYSCALL---<---+
2764
+			 *
2765
+			 * Note in Linux 4.8+, we restart in PTRACE_CONT
2766
+			 * after syscall-exit to skip the syscall-entry-stop.
2767
+			 * The next seccomp-stop will be treated as a syscall
2768
+			 * entry.
2769
+			 *
2770
+			 * The line below implements this behavior.
2771
+			 * Note that exiting(current_tcp) actually marks
2772
+			 * a syscall-entry-stop because the flag was inverted
2773
+			 * in the above call to trace_syscall.
2774
+			 */
2775
+			restart_op = exiting(current_tcp) ? PTRACE_SYSCALL : PTRACE_CONT;
2776
+		}
2677 2777
 		break;
2678 2778
 
2679 2779
 	case TE_SIGNAL_DELIVERY_STOP:

+ 5
- 0
trace_event.h View File

@@ -66,6 +66,11 @@ enum trace_event {
66 66
 	 * Restart the tracee with signal 0.
67 67
 	 */
68 68
 	TE_STOP_BEFORE_EXIT,
69
+
70
+	/*
71
+	 * SECCOMP_RET_TRACE rule is triggered.
72
+	 */
73
+	TE_SECCOMP,
69 74
 };
70 75
 
71 76
 #endif /* !STRACE_TRACE_EVENT_H */

Loading…
Cancel
Save