Mirror of strace – the Linux syscall tracer

filter_seccomp.c 20KB

/*
 * Copyright (c) 2018 Chen Jingpiao <chenjingpiao@gmail.com>
 * Copyright (c) 2019 Paul Chaignon <paul.chaignon@gmail.com>
 * Copyright (c) 2019 The strace developers.
 * All rights reserved.
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#include "defs.h"

#include "ptrace.h"
#include <signal.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <linux/filter.h>

#include "filter_seccomp.h"
#include "number_set.h"
#include "syscall.h"
#include "scno.h"

bool seccomp_filtering;
bool seccomp_before_sysentry;

#ifdef HAVE_LINUX_SECCOMP_H

# include <linux/seccomp.h>

/* PERSONALITY*_AUDIT_ARCH definitions depend on AUDIT_ARCH_* constants. */
# ifdef PERSONALITY0_AUDIT_ARCH
#  include <linux/audit.h>
#  define XLAT_MACROS_ONLY
#  include "xlat/elf_em.h"
#  include "xlat/audit_arch.h"
#  undef XLAT_MACROS_ONLY
# endif

# ifndef BPF_MAXINSNS
#  define BPF_MAXINSNS 4096
# endif

# define JMP_PLACEHOLDER_NEXT	((unsigned char) -1)
# define JMP_PLACEHOLDER_TRACE	((unsigned char) -2)
# define JMP_PLACEHOLDER_ALLOW	((unsigned char) -3)

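/*
 * Jump targets are not known when jump instructions are emitted: the
 * placeholders above temporarily stand for "skip to the next personality
 * section", "return SECCOMP_RET_TRACE", and "return SECCOMP_RET_ALLOW",
 * and are patched to real offsets by replace_jmp_placeholders() once the
 * length of the section is known.
 */
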
# define SET_BPF(filter, code, jt, jf, k) \
	(*(filter) = (struct sock_filter) { code, jt, jf, k })

# define SET_BPF_STMT(filter, code, k) \
	SET_BPF(filter, code, 0, 0, k)

# define SET_BPF_JUMP(filter, code, k, jt, jf) \
	SET_BPF(filter, BPF_JMP | code, jt, jf, k)

struct audit_arch_t {
	unsigned int arch;
	unsigned int flag;
};

static const struct audit_arch_t audit_arch_vec[SUPPORTED_PERSONALITIES] = {
# if SUPPORTED_PERSONALITIES > 1
	PERSONALITY0_AUDIT_ARCH,
	PERSONALITY1_AUDIT_ARCH,
#  if SUPPORTED_PERSONALITIES > 2
	PERSONALITY2_AUDIT_ARCH,
#  endif
# endif
};

typedef unsigned short (*filter_generator_t)(struct sock_filter *,
					     bool *overflow);
static unsigned short linear_filter_generator(struct sock_filter *,
					      bool *overflow);
static unsigned short binary_match_filter_generator(struct sock_filter *,
						    bool *overflow);
static filter_generator_t filter_generators[] = {
	linear_filter_generator,
	binary_match_filter_generator,
};

/*
 * Keep some margin in seccomp_filter as programs larger than allowed may
 * be constructed before we discard them.
 */
static struct sock_filter
filters[ARRAY_SIZE(filter_generators)][2 * BPF_MAXINSNS];
static struct sock_fprog bpf_prog = {
	.len = USHRT_MAX,
	.filter = NULL,
};

# ifdef HAVE_FORK

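/*
 * Check whether the kernel reports the seccomp stop before or after the
 * syscall-entry stop: fork a child that installs a filter returning
 * SECCOMP_RET_TRACE for gettid only, trace it, and observe in which order
 * the two stops arrive.  The order depends on the kernel version (it
 * changed in Linux 4.8).
 */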
static void ATTRIBUTE_NORETURN
check_seccomp_order_do_child(void)
{
	static const struct sock_filter filter[] = {
		/* return (nr == __NR_gettid) ? RET_TRACE : RET_ALLOW; */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_gettid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)
	};
	static const struct sock_fprog prog = {
		.len = ARRAY_SIZE(filter),
		.filter = (struct sock_filter *) filter
	};

	/* Get everything ready before PTRACE_TRACEME. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP)");

	int pid = getpid();

	if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) < 0) {
		/* Exit with a nonzero exit status. */
		perror_func_msg_and_die("PTRACE_TRACEME");
	}

	GCOV_DUMP;

	kill(pid, SIGSTOP);
	syscall(__NR_gettid);
	_exit(0);
}

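/*
 * Expected stop sequence, using the step numbering of
 * check_seccomp_order_tracer():
 * 0: SIGSTOP raised by the child once the filter is installed;
 * 1, 2: seccomp stop and gettid syscall-entry stop, in either order,
 *       depending on the kernel;
 * 3: gettid syscall-exit stop;
 * 4: exit_group syscall-entry stop;
 * 5: child exit, reported by WIFEXITED.
 */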
static int
check_seccomp_order_tracer(int pid)
{
	unsigned int step;

	for (step = 0; ; ++step) {
		int status;

		for (;;) {
			long rc = waitpid(pid, &status, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			if (rc == pid)
				break;
			/* Cannot happen. */
			perror_func_msg("#%d: unexpected wait result %ld",
					step, rc);
			return pid;
		}

		if (WIFEXITED(status)) {
			/* The tracee is no more. */
			pid = 0;

			int exitstatus = WEXITSTATUS(status);
			if (step == 5 && exitstatus == 0) {
				seccomp_filtering = true;
			} else {
				error_func_msg("#%d: unexpected exit status %u",
					       step, exitstatus);
			}
			break;
		}

		if (WIFSIGNALED(status)) {
			/* The tracee is no more. */
			pid = 0;

			error_func_msg("#%d: unexpected signal %u",
				       step, WTERMSIG(status));
			break;
		}

		if (!WIFSTOPPED(status)) {
			/* Cannot happen. */
			error_func_msg("#%d: unexpected wait status %#x",
				       step, status);
			break;
		}

		unsigned int event = (unsigned int) status >> 16;

		switch (WSTOPSIG(status)) {
		case SIGSTOP:
			if (step != 0) {
				error_func_msg("#%d: unexpected signal stop",
					       step);
				return pid;
			}
			if (ptrace(PTRACE_SETOPTIONS, pid, 0L,
				   PTRACE_O_TRACESYSGOOD|
				   PTRACE_O_TRACESECCOMP) < 0) {
				perror_func_msg("PTRACE_SETOPTIONS");
				return pid;
			}
			break;

		case SIGTRAP:
			if (event != PTRACE_EVENT_SECCOMP) {
				error_func_msg("#%d: unexpected trap %#x",
					       step, event);
				return pid;
			}

			switch (step) {
			case 1: /* Seccomp stop before entering gettid. */
				seccomp_before_sysentry = true;
				break;
			case 2: /* Seccomp stop after entering gettid. */
				if (!seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected seccomp stop",
					       step);
				return pid;
			}
			break;

		case SIGTRAP | 0x80:
			switch (step) {
			case 3: /* Exiting gettid. */
			case 4: /* Entering exit_group. */
				break;
			case 1: /* Entering gettid before seccomp stop. */
				seccomp_before_sysentry = false;
				break;
			case 2: /* Entering gettid after seccomp stop. */
				if (seccomp_before_sysentry)
					break;
				ATTRIBUTE_FALLTHROUGH;
			default:
				error_func_msg("#%d: unexpected syscall stop",
					       step);
				return pid;
			}
			break;

		default:
			error_func_msg("#%d: unexpected stop signal %#x",
				       step, WSTOPSIG(status));
			return pid;
		}

		if (ptrace(PTRACE_SYSCALL, pid, 0L, 0L) < 0) {
			/* Cannot happen. */
			perror_func_msg("#%d: PTRACE_SYSCALL", step);
			break;
		}
	}

	return pid;
}

# endif /* HAVE_FORK */

static void
check_seccomp_order(void)
{
	seccomp_filtering = false;

	/* NOMMU provides no forks necessary for the test. */
# ifdef HAVE_FORK
	int pid = fork();
	if (pid < 0) {
		perror_func_msg("fork");
		return;
	}

	if (pid == 0)
		check_seccomp_order_do_child();

	pid = check_seccomp_order_tracer(pid);
	if (pid) {
		kill(pid, SIGKILL);

		for (;;) {
			long rc = waitpid(pid, NULL, 0);
			if (rc < 0 && errno == EINTR)
				continue;
			break;
		}
	}
# endif /* HAVE_FORK */
}

static bool
traced_by_seccomp(unsigned int scno, unsigned int p)
{
	if (is_number_in_set_array(scno, trace_set, p)
	    || sysent_vec[p][scno].sys_flags
	       & (TRACE_INDIRECT_SUBCALL | TRACE_SECCOMP_DEFAULT))
		return true;
	return false;
}

static void
replace_jmp_placeholders(unsigned char *jmp_offset, unsigned char jmp_next,
			 unsigned char jmp_trace, unsigned char jmp_allow)
{
	switch (*jmp_offset) {
	case JMP_PLACEHOLDER_NEXT:
		*jmp_offset = jmp_next;
		break;
	case JMP_PLACEHOLDER_TRACE:
		*jmp_offset = jmp_trace;
		break;
	case JMP_PLACEHOLDER_ALLOW:
		*jmp_offset = jmp_allow;
		break;
	default:
		break;
	}
}

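/*
 * Emit the comparison for one contiguous range [lower, upper) of traced
 * syscall numbers: a single BPF_JEQ for a lone syscall, or a pair of
 * BPF_JGE checks for a wider range.  Returns the number of instructions
 * emitted.
 */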
static unsigned short
bpf_syscalls_cmp(struct sock_filter *filter,
		 unsigned int lower, unsigned int upper)
{
	if (lower + 1 == upper) {
		/* if (nr == lower) return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JEQ | BPF_K, lower,
			     JMP_PLACEHOLDER_TRACE, 0);
		return 1;
	} else {
		/* if (nr >= lower && nr < upper) return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JGE | BPF_K, lower, 0, 1);
		SET_BPF_JUMP(filter + 1, BPF_JGE | BPF_K, upper, 0,
			     JMP_PLACEHOLDER_TRACE);
		return 2;
	}
}

static unsigned short
linear_filter_generator(struct sock_filter *filter, bool *overflow)
{
	/*
	 * Generated program looks like:
	 * if (arch == AUDIT_ARCH_A && nr >= flag) {
	 *	if (nr == 59)
	 *		return SECCOMP_RET_TRACE;
	 *	if (nr >= 321 && nr <= 323)
	 *		return SECCOMP_RET_TRACE;
	 *	...
	 *	return SECCOMP_RET_ALLOW;
	 * }
	 * if (arch == AUDIT_ARCH_A) {
	 *	...
	 * }
	 * if (arch == AUDIT_ARCH_B) {
	 *	...
	 * }
	 * return SECCOMP_RET_TRACE;
	 */
	unsigned short pos = 0;

# if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
		     offsetof(struct seccomp_data, arch));
# endif

	/*
	 * Personalities are iterated in reverse order in the BPF program so
	 * that the x86 case is naturally handled.  On x86, the first and
	 * third personalities have the same arch identifier.  The third can
	 * be distinguished based on its associated syscall flag, so we check
	 * it first.  The only drawback here is that the first personality is
	 * more common, which may make the BPF program slower to match
	 * syscalls on average.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1; p >= 0; --p) {
		unsigned int lower = UINT_MAX;
		unsigned short start = pos, end;

# if SUPPORTED_PERSONALITIES > 1
		/* if (arch != audit_arch_vec[p].arch) goto next; */
		SET_BPF_JUMP(&filter[pos++], BPF_JEQ | BPF_K,
			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
# endif
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));

# if SUPPORTED_PERSONALITIES > 1
		if (audit_arch_vec[p].flag) {
			/* if (nr < audit_arch_vec[p].flag) goto next; */
			SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
				     audit_arch_vec[p].flag, 2, 0);
			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
				     offsetof(struct seccomp_data, arch));
			SET_BPF_JUMP(&filter[pos++], BPF_JA,
				     JMP_PLACEHOLDER_NEXT, 0, 0);
		}
# endif

		for (unsigned int i = 0; i < nsyscall_vec[p]; ++i) {
			if (traced_by_seccomp(i, p)) {
				if (lower == UINT_MAX)
					lower = i;
				continue;
			}
			if (lower == UINT_MAX)
				continue;
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						i | audit_arch_vec[p].flag);
			lower = UINT_MAX;
		}
		if (lower != UINT_MAX)
			pos += bpf_syscalls_cmp(filter + pos,
						lower | audit_arch_vec[p].flag,
						nsyscall_vec[p]
						| audit_arch_vec[p].flag);
		end = pos;

		/* if (nr >= max_nr) return RET_TRACE; */
		SET_BPF_JUMP(&filter[pos++], BPF_JGE | BPF_K,
			     nsyscall_vec[p] | audit_arch_vec[p].flag, 1, 0);

		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_ALLOW);
		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_TRACE);

		/*
		 * Within generated BPF programs, the origin and destination
		 * of jumps are always in the same personality section.  The
		 * largest jump is therefore the jump from the first
		 * instruction of the section to the last, to skip the
		 * personality and try to compare .arch to the next
		 * personality.
		 * If we have a personality section with more than 255
		 * instructions, the jump offset will overflow.  Such a
		 * program is unlikely to occur, so we simply disable seccomp
		 * filtering in that case.
		 */
		if (pos - start > UCHAR_MAX) {
			*overflow = true;
			return pos;
		}

		for (unsigned int i = start; i < end; ++i) {
			if (BPF_CLASS(filter[i].code) != BPF_JMP)
				continue;
			unsigned char jmp_next = pos - i - 1;
			unsigned char jmp_trace = pos - i - 2;
			unsigned char jmp_allow = pos - i - 3;
			replace_jmp_placeholders(&filter[i].jt, jmp_next,
						 jmp_trace, jmp_allow);
			replace_jmp_placeholders(&filter[i].jf, jmp_next,
						 jmp_trace, jmp_allow);
			if (BPF_OP(filter[i].code) == BPF_JA)
				filter[i].k = (unsigned int) jmp_next;
		}
	}

# if SUPPORTED_PERSONALITIES > 1
	/* Jumps conditioned on .arch default to this RET_TRACE. */
	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
# endif

	return pos;
}

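/*
 * Emit the check for one 32-syscall chunk of the bit array.  A is expected
 * to hold nr / 32 and X to hold 1 << (nr % 32) at this point; trivial
 * all-zeroes and all-ones chunks are handled with a single jump.  Returns
 * the number of instructions emitted.
 */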
static unsigned short
bpf_syscalls_match(struct sock_filter *filter, unsigned int bitarray,
		   unsigned int bitarray_idx)
{
	if (!bitarray) {
		/* return RET_ALLOW; */
		SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
			     JMP_PLACEHOLDER_ALLOW, 0);
		return 1;
	}
	if (bitarray == UINT_MAX) {
		/* return RET_TRACE; */
		SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
			     JMP_PLACEHOLDER_TRACE, 0);
		return 1;
	}
	/*
	 * if (A == nr / 32)
	 *	return (X & bitarray) ? RET_TRACE : RET_ALLOW;
	 */
	SET_BPF_JUMP(filter, BPF_JMP | BPF_JEQ | BPF_K, bitarray_idx,
		     0, 2);
	SET_BPF_STMT(filter + 1, BPF_MISC | BPF_TXA, 0);
	SET_BPF_JUMP(filter + 2, BPF_JMP | BPF_JSET | BPF_K, bitarray,
		     JMP_PLACEHOLDER_TRACE, JMP_PLACEHOLDER_ALLOW);
	return 3;
}

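/*
 * Generated program looks like:
 * if (arch == AUDIT_ARCH_A) {
 *	X = 1 << (nr % 32);
 *	A = nr / 32;
 *	if (A == 0)
 *		return (X & bitarray0) ? RET_TRACE : RET_ALLOW;
 *	if (A == 1)
 *		return (X & bitarray1) ? RET_TRACE : RET_ALLOW;
 *	...
 *	return SECCOMP_RET_ALLOW;
 * }
 * ...
 * return SECCOMP_RET_TRACE;
 */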
static unsigned short
binary_match_filter_generator(struct sock_filter *filter, bool *overflow)
{
	unsigned short pos = 0;

# if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
		     offsetof(struct seccomp_data, arch));
# endif

	/*
	 * Personalities are iterated in reverse order in the BPF program so
	 * that the x86 case is naturally handled.  On x86, the first and
	 * third personalities have the same arch identifier.  The third can
	 * be distinguished based on its associated bit mask, so we check it
	 * first.  The only drawback here is that the first personality is
	 * more common, which may make the BPF program slower to match
	 * syscalls on average.
	 */
	for (int p = SUPPORTED_PERSONALITIES - 1;
	     p >= 0 && pos <= BPF_MAXINSNS;
	     --p) {
		unsigned short start = pos, end;
		unsigned int bitarray = 0;
		unsigned int i;

# if SUPPORTED_PERSONALITIES > 1
		SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JEQ | BPF_K,
			     audit_arch_vec[p].arch, 0, JMP_PLACEHOLDER_NEXT);
# endif
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));

# if SUPPORTED_PERSONALITIES > 1
		if (audit_arch_vec[p].flag) {
			SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JGE | BPF_K,
				     audit_arch_vec[p].flag, 2, 0);
			SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
				     offsetof(struct seccomp_data, arch));
			SET_BPF_JUMP(&filter[pos++], BPF_JMP | BPF_JA,
				     JMP_PLACEHOLDER_NEXT, 0, 0);
			/* nr = nr & ~mask */
			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
				     ~audit_arch_vec[p].flag);
		}
# endif

		/* X = 1 << (nr % 32) = 1 << (nr & 0x1F); */
		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K, 0x1F);
		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_IMM, 1);
		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_LSH | BPF_X, 0);
		SET_BPF_STMT(&filter[pos++], BPF_MISC | BPF_TAX, 0);

		/* A = nr / 32 = nr >> 5; */
		SET_BPF_STMT(&filter[pos++], BPF_LD | BPF_W | BPF_ABS,
			     offsetof(struct seccomp_data, nr));
		if (audit_arch_vec[p].flag) {
			/* nr = nr & ~mask */
			SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_AND | BPF_K,
				     ~audit_arch_vec[p].flag);
		}
		SET_BPF_STMT(&filter[pos++], BPF_ALU | BPF_RSH | BPF_K, 5);

		for (i = 0; i < nsyscall_vec[p] && pos <= BPF_MAXINSNS; ++i) {
			if (traced_by_seccomp(i, p))
				bitarray |= (1 << i % 32);
			if (i % 32 == 31) {
				pos += bpf_syscalls_match(filter + pos,
							  bitarray, i / 32);
				bitarray = 0;
			}
		}
		if (i % 32 != 0)
			pos += bpf_syscalls_match(filter + pos, bitarray,
						  i / 32);
		end = pos;

		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_ALLOW);
		SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K,
			     SECCOMP_RET_TRACE);

		if (pos - start > UCHAR_MAX) {
			*overflow = true;
			return pos;
		}

		for (unsigned int i = start; i < end; ++i) {
			if (BPF_CLASS(filter[i].code) != BPF_JMP)
				continue;
			unsigned char jmp_next = pos - i - 1;
			unsigned char jmp_trace = pos - i - 2;
			unsigned char jmp_allow = pos - i - 3;
			replace_jmp_placeholders(&filter[i].jt, jmp_next,
						 jmp_trace, jmp_allow);
			replace_jmp_placeholders(&filter[i].jf, jmp_next,
						 jmp_trace, jmp_allow);
			if (BPF_OP(filter[i].code) == BPF_JA)
				filter[i].k = (unsigned int) jmp_next;
		}
	}

# if SUPPORTED_PERSONALITIES > 1
	SET_BPF_STMT(&filter[pos++], BPF_RET | BPF_K, SECCOMP_RET_TRACE);
# endif

	return pos;
}

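/*
 * Probe for seccomp filter support.  With a NULL filter program, a kernel
 * that knows SECCOMP_MODE_FILTER is expected to fail with EFAULT while
 * trying to copy the program, whereas a kernel that does not will fail
 * with EINVAL.  Then generate all candidate BPF programs and keep the
 * shortest one that does not overflow.
 */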
static void
check_seccomp_filter_properties(void)
{
	int rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
	seccomp_filtering = rc < 0 && errno != EINVAL;
	if (!seccomp_filtering) {
		debug_func_perror_msg("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
		return;
	}

	for (unsigned int i = 0; i < ARRAY_SIZE(filter_generators); ++i) {
		bool overflow = false;
		unsigned short len = filter_generators[i](filters[i],
							  &overflow);
		if (len < bpf_prog.len && !overflow) {
			bpf_prog.len = len;
			bpf_prog.filter = filters[i];
		}
	}
	if (bpf_prog.len == USHRT_MAX) {
		debug_msg("seccomp filter disabled due to jump offset "
			  "overflow");
		seccomp_filtering = false;
	} else if (bpf_prog.len > BPF_MAXINSNS) {
		debug_msg("seccomp filter disabled due to BPF program "
			  "being oversized (%u > %d)", bpf_prog.len,
			  BPF_MAXINSNS);
		seccomp_filtering = false;
	}

	if (seccomp_filtering)
		check_seccomp_order();
}

static void
dump_seccomp_bpf(void)
{
	const struct sock_filter *filter = bpf_prog.filter;
	for (unsigned int i = 0; i < bpf_prog.len; ++i) {
		switch (filter[i].code) {
		case BPF_LD | BPF_W | BPF_ABS:
			switch (filter[i].k) {
			case offsetof(struct seccomp_data, arch):
				error_msg("STMT(BPF_LDWABS, data->arch)");
				break;
			case offsetof(struct seccomp_data, nr):
				error_msg("STMT(BPF_LDWABS, data->nr)");
				break;
			default:
				error_msg("STMT(BPF_LDWABS, 0x%x)",
					  filter[i].k);
			}
			break;
		case BPF_LD | BPF_W | BPF_IMM:
			error_msg("STMT(BPF_LDWIMM, 0x%x)", filter[i].k);
			break;
		case BPF_RET | BPF_K:
			switch (filter[i].k) {
			case SECCOMP_RET_TRACE:
				error_msg("STMT(BPF_RET, SECCOMP_RET_TRACE)");
				break;
			case SECCOMP_RET_ALLOW:
				error_msg("STMT(BPF_RET, SECCOMP_RET_ALLOW)");
				break;
			default:
				error_msg("STMT(BPF_RET, 0x%x)", filter[i].k);
			}
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
			error_msg("JUMP(BPF_JEQ, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JGE | BPF_K:
			error_msg("JUMP(BPF_JGE, %u, %u, %u)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JSET | BPF_K:
			error_msg("JUMP(BPF_JSET, %u, %u, 0x%x)",
				  filter[i].jt, filter[i].jf,
				  filter[i].k);
			break;
		case BPF_JMP | BPF_JA:
			error_msg("JUMP(BPF_JA, %u)", filter[i].k);
			break;
		case BPF_ALU | BPF_RSH | BPF_K:
			error_msg("STMT(BPF_RSH, %u)", filter[i].k);
			break;
		case BPF_ALU | BPF_LSH | BPF_X:
			error_msg("STMT(BPF_LSH, X)");
			break;
		case BPF_ALU | BPF_AND | BPF_K:
			error_msg("STMT(BPF_AND, 0x%x)", filter[i].k);
			break;
		case BPF_MISC | BPF_TAX:
			error_msg("STMT(BPF_TAX)");
			break;
		case BPF_MISC | BPF_TXA:
			error_msg("STMT(BPF_TXA)");
			break;
		default:
			error_msg("STMT(0x%x, %u, %u, 0x%x)", filter[i].code,
				  filter[i].jt, filter[i].jf, filter[i].k);
		}
	}
}

void
init_seccomp_filter(void)
{
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		perror_func_msg_and_die("prctl(PR_SET_NO_NEW_PRIVS)");

	if (debug_flag)
		dump_seccomp_bpf();

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf_prog) < 0)
		perror_func_msg_and_die("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
}

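/*
 * Decide how to restart a stopped tracee: use PTRACE_SYSCALL when a
 * seccomp-traced syscall is in flight and its syscall-exit stop is still
 * wanted, and PTRACE_CONT otherwise, so that untraced syscalls run
 * without generating ptrace stops.
 */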
int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
	if (exiting(tcp) && tcp->scno < nsyscall_vec[current_personality]
	    && traced_by_seccomp(tcp->scno, current_personality))
		return PTRACE_SYSCALL;
	return PTRACE_CONT;
}

#else /* !HAVE_LINUX_SECCOMP_H */

# warning <linux/seccomp.h> is not available, seccomp filtering is not supported

static void
check_seccomp_filter_properties(void)
{
	seccomp_filtering = false;
}

void
init_seccomp_filter(void)
{
}

int
seccomp_filter_restart_operator(const struct tcb *tcp)
{
	return PTRACE_SYSCALL;
}

#endif

void
check_seccomp_filter(void)
{
	/* Let's avoid enabling seccomp if all syscalls are traced. */
	seccomp_filtering = !is_complete_set_array(trace_set, nsyscall_vec,
						   SUPPORTED_PERSONALITIES);
	if (!seccomp_filtering) {
		error_msg("Seccomp filter is requested "
			  "but there are no syscalls to filter. "
			  "See -e trace to filter syscalls.");
		return;
	}

	check_seccomp_filter_properties();

	if (!seccomp_filtering)
		error_msg("seccomp filter is requested but unavailable");
}