1 /* Authors: Gregory P. Smith & Jeffrey Yasskin */
2 #ifndef Py_BUILD_CORE_BUILTIN
3 #  define Py_BUILD_CORE_MODULE 1
4 #endif
5 
6 #include "Python.h"
7 #include "pycore_fileutils.h"
8 #if defined(HAVE_PIPE2) && !defined(_GNU_SOURCE)
9 # define _GNU_SOURCE
10 #endif
11 #include <unistd.h>
12 #include <fcntl.h>
13 #ifdef HAVE_SYS_TYPES_H
14 #include <sys/types.h>
15 #endif
16 #if defined(HAVE_SYS_STAT_H)
17 #include <sys/stat.h>
18 #endif
19 #ifdef HAVE_SYS_SYSCALL_H
20 #include <sys/syscall.h>
21 #endif
22 #if defined(HAVE_SYS_RESOURCE_H)
23 #include <sys/resource.h>
24 #endif
25 #ifdef HAVE_DIRENT_H
26 #include <dirent.h>
27 #endif
28 #ifdef HAVE_GRP_H
29 #include <grp.h>
30 #endif /* HAVE_GRP_H */
31 
32 #include "posixmodule.h"
33 
34 #ifdef _Py_MEMORY_SANITIZER
35 # include <sanitizer/msan_interface.h>
36 #endif
37 
38 #if defined(__ANDROID__) && __ANDROID_API__ < 21 && !defined(SYS_getdents64)
39 # include <sys/linux-syscalls.h>
40 # define SYS_getdents64  __NR_getdents64
41 #endif
42 
43 #if defined(__linux__) && defined(HAVE_VFORK) && defined(HAVE_SIGNAL_H) && \
44     defined(HAVE_PTHREAD_SIGMASK) && !defined(HAVE_BROKEN_PTHREAD_SIGMASK)
45 /* If this is ever expanded to non-Linux platforms, verify what calls are
46  * allowed after vfork(). Ex: setsid() may be disallowed on macOS? */
47 # include <signal.h>
48 # define VFORK_USABLE 1
49 #endif
50 
51 #if defined(__sun) && defined(__SVR4)
52 /* readdir64 is used to work around Solaris 9 bug 6395699. */
53 # define readdir readdir64
54 # define dirent dirent64
55 # if !defined(HAVE_DIRFD)
56 /* Some versions of Solaris lack dirfd(). */
57 #  define dirfd(dirp) ((dirp)->dd_fd)
58 #  define HAVE_DIRFD
59 # endif
60 #endif
61 
/* Directory whose entries are parsed as open file descriptor numbers by the
 * fd-closing helpers below: "/dev/fd" on FreeBSD, macOS and DragonFly,
 * "/proc/self/fd" on every other platform. */
#if defined(__FreeBSD__) || (defined(__APPLE__) && defined(__MACH__)) || defined(__DragonFly__)
# define FD_DIR "/dev/fd"
#else
# define FD_DIR "/proc/self/fd"
#endif
67 
/* Use the platform's NGROUPS_MAX when it is defined; otherwise fall back
 * to 64. */
#ifdef NGROUPS_MAX
#define MAX_GROUPS NGROUPS_MAX
#else
#define MAX_GROUPS 64
#endif
73 
/* Evaluate `call`; if it yields -1, jump to the `error` label, which must
 * exist in the function that uses this macro.  The do/while(0) wrapper makes
 * the expansion behave as a single statement. */
#define POSIX_CALL(call)   do { if ((call) == -1) goto error; } while (0)
75 
76 static struct PyModuleDef _posixsubprocessmodule;
77 
/* Convert a string of ASCII decimal digits to a non-negative int.
 *
 * No libc function is called, so this can be used from the fd-closing code
 * that runs between fork() and exec().
 *
 * Returns the parsed value, or -1 if the string contains a non-digit
 * character or if the value does not fit in an int.  An empty string
 * yields 0.
 */
static int
_pos_int_from_ascii(const char *name)
{
    int num = 0;
    for (; *name >= '0' && *name <= '9'; ++name) {
        int digit = *name - '0';
        /* num * 10 + digit would exceed INT_MAX: refuse rather than invoke
         * signed-overflow undefined behavior. */
        if (num > (INT_MAX - digit) / 10) {
            return -1;
        }
        num = num * 10 + digit;
    }
    if (*name != '\0') {
        return -1;  /* Non digit found, not a number. */
    }
    return num;
}
91 
92 
93 #if defined(__FreeBSD__) || defined(__DragonFly__)
94 /* When /dev/fd isn't mounted it is often a static directory populated
95  * with 0 1 2 or entries for 0 .. 63 on FreeBSD, NetBSD, OpenBSD and DragonFlyBSD.
96  * NetBSD and OpenBSD have a /proc fs available (though not necessarily
97  * mounted) and do not have fdescfs for /dev/fd.  MacOS X has a devfs
98  * that properly supports /dev/fd.
99  */
100 static int
_is_fdescfs_mounted_on_dev_fd(void)101 _is_fdescfs_mounted_on_dev_fd(void)
102 {
103     struct stat dev_stat;
104     struct stat dev_fd_stat;
105     if (stat("/dev", &dev_stat) != 0)
106         return 0;
107     if (stat(FD_DIR, &dev_fd_stat) != 0)
108         return 0;
109     if (dev_stat.st_dev == dev_fd_stat.st_dev)
110         return 0;  /* / == /dev == /dev/fd means it is static. #fail */
111     return 1;
112 }
113 #endif
114 
115 
116 /* Returns 1 if there is a problem with fd_sequence, 0 otherwise. */
117 static int
_sanity_check_python_fd_sequence(PyObject * fd_sequence)118 _sanity_check_python_fd_sequence(PyObject *fd_sequence)
119 {
120     Py_ssize_t seq_idx;
121     long prev_fd = -1;
122     for (seq_idx = 0; seq_idx < PyTuple_GET_SIZE(fd_sequence); ++seq_idx) {
123         PyObject* py_fd = PyTuple_GET_ITEM(fd_sequence, seq_idx);
124         long iter_fd;
125         if (!PyLong_Check(py_fd)) {
126             return 1;
127         }
128         iter_fd = PyLong_AsLong(py_fd);
129         if (iter_fd < 0 || iter_fd <= prev_fd || iter_fd > INT_MAX) {
130             /* Negative, overflow, unsorted, too big for a fd. */
131             return 1;
132         }
133         prev_fd = iter_fd;
134     }
135     return 0;
136 }
137 
138 
139 /* Is fd found in the sorted Python Sequence? */
140 static int
_is_fd_in_sorted_fd_sequence(int fd,int * fd_sequence,Py_ssize_t fd_sequence_len)141 _is_fd_in_sorted_fd_sequence(int fd, int *fd_sequence,
142                              Py_ssize_t fd_sequence_len)
143 {
144     /* Binary search. */
145     Py_ssize_t search_min = 0;
146     Py_ssize_t search_max = fd_sequence_len - 1;
147     if (search_max < 0)
148         return 0;
149     do {
150         long middle = (search_min + search_max) / 2;
151         long middle_fd = fd_sequence[middle];
152         if (fd == middle_fd)
153             return 1;
154         if (fd > middle_fd)
155             search_min = middle + 1;
156         else
157             search_max = middle - 1;
158     } while (search_min <= search_max);
159     return 0;
160 }
161 
162 /*
163  * Do all the Python C API calls in the parent process to turn the pass_fds
164  * "py_fds_to_keep" tuple into a C array.  The caller owns allocation and
165  * freeing of the array.
166  *
167  * On error an unknown number of array elements may have been filled in.
168  * A Python exception has been set when an error is returned.
169  *
170  * Returns: -1 on error, 0 on success.
171  */
172 static int
convert_fds_to_keep_to_c(PyObject * py_fds_to_keep,int * c_fds_to_keep)173 convert_fds_to_keep_to_c(PyObject *py_fds_to_keep, int *c_fds_to_keep)
174 {
175     Py_ssize_t i, len;
176 
177     len = PyTuple_GET_SIZE(py_fds_to_keep);
178     for (i = 0; i < len; ++i) {
179         PyObject* fdobj = PyTuple_GET_ITEM(py_fds_to_keep, i);
180         long fd = PyLong_AsLong(fdobj);
181         if (fd == -1 && PyErr_Occurred()) {
182             return -1;
183         }
184         if (fd < 0 || fd > INT_MAX) {
185             PyErr_SetString(PyExc_ValueError,
186                             "fd out of range in fds_to_keep.");
187             return -1;
188         }
189         c_fds_to_keep[i] = (int)fd;
190     }
191     return 0;
192 }
193 
194 
195 /* This function must be async-signal-safe as it is called from child_exec()
196  * after fork() or vfork().
197  */
198 static int
make_inheritable(int * c_fds_to_keep,Py_ssize_t len,int errpipe_write)199 make_inheritable(int *c_fds_to_keep, Py_ssize_t len, int errpipe_write)
200 {
201     Py_ssize_t i;
202 
203     for (i = 0; i < len; ++i) {
204         int fd = c_fds_to_keep[i];
205         if (fd == errpipe_write) {
206             /* errpipe_write is part of fds_to_keep. It must be closed at
207                exec(), but kept open in the child process until exec() is
208                called. */
209             continue;
210         }
211         if (_Py_set_inheritable_async_safe(fd, 1, NULL) < 0)
212             return -1;
213     }
214     return 0;
215 }
216 
217 
/* Return an upper bound for the file descriptors this process could have
 * open.  Async signal safe, for use between fork() and exec().
 *
 * Sources are tried in this order, returning the first one that works:
 * fcntl(F_MAXFD) on NetBSD, getrlimit(RLIMIT_NOFILE) on OpenBSD,
 * sysconf(_SC_OPEN_MAX) where available, and finally the constant 256.
 */
static long
safe_get_max_fd(void)
{
#if defined(__NetBSD__)
    {
        long maxfd_result = fcntl(0, F_MAXFD);
        if (maxfd_result >= 0) {
            return maxfd_result;
        }
    }
#endif
#if defined(HAVE_SYS_RESOURCE_H) && defined(__OpenBSD__)
    {
        struct rlimit nofile_limit;
        /* Not on the POSIX async signal safe functions list but likely
         * safe.  TODO - Someone should audit OpenBSD to make sure. */
        if (getrlimit(RLIMIT_NOFILE, &nofile_limit) >= 0) {
            return (long) nofile_limit.rlim_max;
        }
    }
#endif
#ifdef _SC_OPEN_MAX
    {
        long open_max = sysconf(_SC_OPEN_MAX);
        if (open_max != -1) {
            return open_max;
        }
    }
#endif
    return 256;  /* Matches legacy Lib/subprocess.py behavior. */
}
244 
245 
246 /* Close all file descriptors in the given range except for those in
247  * fds_to_keep by invoking closer on each subrange.
248  *
249  * If end_fd == -1, it's guessed via safe_get_max_fd(), but it isn't
250  * possible to know for sure what the max fd to go up to is for
251  * processes with the capability of raising their maximum, or in case
252  * a process opened a high fd and then lowered its maximum.
253  */
254 static int
_close_range_except(int start_fd,int end_fd,int * fds_to_keep,Py_ssize_t fds_to_keep_len,int (* closer)(int,int))255 _close_range_except(int start_fd,
256                     int end_fd,
257                     int *fds_to_keep,
258                     Py_ssize_t fds_to_keep_len,
259                     int (*closer)(int, int))
260 {
261     if (end_fd == -1) {
262         end_fd = Py_MIN(safe_get_max_fd(), INT_MAX);
263     }
264     Py_ssize_t keep_seq_idx;
265     /* As fds_to_keep is sorted we can loop through the list closing
266      * fds in between any in the keep list falling within our range. */
267     for (keep_seq_idx = 0; keep_seq_idx < fds_to_keep_len; ++keep_seq_idx) {
268         int keep_fd = fds_to_keep[keep_seq_idx];
269         if (keep_fd < start_fd)
270             continue;
271         if (closer(start_fd, keep_fd - 1) != 0)
272             return -1;
273         start_fd = keep_fd + 1;
274     }
275     if (start_fd <= end_fd) {
276         if (closer(start_fd, end_fd) != 0)
277             return -1;
278     }
279     return 0;
280 }
281 
282 #if defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)
283 /* It doesn't matter if d_name has room for NAME_MAX chars; we're using this
284  * only to read a directory of short file descriptor number names.  The kernel
285  * will return an error if we didn't give it enough space.  Highly Unlikely.
286  * This structure is very old and stable: It will not change unless the kernel
287  * chooses to break compatibility with all existing binaries.  Highly Unlikely.
288  */
/* Record layout used to walk the buffer filled by the SYS_getdents64
 * syscall.  The field order and types must stay exactly as they are. */
struct linux_dirent64 {
   unsigned long long d_ino;    /* presumably the inode number; not read by the code shown here */
   long long d_off;             /* presumably an offset cookie; not read by the code shown here */
   unsigned short d_reclen;     /* Length of this linux_dirent */
   unsigned char  d_type;       /* presumably the file type; not read by the code shown here */
   char           d_name[256];  /* Filename (null-terminated) */
};
296 
/* Closer callback for _close_range_except(): call close() on every fd in
 * [first, last], ignoring errors.  Does nothing when first > last.
 *
 * Always returns 0.
 */
static int
_brute_force_closer(int first, int last)
{
    for (int fd = first; fd <= last; fd++) {
        /* Ignore errors */
        (void)close(fd);
        if (fd == INT_MAX) {
            /* Incrementing past INT_MAX is signed overflow (undefined
             * behavior, in practice an endless loop); stop here. */
            break;
        }
    }
    return 0;
}
306 
307 /* Close all open file descriptors in the range from start_fd and higher
308  * Do not close any in the sorted fds_to_keep list.
309  *
310  * This version is async signal safe as it does not make any unsafe C library
311  * calls, malloc calls or handle any locks.  It is _unfortunate_ to be forced
312  * to resort to making a kernel system call directly but this is the ONLY api
313  * available that does no harm.  opendir/readdir/closedir perform memory
314  * allocation and locking so while they usually work they are not guaranteed
315  * to (especially if you have replaced your malloc implementation).  A version
316  * of this function that uses those can be found in the _maybe_unsafe variant.
317  *
318  * This is Linux specific because that is all I am ready to test it on.  It
319  * should be easy to add OS specific dirent or dirent64 structures and modify
320  * it with some cpp #define magic to work on other OSes as well if you want.
321  */
322 static void
_close_open_fds_safe(int start_fd,int * fds_to_keep,Py_ssize_t fds_to_keep_len)323 _close_open_fds_safe(int start_fd, int *fds_to_keep, Py_ssize_t fds_to_keep_len)
324 {
325     int fd_dir_fd;
326 
327     fd_dir_fd = _Py_open_noraise(FD_DIR, O_RDONLY);
328     if (fd_dir_fd == -1) {
329         /* No way to get a list of open fds. */
330         _close_range_except(start_fd, -1,
331                             fds_to_keep, fds_to_keep_len,
332                             _brute_force_closer);
333         return;
334     } else {
335         char buffer[sizeof(struct linux_dirent64)];
336         int bytes;
337         while ((bytes = syscall(SYS_getdents64, fd_dir_fd,
338                                 (struct linux_dirent64 *)buffer,
339                                 sizeof(buffer))) > 0) {
340             struct linux_dirent64 *entry;
341             int offset;
342 #ifdef _Py_MEMORY_SANITIZER
343             __msan_unpoison(buffer, bytes);
344 #endif
345             for (offset = 0; offset < bytes; offset += entry->d_reclen) {
346                 int fd;
347                 entry = (struct linux_dirent64 *)(buffer + offset);
348                 if ((fd = _pos_int_from_ascii(entry->d_name)) < 0)
349                     continue;  /* Not a number. */
350                 if (fd != fd_dir_fd && fd >= start_fd &&
351                     !_is_fd_in_sorted_fd_sequence(fd, fds_to_keep,
352                                                   fds_to_keep_len)) {
353                     close(fd);
354                 }
355             }
356         }
357         close(fd_dir_fd);
358     }
359 }
360 
361 #define _close_open_fds_fallback _close_open_fds_safe
362 
363 #else  /* NOT (defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)) */
364 
/* Closer callback for _close_range_except(): hand the range [first, last]
 * to _Py_closerange().  Always returns 0. */
static int
_unsafe_closer(int first, int last)
{
    _Py_closerange(first, last);
    return 0;
}
371 
372 /* Close all open file descriptors from start_fd and higher.
373  * Do not close any in the sorted fds_to_keep tuple.
374  *
375  * This function violates the strict use of async signal safe functions. :(
376  * It calls opendir(), readdir() and closedir().  Of these, the one most
377  * likely to ever cause a problem is opendir() as it performs an internal
378  * malloc().  Practically this should not be a problem.  The Java VM makes the
379  * same calls between fork and exec in its own UNIXProcess_md.c implementation.
380  *
381  * readdir_r() is not used because it provides no benefit.  It is typically
382  * implemented as readdir() followed by memcpy().  See also:
383  *   http://womble.decadent.org.uk/readdir_r-advisory.html
384  */
385 static void
_close_open_fds_maybe_unsafe(int start_fd,int * fds_to_keep,Py_ssize_t fds_to_keep_len)386 _close_open_fds_maybe_unsafe(int start_fd, int *fds_to_keep,
387                              Py_ssize_t fds_to_keep_len)
388 {
389     DIR *proc_fd_dir;
390 #ifndef HAVE_DIRFD
391     while (_is_fd_in_sorted_fd_sequence(start_fd, fds_to_keep,
392                                         fds_to_keep_len)) {
393         ++start_fd;
394     }
395     /* Close our lowest fd before we call opendir so that it is likely to
396      * reuse that fd otherwise we might close opendir's file descriptor in
397      * our loop.  This trick assumes that fd's are allocated on a lowest
398      * available basis. */
399     close(start_fd);
400     ++start_fd;
401 #endif
402 
403 #if defined(__FreeBSD__) || defined(__DragonFly__)
404     if (!_is_fdescfs_mounted_on_dev_fd())
405         proc_fd_dir = NULL;
406     else
407 #endif
408         proc_fd_dir = opendir(FD_DIR);
409     if (!proc_fd_dir) {
410         /* No way to get a list of open fds. */
411         _close_range_except(start_fd, -1, fds_to_keep, fds_to_keep_len,
412                             _unsafe_closer);
413     } else {
414         struct dirent *dir_entry;
415 #ifdef HAVE_DIRFD
416         int fd_used_by_opendir = dirfd(proc_fd_dir);
417 #else
418         int fd_used_by_opendir = start_fd - 1;
419 #endif
420         errno = 0;
421         while ((dir_entry = readdir(proc_fd_dir))) {
422             int fd;
423             if ((fd = _pos_int_from_ascii(dir_entry->d_name)) < 0)
424                 continue;  /* Not a number. */
425             if (fd != fd_used_by_opendir && fd >= start_fd &&
426                 !_is_fd_in_sorted_fd_sequence(fd, fds_to_keep,
427                                               fds_to_keep_len)) {
428                 close(fd);
429             }
430             errno = 0;
431         }
432         if (errno) {
433             /* readdir error, revert behavior. Highly Unlikely. */
434             _close_range_except(start_fd, -1, fds_to_keep, fds_to_keep_len,
435                                 _unsafe_closer);
436         }
437         closedir(proc_fd_dir);
438     }
439 }
440 
441 #define _close_open_fds_fallback _close_open_fds_maybe_unsafe
442 
443 #endif  /* else NOT (defined(__linux__) && defined(HAVE_SYS_SYSCALL_H)) */
444 
445 /* We can use close_range() library function only if it's known to be
446  * async-signal-safe.
447  *
448  * On Linux, glibc explicitly documents it to be a thin wrapper over
449  * the system call, and other C libraries are likely to follow glibc.
450  */
451 #if defined(HAVE_CLOSE_RANGE) && \
452     (defined(__linux__) || defined(__FreeBSD__))
453 #define HAVE_ASYNC_SAFE_CLOSE_RANGE
454 
/* Closer callback for _close_range_except(): close [first, last] with a
 * single close_range() call (flags 0) and return its result unchanged. */
static int
_close_range_closer(int first, int last)
{
    return close_range(first, last, 0);
}
460 #endif
461 
462 static void
_close_open_fds(int start_fd,int * fds_to_keep,Py_ssize_t fds_to_keep_len)463 _close_open_fds(int start_fd, int *fds_to_keep, Py_ssize_t fds_to_keep_len)
464 {
465 #ifdef HAVE_ASYNC_SAFE_CLOSE_RANGE
466     if (_close_range_except(
467             start_fd, INT_MAX, fds_to_keep, fds_to_keep_len,
468             _close_range_closer) == 0) {
469         return;
470     }
471 #endif
472     _close_open_fds_fallback(start_fd, fds_to_keep, fds_to_keep_len);
473 }
474 
475 #ifdef VFORK_USABLE
476 /* Reset dispositions for all signals to SIG_DFL except for ignored
477  * signals. This way we ensure that no signal handlers can run
478  * after we unblock signals in a child created by vfork().
479  */
480 static void
reset_signal_handlers(const sigset_t * child_sigmask)481 reset_signal_handlers(const sigset_t *child_sigmask)
482 {
483     struct sigaction sa_dfl = {.sa_handler = SIG_DFL};
484     for (int sig = 1; sig < _NSIG; sig++) {
485         /* Dispositions for SIGKILL and SIGSTOP can't be changed. */
486         if (sig == SIGKILL || sig == SIGSTOP) {
487             continue;
488         }
489 
490         /* There is no need to reset the disposition of signals that will
491          * remain blocked across execve() since the kernel will do it. */
492         if (sigismember(child_sigmask, sig) == 1) {
493             continue;
494         }
495 
496         struct sigaction sa;
497         /* C libraries usually return EINVAL for signals used
498          * internally (e.g. for thread cancellation), so simply
499          * skip errors here. */
500         if (sigaction(sig, NULL, &sa) == -1) {
501             continue;
502         }
503 
504         /* void *h works as these fields are both pointer types already. */
505         void *h = (sa.sa_flags & SA_SIGINFO ? (void *)sa.sa_sigaction :
506                                               (void *)sa.sa_handler);
507         if (h == SIG_IGN || h == SIG_DFL) {
508             continue;
509         }
510 
511         /* This call can't reasonably fail, but if it does, terminating
512          * the child seems to be too harsh, so ignore errors. */
513         (void) sigaction(sig, &sa_dfl, NULL);
514     }
515 }
516 #endif /* VFORK_USABLE */
517 
518 
/*
 * This function is code executed in the child process immediately after
 * (v)fork to set things up and call exec().
 *
 * All of the code in this function must only use async-signal-safe functions,
 * listed at `man 7 signal` or
 * http://www.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html.
 *
 * This restriction is documented at
 * http://www.opengroup.org/onlinepubs/009695399/functions/fork.html.
 *
 * If this function is called after vfork(), even more care must be taken.
 * The lack of preparations that C libraries normally take on fork(),
 * as well as sharing the address space with the parent, might make even
 * async-signal-safe functions vfork-unsafe. In particular, on Linux,
 * set*id() and setgroups() library functions must not be called, since
 * they have to interact with the library-level thread list and send
 * library-internal signals to implement per-process credentials semantics
 * required by POSIX but not supported natively on Linux. Another reason to
 * avoid this family of functions is that sharing an address space between
 * processes running with different privileges is inherently insecure.
 * See bpo-35823 for further discussion and references.
 *
 * In some C libraries, setrlimit() has the same thread list/signalling
 * behavior since resource limits were per-thread attributes before
 * Linux 2.6.10. Musl, as of 1.2.1, is known to have this issue
 * (https://www.openwall.com/lists/musl/2020/10/15/6).
 *
 * If vfork-unsafe functionality is desired after vfork(), consider using
 * syscall() to obtain it.
 *
 * Parameters:
 *   exec_array     NULL-terminated array of candidate executable paths; each
 *                  is passed to exec in order until one succeeds.
 *   argv, envp     Passed to execve(); when envp is NULL, execv() is used.
 *   cwd            If non-NULL, chdir() to it before exec.
 *   p2cread, c2pwrite, errwrite
 *                  Fds that become fd 0, 1 and 2 respectively; -1 leaves the
 *                  corresponding standard fd untouched.
 *   p2cwrite, c2pread, errread
 *                  Closed here when not -1.
 *   errpipe_read   Closed unconditionally.
 *   errpipe_write  Receives the error report described below.
 *   close_fds      If nonzero, close all fds >= 3 that are not in
 *                  fds_to_keep.
 *   restore_signals
 *                  If nonzero, call _Py_RestoreSignals().
 *   call_setsid, pgid_to_set
 *                  setsid() if call_setsid is nonzero; setpgid(0, pgid_to_set)
 *                  if pgid_to_set >= 0.  Each only where the platform has the
 *                  call.
 *   call_setgroups, groups_size, groups
 *   call_setgid, gid
 *   call_setuid, uid
 *                  Optional setgroups(), setregid(gid, gid) and
 *                  setreuid(uid, uid), performed in that order.
 *   child_umask    Applied with umask() when >= 0.
 *   child_sigmask  Only consulted when VFORK_USABLE is defined: if non-NULL,
 *                  signal handlers are reset and this mask is installed.
 *   fds_to_keep, fds_to_keep_len
 *                  Sorted array of fds that are made inheritable (except
 *                  errpipe_write) and are spared by close_fds.
 *   preexec_fn, preexec_fn_args_tuple
 *                  If preexec_fn is not None and the args tuple is non-NULL,
 *                  preexec_fn is called after the steps above and before the
 *                  fd cleanup and exec.
 *
 * Returns only on failure, after writing one report to errpipe_write:
 *   "OSError:<errno in hex>:"      when errno is non-zero, followed by
 *                                  "noexec" if the failure happened before
 *                                  the preexec_fn stage was reached;
 *   "SubprocessError:0:<message>"  when errno is zero (e.g. preexec_fn
 *                                  raised an exception).
 */
Py_NO_INLINE static void
child_exec(char *const exec_array[],
           char *const argv[],
           char *const envp[],
           const char *cwd,
           int p2cread, int p2cwrite,
           int c2pread, int c2pwrite,
           int errread, int errwrite,
           int errpipe_read, int errpipe_write,
           int close_fds, int restore_signals,
           int call_setsid, pid_t pgid_to_set,
           int call_setgid, gid_t gid,
           int call_setgroups, size_t groups_size, const gid_t *groups,
           int call_setuid, uid_t uid, int child_umask,
           const void *child_sigmask,
           int *fds_to_keep, Py_ssize_t fds_to_keep_len,
           PyObject *preexec_fn,
           PyObject *preexec_fn_args_tuple)
{
    int i, saved_errno, reached_preexec = 0;
    PyObject *result;
    const char* err_msg = "";
    /* Buffer large enough to hold a hex integer.  We can't malloc. */
    char hex_errno[sizeof(saved_errno)*2+1];

    if (make_inheritable(fds_to_keep, fds_to_keep_len, errpipe_write) < 0)
        goto error;

    /* Close parent's pipe ends. */
    if (p2cwrite != -1)
        POSIX_CALL(close(p2cwrite));
    if (c2pread != -1)
        POSIX_CALL(close(c2pread));
    if (errread != -1)
        POSIX_CALL(close(errread));
    POSIX_CALL(close(errpipe_read));

    /* When duping fds, if there arises a situation where one of the fds is
       either 0, 1 or 2, it is possible that it is overwritten (#12607). */
    if (c2pwrite == 0) {
        POSIX_CALL(c2pwrite = dup(c2pwrite));
        /* issue32270 */
        if (_Py_set_inheritable_async_safe(c2pwrite, 0, NULL) < 0) {
            goto error;
        }
    }
    /* Keep duplicating until errwrite no longer occupies fd 0 or fd 1. */
    while (errwrite == 0 || errwrite == 1) {
        POSIX_CALL(errwrite = dup(errwrite));
        /* issue32270 */
        if (_Py_set_inheritable_async_safe(errwrite, 0, NULL) < 0) {
            goto error;
        }
    }

    /* Dup fds for child.
       dup2() removes the CLOEXEC flag but we must do it ourselves if dup2()
       would be a no-op (issue #10806). */
    if (p2cread == 0) {
        if (_Py_set_inheritable_async_safe(p2cread, 1, NULL) < 0)
            goto error;
    }
    else if (p2cread != -1)
        POSIX_CALL(dup2(p2cread, 0));  /* stdin */

    if (c2pwrite == 1) {
        if (_Py_set_inheritable_async_safe(c2pwrite, 1, NULL) < 0)
            goto error;
    }
    else if (c2pwrite != -1)
        POSIX_CALL(dup2(c2pwrite, 1));  /* stdout */

    if (errwrite == 2) {
        if (_Py_set_inheritable_async_safe(errwrite, 1, NULL) < 0)
            goto error;
    }
    else if (errwrite != -1)
        POSIX_CALL(dup2(errwrite, 2));  /* stderr */

    /* We no longer manually close p2cread, c2pwrite, and errwrite here as
     * _close_open_fds takes care when it is not already non-inheritable. */

    if (cwd)
        POSIX_CALL(chdir(cwd));

    if (child_umask >= 0)
        umask(child_umask);  /* umask() always succeeds. */

    if (restore_signals)
        _Py_RestoreSignals();

#ifdef VFORK_USABLE
    if (child_sigmask) {
        reset_signal_handlers(child_sigmask);
        /* pthread_sigmask() returns the error number instead of setting
         * errno, so store it there for the error report. */
        if ((errno = pthread_sigmask(SIG_SETMASK, child_sigmask, NULL))) {
            goto error;
        }
    }
#endif

#ifdef HAVE_SETSID
    if (call_setsid)
        POSIX_CALL(setsid());
#endif

#ifdef HAVE_SETPGID
    if (pgid_to_set >= 0)
        POSIX_CALL(setpgid(0, pgid_to_set));
#endif

#ifdef HAVE_SETGROUPS
    if (call_setgroups)
        POSIX_CALL(setgroups(groups_size, groups));
#endif /* HAVE_SETGROUPS */

#ifdef HAVE_SETREGID
    if (call_setgid)
        POSIX_CALL(setregid(gid, gid));
#endif /* HAVE_SETREGID */

#ifdef HAVE_SETREUID
    if (call_setuid)
        POSIX_CALL(setreuid(uid, uid));
#endif /* HAVE_SETREUID */


    /* From here on, an OSError report no longer carries the "noexec" tag. */
    reached_preexec = 1;
    if (preexec_fn != Py_None && preexec_fn_args_tuple) {
        /* This is where the user has asked us to deadlock their program. */
        result = PyObject_Call(preexec_fn, preexec_fn_args_tuple, NULL);
        if (result == NULL) {
            /* Stringifying the exception or traceback would involve
             * memory allocation and thus potential for deadlock.
             * We've already faced potential deadlock by calling back
             * into Python in the first place, so it probably doesn't
             * matter but we avoid it to minimize the possibility. */
            err_msg = "Exception occurred in preexec_fn.";
            errno = 0;  /* We don't want to report an OSError. */
            goto error;
        }
        /* Py_DECREF(result); - We're about to exec so why bother? */
    }

    /* close FDs after executing preexec_fn, which might open FDs */
    if (close_fds) {
        /* TODO HP-UX could use pstat_getproc() if anyone cares about it. */
        _close_open_fds(3, fds_to_keep, fds_to_keep_len);
    }

    /* This loop matches the Lib/os.py _execvpe()'s PATH search when */
    /* given the executable_list generated by Lib/subprocess.py.     */
    saved_errno = 0;
    for (i = 0; exec_array[i] != NULL; ++i) {
        const char *executable = exec_array[i];
        /* A successful exec never returns; reaching the code below means
         * this candidate failed. */
        if (envp) {
            execve(executable, argv, envp);
        } else {
            execv(executable, argv);
        }
        /* Remember the first failure that is not ENOENT or ENOTDIR. */
        if (errno != ENOENT && errno != ENOTDIR && saved_errno == 0) {
            saved_errno = errno;
        }
    }
    /* Report the first exec error, not the last. */
    if (saved_errno)
        errno = saved_errno;

error:
    saved_errno = errno;
    /* Report the posix error to our parent process. */
    /* We ignore all write() return values as the total size of our writes is
       less than PIPEBUF and we cannot do anything about an error anyways.
       Use _Py_write_noraise() to retry write() if it is interrupted by a
       signal (fails with EINTR). */
    if (saved_errno) {
        char *cur;
        _Py_write_noraise(errpipe_write, "OSError:", 8);
        /* Render saved_errno in hex, filling hex_errno from its end
         * backwards; cur ends up pointing at the most significant digit. */
        cur = hex_errno + sizeof(hex_errno);
        while (saved_errno != 0 && cur != hex_errno) {
            *--cur = Py_hexdigits[saved_errno % 16];
            saved_errno /= 16;
        }
        _Py_write_noraise(errpipe_write, cur, hex_errno + sizeof(hex_errno) - cur);
        _Py_write_noraise(errpipe_write, ":", 1);
        if (!reached_preexec) {
            /* Indicate to the parent that the error happened before exec(). */
            _Py_write_noraise(errpipe_write, "noexec", 6);
        }
        /* We can't call strerror(saved_errno).  It is not async signal safe.
         * The parent process will look the error message up. */
    } else {
        _Py_write_noraise(errpipe_write, "SubprocessError:0:", 18);
        _Py_write_noraise(errpipe_write, err_msg, strlen(err_msg));
    }
}
744 
745 
/* Create the child process and run child_exec() in it.
 *
 * The main purpose of this wrapper function is to isolate vfork() from both
 * subprocess_fork_exec() and child_exec(). A child process created via
 * vfork() executes on the same stack as the parent process while the latter is
 * suspended, so this function should not be inlined to avoid compiler bugs
 * that might clobber data needed by the parent later. Additionally,
 * child_exec() should not be inlined to avoid spurious -Wclobber warnings from
 * GCC (see bpo-35823).
 *
 * When VFORK_USABLE is defined and child_sigmask is non-NULL, vfork() is
 * tried first and fork() is the fallback; otherwise fork() is used.  All
 * other parameters are forwarded unchanged to child_exec().
 *
 * Returns, in the parent, whatever the fork call returned: the child's pid,
 * or -1 if the process could not be created.  The child never returns from
 * this function: if child_exec() comes back, the child calls _exit(255).
 */
Py_NO_INLINE static pid_t
do_fork_exec(char *const exec_array[],
             char *const argv[],
             char *const envp[],
             const char *cwd,
             int p2cread, int p2cwrite,
             int c2pread, int c2pwrite,
             int errread, int errwrite,
             int errpipe_read, int errpipe_write,
             int close_fds, int restore_signals,
             int call_setsid, pid_t pgid_to_set,
             int call_setgid, gid_t gid,
             int call_setgroups, size_t groups_size, const gid_t *groups,
             int call_setuid, uid_t uid, int child_umask,
             const void *child_sigmask,
             int *fds_to_keep, Py_ssize_t fds_to_keep_len,
             PyObject *preexec_fn,
             PyObject *preexec_fn_args_tuple)
{

    pid_t pid;

#ifdef VFORK_USABLE
    if (child_sigmask) {
        /* These are checked by our caller; verify them in debug builds. */
        assert(!call_setuid);
        assert(!call_setgid);
        assert(!call_setgroups);
        assert(preexec_fn == Py_None);

        pid = vfork();
        if (pid == -1) {
            /* If vfork() fails, fall back to using fork(). When it isn't
             * allowed in a process by the kernel, vfork can return -1
             * with errno EINVAL. https://bugs.python.org/issue47151. */
            pid = fork();
        }
    } else
#endif
    {
        pid = fork();
    }

    /* Parent (pid > 0) or failure (pid == -1): hand the result back. */
    if (pid != 0) {
        return pid;
    }

    /* Child process.
     * See the comment above child_exec() for restrictions imposed on
     * the code below.
     */

    if (preexec_fn != Py_None) {
        /* We'll be calling back into Python later so we need to do this.
         * This call may not be async-signal-safe but neither is calling
         * back into Python.  The user asked us to use hope as a strategy
         * to avoid deadlock... */
        PyOS_AfterFork_Child();
    }

    child_exec(exec_array, argv, envp, cwd,
               p2cread, p2cwrite, c2pread, c2pwrite,
               errread, errwrite, errpipe_read, errpipe_write,
               close_fds, restore_signals, call_setsid, pgid_to_set,
               call_setgid, gid, call_setgroups, groups_size, groups,
               call_setuid, uid, child_umask, child_sigmask,
               fds_to_keep, fds_to_keep_len,
               preexec_fn, preexec_fn_args_tuple);
    /* child_exec() only returns when exec failed. */
    _exit(255);
    return 0;  /* Dead code to avoid a potential compiler warning. */
}
825 
826 
827 static PyObject *
subprocess_fork_exec(PyObject * module,PyObject * args)828 subprocess_fork_exec(PyObject *module, PyObject *args)
829 {
830     PyObject *gc_module = NULL;
831     PyObject *executable_list, *py_fds_to_keep;
832     PyObject *env_list, *preexec_fn;
833     PyObject *process_args, *converted_args = NULL, *fast_args = NULL;
834     PyObject *preexec_fn_args_tuple = NULL;
835     PyObject *groups_list;
836     PyObject *uid_object, *gid_object;
837     int p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite;
838     int errpipe_read, errpipe_write, close_fds, restore_signals;
839     int call_setsid;
840     pid_t pgid_to_set = -1;
841     int call_setgid = 0, call_setgroups = 0, call_setuid = 0;
842     uid_t uid;
843     gid_t gid, *groups = NULL;
844     int child_umask;
845     PyObject *cwd_obj, *cwd_obj2 = NULL;
846     const char *cwd;
847     pid_t pid = -1;
848     int need_to_reenable_gc = 0;
849     char *const *exec_array, *const *argv = NULL, *const *envp = NULL;
850     Py_ssize_t arg_num, num_groups = 0;
851     int need_after_fork = 0;
852     int saved_errno = 0;
853     int allow_vfork;
854     int *c_fds_to_keep = NULL;
855 
856     if (!PyArg_ParseTuple(
857             args, "OOpO!OOiiiiiiiiii" _Py_PARSE_PID "OOOiOp:fork_exec",
858             &process_args, &executable_list,
859             &close_fds, &PyTuple_Type, &py_fds_to_keep,
860             &cwd_obj, &env_list,
861             &p2cread, &p2cwrite, &c2pread, &c2pwrite,
862             &errread, &errwrite, &errpipe_read, &errpipe_write,
863             &restore_signals, &call_setsid, &pgid_to_set,
864             &gid_object, &groups_list, &uid_object, &child_umask,
865             &preexec_fn, &allow_vfork))
866         return NULL;
867 
868     if ((preexec_fn != Py_None) &&
869             (PyInterpreterState_Get() != PyInterpreterState_Main())) {
870         PyErr_SetString(PyExc_RuntimeError,
871                         "preexec_fn not supported within subinterpreters");
872         return NULL;
873     }
874 
875     if (close_fds && errpipe_write < 3) {  /* precondition */
876         PyErr_SetString(PyExc_ValueError, "errpipe_write must be >= 3");
877         return NULL;
878     }
879     if (_sanity_check_python_fd_sequence(py_fds_to_keep)) {
880         PyErr_SetString(PyExc_ValueError, "bad value(s) in fds_to_keep");
881         return NULL;
882     }
883 
884     PyInterpreterState *interp = PyInterpreterState_Get();
885     const PyConfig *config = _PyInterpreterState_GetConfig(interp);
886     if (config->_isolated_interpreter) {
887         PyErr_SetString(PyExc_RuntimeError,
888                         "subprocess not supported for isolated subinterpreters");
889         return NULL;
890     }
891 
892     /* We need to call gc.disable() when we'll be calling preexec_fn */
893     if (preexec_fn != Py_None) {
894         need_to_reenable_gc = PyGC_Disable();
895     }
896 
897     exec_array = _PySequence_BytesToCharpArray(executable_list);
898     if (!exec_array)
899         goto cleanup;
900 
901     /* Convert args and env into appropriate arguments for exec() */
902     /* These conversions are done in the parent process to avoid allocating
903        or freeing memory in the child process. */
904     if (process_args != Py_None) {
905         Py_ssize_t num_args;
906         /* Equivalent to:  */
907         /*  tuple(PyUnicode_FSConverter(arg) for arg in process_args)  */
908         fast_args = PySequence_Fast(process_args, "argv must be a tuple");
909         if (fast_args == NULL)
910             goto cleanup;
911         num_args = PySequence_Fast_GET_SIZE(fast_args);
912         converted_args = PyTuple_New(num_args);
913         if (converted_args == NULL)
914             goto cleanup;
915         for (arg_num = 0; arg_num < num_args; ++arg_num) {
916             PyObject *borrowed_arg, *converted_arg;
917             if (PySequence_Fast_GET_SIZE(fast_args) != num_args) {
918                 PyErr_SetString(PyExc_RuntimeError, "args changed during iteration");
919                 goto cleanup;
920             }
921             borrowed_arg = PySequence_Fast_GET_ITEM(fast_args, arg_num);
922             if (PyUnicode_FSConverter(borrowed_arg, &converted_arg) == 0)
923                 goto cleanup;
924             PyTuple_SET_ITEM(converted_args, arg_num, converted_arg);
925         }
926 
927         argv = _PySequence_BytesToCharpArray(converted_args);
928         Py_CLEAR(converted_args);
929         Py_CLEAR(fast_args);
930         if (!argv)
931             goto cleanup;
932     }
933 
934     if (env_list != Py_None) {
935         envp = _PySequence_BytesToCharpArray(env_list);
936         if (!envp)
937             goto cleanup;
938     }
939 
940     if (cwd_obj != Py_None) {
941         if (PyUnicode_FSConverter(cwd_obj, &cwd_obj2) == 0)
942             goto cleanup;
943         cwd = PyBytes_AsString(cwd_obj2);
944     } else {
945         cwd = NULL;
946     }
947 
948     if (groups_list != Py_None) {
949 #ifdef HAVE_SETGROUPS
950         Py_ssize_t i;
951         gid_t gid;
952 
953         if (!PyList_Check(groups_list)) {
954             PyErr_SetString(PyExc_TypeError,
955                     "setgroups argument must be a list");
956             goto cleanup;
957         }
958         num_groups = PySequence_Size(groups_list);
959 
960         if (num_groups < 0)
961             goto cleanup;
962 
963         if (num_groups > MAX_GROUPS) {
964             PyErr_SetString(PyExc_ValueError, "too many groups");
965             goto cleanup;
966         }
967 
968         if ((groups = PyMem_RawMalloc(num_groups * sizeof(gid_t))) == NULL) {
969             PyErr_SetString(PyExc_MemoryError,
970                     "failed to allocate memory for group list");
971             goto cleanup;
972         }
973 
974         for (i = 0; i < num_groups; i++) {
975             PyObject *elem;
976             elem = PySequence_GetItem(groups_list, i);
977             if (!elem)
978                 goto cleanup;
979             if (!PyLong_Check(elem)) {
980                 PyErr_SetString(PyExc_TypeError,
981                                 "groups must be integers");
982                 Py_DECREF(elem);
983                 goto cleanup;
984             } else {
985                 if (!_Py_Gid_Converter(elem, &gid)) {
986                     Py_DECREF(elem);
987                     PyErr_SetString(PyExc_ValueError, "invalid group id");
988                     goto cleanup;
989                 }
990                 groups[i] = gid;
991             }
992             Py_DECREF(elem);
993         }
994         call_setgroups = 1;
995 
996 #else /* HAVE_SETGROUPS */
997         PyErr_BadInternalCall();
998         goto cleanup;
999 #endif /* HAVE_SETGROUPS */
1000     }
1001 
1002     if (gid_object != Py_None) {
1003 #ifdef HAVE_SETREGID
1004         if (!_Py_Gid_Converter(gid_object, &gid))
1005             goto cleanup;
1006 
1007         call_setgid = 1;
1008 
1009 #else /* HAVE_SETREGID */
1010         PyErr_BadInternalCall();
1011         goto cleanup;
1012 #endif /* HAVE_SETREUID */
1013     }
1014 
1015     if (uid_object != Py_None) {
1016 #ifdef HAVE_SETREUID
1017         if (!_Py_Uid_Converter(uid_object, &uid))
1018             goto cleanup;
1019 
1020         call_setuid = 1;
1021 
1022 #else /* HAVE_SETREUID */
1023         PyErr_BadInternalCall();
1024         goto cleanup;
1025 #endif /* HAVE_SETREUID */
1026     }
1027 
1028     Py_ssize_t fds_to_keep_len = PyTuple_GET_SIZE(py_fds_to_keep);
1029     c_fds_to_keep = PyMem_Malloc(fds_to_keep_len * sizeof(int));
1030     if (c_fds_to_keep == NULL) {
1031         PyErr_SetString(PyExc_MemoryError, "failed to malloc c_fds_to_keep");
1032         goto cleanup;
1033     }
1034     if (convert_fds_to_keep_to_c(py_fds_to_keep, c_fds_to_keep) < 0) {
1035         goto cleanup;
1036     }
1037 
1038     /* This must be the last thing done before fork() because we do not
1039      * want to call PyOS_BeforeFork() if there is any chance of another
1040      * error leading to the cleanup: code without calling fork(). */
1041     if (preexec_fn != Py_None) {
1042         preexec_fn_args_tuple = PyTuple_New(0);
1043         if (!preexec_fn_args_tuple)
1044             goto cleanup;
1045         PyOS_BeforeFork();
1046         need_after_fork = 1;
1047     }
1048 
1049     /* NOTE: When old_sigmask is non-NULL, do_fork_exec() may use vfork(). */
1050     const void *old_sigmask = NULL;
1051 #ifdef VFORK_USABLE
1052     /* Use vfork() only if it's safe. See the comment above child_exec(). */
1053     sigset_t old_sigs;
1054     if (preexec_fn == Py_None && allow_vfork &&
1055         !call_setuid && !call_setgid && !call_setgroups) {
1056         /* Block all signals to ensure that no signal handlers are run in the
1057          * child process while it shares memory with us. Note that signals
1058          * used internally by C libraries won't be blocked by
1059          * pthread_sigmask(), but signal handlers installed by C libraries
1060          * normally service only signals originating from *within the process*,
1061          * so it should be sufficient to consider any library function that
1062          * might send such a signal to be vfork-unsafe and do not call it in
1063          * the child.
1064          */
1065         sigset_t all_sigs;
1066         sigfillset(&all_sigs);
1067         if ((saved_errno = pthread_sigmask(SIG_BLOCK, &all_sigs, &old_sigs))) {
1068             goto cleanup;
1069         }
1070         old_sigmask = &old_sigs;
1071     }
1072 #endif
1073 
1074     pid = do_fork_exec(exec_array, argv, envp, cwd,
1075                        p2cread, p2cwrite, c2pread, c2pwrite,
1076                        errread, errwrite, errpipe_read, errpipe_write,
1077                        close_fds, restore_signals, call_setsid, pgid_to_set,
1078                        call_setgid, gid, call_setgroups, num_groups, groups,
1079                        call_setuid, uid, child_umask, old_sigmask,
1080                        c_fds_to_keep, fds_to_keep_len,
1081                        preexec_fn, preexec_fn_args_tuple);
1082 
1083     /* Parent (original) process */
1084     if (pid == -1) {
1085         /* Capture errno for the exception. */
1086         saved_errno = errno;
1087     }
1088 
1089 #ifdef VFORK_USABLE
1090     if (old_sigmask) {
1091         /* vfork() semantics guarantees that the parent is blocked
1092          * until the child performs _exit() or execve(), so it is safe
1093          * to unblock signals once we're here.
1094          * Note that in environments where vfork() is implemented as fork(),
1095          * such as QEMU user-mode emulation, the parent won't be blocked,
1096          * but it won't share the address space with the child,
1097          * so it's still safe to unblock the signals.
1098          *
1099          * We don't handle errors here because this call can't fail
1100          * if valid arguments are given, and because there is no good
1101          * way for the caller to deal with a failure to restore
1102          * the thread signal mask. */
1103         (void) pthread_sigmask(SIG_SETMASK, old_sigmask, NULL);
1104     }
1105 #endif
1106 
1107     if (need_after_fork)
1108         PyOS_AfterFork_Parent();
1109 
1110 cleanup:
1111     if (c_fds_to_keep != NULL) {
1112         PyMem_Free(c_fds_to_keep);
1113     }
1114 
1115     if (saved_errno != 0) {
1116         errno = saved_errno;
1117         /* We can't call this above as PyOS_AfterFork_Parent() calls back
1118          * into Python code which would see the unreturned error. */
1119         PyErr_SetFromErrno(PyExc_OSError);
1120     }
1121 
1122     Py_XDECREF(preexec_fn_args_tuple);
1123     PyMem_RawFree(groups);
1124     Py_XDECREF(cwd_obj2);
1125     if (envp)
1126         _Py_FreeCharPArray(envp);
1127     Py_XDECREF(converted_args);
1128     Py_XDECREF(fast_args);
1129     if (argv)
1130         _Py_FreeCharPArray(argv);
1131     if (exec_array)
1132         _Py_FreeCharPArray(exec_array);
1133 
1134     if (need_to_reenable_gc) {
1135         PyGC_Enable();
1136     }
1137     Py_XDECREF(gc_module);
1138 
1139     return pid == -1 ? NULL : PyLong_FromPid(pid);
1140 }
1141 
1142 
1143 PyDoc_STRVAR(subprocess_fork_exec_doc,
1144 "fork_exec(args, executable_list, close_fds, pass_fds, cwd, env,\n\
1145           p2cread, p2cwrite, c2pread, c2pwrite,\n\
1146           errread, errwrite, errpipe_read, errpipe_write,\n\
1147           restore_signals, call_setsid, pgid_to_set,\n\
1148           gid, groups_list, uid,\n\
1149           preexec_fn)\n\
1150 \n\
1151 Forks a child process, closes parent file descriptors as appropriate in the\n\
1152 child and dups the few that are needed before calling exec() in the child\n\
1153 process.\n\
1154 \n\
1155 If close_fds is true, close file descriptors 3 and higher, except those listed\n\
1156 in the sorted tuple pass_fds.\n\
1157 \n\
1158 The preexec_fn, if supplied, will be called immediately before closing file\n\
1159 descriptors and exec.\n\
1160 WARNING: preexec_fn is NOT SAFE if your application uses threads.\n\
1161          It may trigger infrequent, difficult to debug deadlocks.\n\
1162 \n\
1163 If an error occurs in the child process before the exec, it is\n\
1164 serialized and written to the errpipe_write fd per subprocess.py.\n\
1165 \n\
1166 Returns: the child process's PID.\n\
1167 \n\
1168 Raises: Only on an error in the parent process.\n\
1169 ");
1170 
1171 /* module level code ********************************************************/
1172 
1173 PyDoc_STRVAR(module_doc,
1174 "A POSIX helper for the subprocess module.");
1175 
1176 static PyMethodDef module_methods[] = {
1177     {"fork_exec", subprocess_fork_exec, METH_VARARGS, subprocess_fork_exec_doc},
1178     {NULL, NULL}  /* sentinel */
1179 };
1180 
1181 static PyModuleDef_Slot _posixsubprocess_slots[] = {
1182     {0, NULL}
1183 };
1184 
1185 static struct PyModuleDef _posixsubprocessmodule = {
1186         PyModuleDef_HEAD_INIT,
1187         .m_name = "_posixsubprocess",
1188         .m_doc = module_doc,
1189         .m_size = 0,
1190         .m_methods = module_methods,
1191         .m_slots = _posixsubprocess_slots,
1192 };
1193 
1194 PyMODINIT_FUNC
PyInit__posixsubprocess(void)1195 PyInit__posixsubprocess(void)
1196 {
1197     return PyModuleDef_Init(&_posixsubprocessmodule);
1198 }
1199