Skip to content

Commit

Permalink
pidfd: implement PIDFD_THREAD flag for pidfd_open()
Browse files Browse the repository at this point in the history
With this flag:

	- pidfd_open() doesn't require the target task to be
	  a thread-group leader

	- pidfd_poll() succeeds when the task exits and becomes a
	  zombie (iow, passes exit_notify()), even if it is a leader
	  and thread-group is not empty.

	  This means that the behaviour of pidfd_poll(PIDFD_THREAD,
	  pid-of-group-leader) is not well defined if it races with
	  exec() from its sub-thread; pidfd_poll() can succeed or not
	  depending on whether pidfd_task_exited() is called before
	  or after exchange_tids().

	  Perhaps we can improve this behaviour later, pidfd_poll()
	  can probably take sig->group_exec_task into account. But
	  this doesn't really differ from the case when the leader
	  exits before other threads (so pidfd_poll() succeeds) and
	  then another thread execs and pidfd_poll() will block again.

thread_group_exited() is no longer used, perhaps it can die.

Co-developed-by: Tycho Andersen <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Tested-by: Tycho Andersen <[email protected]>
Reviewed-by: Tycho Andersen <[email protected]>
Signed-off-by: Christian Brauner <[email protected]>
  • Loading branch information
oleg-nesterov authored and brauner committed Feb 2, 2024
1 parent 21e2520 commit 64bef69
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 23 deletions.
6 changes: 5 additions & 1 deletion fs/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)

BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;

/*
* leader and tsk exchanged their pids, the old pid dies,
* wake up the PIDFD_THREAD waiters.
*/
do_notify_pidfd(leader);
/*
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
Expand Down
3 changes: 2 additions & 1 deletion include/linux/pid.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;

struct file;

extern struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
void do_notify_pidfd(struct task_struct *task);

static inline struct pid *get_pid(struct pid *pid)
{
Expand Down
3 changes: 2 additions & 1 deletion include/uapi/linux/pidfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <linux/fcntl.h>

/* Flags for pidfd_open(). */
#define PIDFD_NONBLOCK O_NONBLOCK
#define PIDFD_NONBLOCK O_NONBLOCK
#define PIDFD_THREAD O_EXCL

#endif /* _UAPI_LINUX_PIDFD_H */
7 changes: 7 additions & 0 deletions kernel/exit.c
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);

tsk->exit_state = EXIT_ZOMBIE;
/*
* sub-thread or delay_group_leader(), wake up the
* PIDFD_THREAD waiters.
*/
if (!thread_group_empty(tsk))
do_notify_pidfd(tsk);

if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
Expand Down
38 changes: 31 additions & 7 deletions kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
Expand Down Expand Up @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)

seq_put_decimal_ll(m, "Pid:\t", nr);

/* TODO: report PIDFD_THREAD */

#ifdef CONFIG_PID_NS
seq_put_decimal_ll(m, "\nNSpid:\t", nr);
if (nr > 0) {
Expand All @@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif

/*
 * Report whether the task attached to @pid (as PIDTYPE_PID) counts as
 * exited for a poller.
 *
 * Returns true when no task is attached to @pid any more, or when the
 * task has a non-zero exit_state and either @thread is set or its
 * thread group is empty. Returns false otherwise.
 */
static bool pidfd_task_exited(struct pid *pid, bool thread)
{
	struct task_struct *tsk;
	bool done = true;	/* no task attached: treat as exited */

	/* The task lookup and the checks on it run under rcu_read_lock(). */
	rcu_read_lock();
	tsk = pid_task(pid, PIDTYPE_PID);
	if (tsk) {
		if (!READ_ONCE(tsk->exit_state))
			done = false;
		else
			done = thread || thread_group_empty(tsk);
	}
	rcu_read_unlock();

	return done;
}

/*
* Poll support for process exit notification.
*/
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct pid *pid = file->private_data;
bool thread = file->f_flags & PIDFD_THREAD;
__poll_t poll_flags = 0;

poll_wait(file, &pid->wait_pidfd, pts);

/*
* Inform pollers only when the whole thread group exits.
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
* Depending on PIDFD_THREAD, inform pollers when the thread
* or the whole thread-group exits.
*/
if (thread_group_exited(pid))
if (pidfd_task_exited(pid, thread))
poll_flags = EPOLLIN | EPOLLRDNORM;

return poll_flags;
Expand Down Expand Up @@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
return PTR_ERR(pidfd_file);
}
get_pid(pid); /* held by pidfd_file now */
/*
* anon_inode_getfile() ignores everything outside of the
* O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
*/
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
*ret = pidfd_file;
return pidfd;
}
Expand All @@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
* The helper verifies that @pid is used as a thread group leader.
* The helper verifies that @pid is still in use. Without PIDFD_THREAD the
* task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
Expand All @@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
*/
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
bool thread = flags & PIDFD_THREAD;

if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
return -EINVAL;

return __pidfd_prepare(pid, flags, ret);
Expand Down
14 changes: 3 additions & 11 deletions kernel/pid.c
Original file line number Diff line number Diff line change
Expand Up @@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Return the task associated with @pidfd. The function takes a reference on
* the returned task. The caller is responsible for releasing that reference.
*
* Currently, the process identified by @pidfd is always a thread-group leader.
* This restriction currently exists for all aspects of pidfds including pidfd
* creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
* (only supports thread group leaders).
*
* Return: On success, the task_struct associated with the pidfd.
* On error, a negative errno number will be returned.
*/
Expand Down Expand Up @@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
* the process identified by @pid. Currently, the process identified by
* @pid must be a thread-group leader. This restriction currently exists
* for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
* be used with CLONE_THREAD) and pidfd polling (only supports thread group
* leaders).
* the task identified by @pid. Without PIDFD_THREAD flag the target task
* must be a thread-group leader.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
Expand All @@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
int fd;
struct pid *p;

if (flags & ~PIDFD_NONBLOCK)
if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
return -EINVAL;

if (pid <= 0)
Expand Down
6 changes: 4 additions & 2 deletions kernel/signal.c
Original file line number Diff line number Diff line change
Expand Up @@ -2019,7 +2019,7 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
return ret;
}

static void do_notify_pidfd(struct task_struct *task)
void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;

Expand Down Expand Up @@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/*
* tsk is a group leader and has no threads, wake up the pidfd waiters.
* tsk is a group leader and has no threads, wake up the
* non-PIDFD_THREAD waiters.
*/
if (thread_group_empty(tsk))
do_notify_pidfd(tsk);
Expand Down Expand Up @@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
prepare_kill_siginfo(sig, &kinfo);
}

/* TODO: respect PIDFD_THREAD */
ret = kill_pid_info(sig, &kinfo, pid);

err:
Expand Down

0 comments on commit 64bef69

Please sign in to comment.