Source code

Revision control

Copy as Markdown

Other Tools

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <errno.h>
#include <fcntl.h>
#include <mutex>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include "base/eintr_wrapper.h"
#include "base/logging.h"
#include "base/message_loop.h"
#include "base/process_util.h"
#include "mozilla/DataMutex.h"
#include "mozilla/StaticPtr.h"
#include "mozilla/ipc/IOThread.h"
#include "nsITimer.h"
#include "nsTArray.h"
#include "nsThreadUtils.h"
#include "nsXULAppAPI.h"
#include "prenv.h"
#include "chrome/common/process_watcher.h"
#ifdef MOZ_ENABLE_FORKSERVER
# include "mozilla/ipc/ForkServiceChild.h"
#endif
// Just to make sure the moz.build is doing the right things with
// TARGET_OS and/or OS_TARGET:
#if defined(MOZ_WIDGET_ANDROID) || defined(MOZ_WIDGET_UIKIT)
# error Unsupported OS
#endif
#if !defined(XP_DARWIN)
// Linux, {Free,Net,Open}BSD, and Solaris; but not macOS, yet.
# define HAVE_PIPE2 1
#endif
// The basic idea here is a minimal SIGCHLD handler which writes to a
// pipe and a libevent callback on the I/O thread which fires when the
// other end becomes readable. When we start waiting for process
// termination we check if it had already terminated, and otherwise
// register it to be checked later when SIGCHLD fires.
//
// Making this more complicated is that we usually want to kill the
// process after a timeout, in case it hangs trying to exit, but not
// if it's already exited by that point (see `DelayedKill`).
// But we also support waiting indefinitely, for debug/CI use cases
// like refcount logging / leak detection / code coverage, and in that
// case we block parent process shutdown until all children exit
// (which is done by blocking the I/O thread late in shutdown, which
// isn't ideal, but the Windows implementation has the same issue).
// Maximum amount of time (in milliseconds) to wait for the process to exit.
// XXX/cjones: fairly arbitrary, chosen to match process_watcher_win.cc
static constexpr int kMaxWaitMs = 2000;
// This is also somewhat arbitrary, but loosely based on Try results.
// See also toolkit.asyncshutdown.crash_timeout (currently 60s) after
// which the parent process will be killed.
#ifdef MOZ_CODE_COVERAGE
// Code coverage instrumentation can be slow (especially when writing
// out data, which has to take a lock on the data files).
static constexpr int kShutdownWaitMs = 80000;
#elif defined(MOZ_ASAN) || defined(MOZ_TSAN)
// Sanitizers slow things down in some cases; see bug 1806224.
static constexpr int kShutdownWaitMs = 40000;
#else
static constexpr int kShutdownWaitMs = 8000;
#endif
namespace {
using base::BlockingWait;
// Represents a child process being awaited (which is expected to exit
// soon, or already has).
//
// If `mForce` is null then we will wait indefinitely (and block
// parent shutdown; see above); otherwise it will be killed after a
// timeout (or during parent shutdown, if that happens first).
struct PendingChild {
pid_t mPid;
nsCOMPtr<nsITimer> mForce;
};
// `EnsureProcessTerminated` is called when a process is expected to
// be shutting down, so there should be relatively few `PendingChild`
// instances at any given time, meaning that using an array and doing
// O(n) operations should be fine.
static mozilla::StaticDataMutex<mozilla::StaticAutoPtr<nsTArray<PendingChild>>>
gPendingChildren("ProcessWatcher::gPendingChildren");
static int gSignalPipe[2] = {-1, -1};
static mozilla::Atomic<bool> gProcessWatcherShutdown;
// A wrapper around WaitForProcess to simplify the result (true if the
// process exited and the pid is now freed for reuse, false if it's
// still running), and handle the case where "blocking" mode doesn't
// block (so this function will always return true if `aBlock` is
// `YES`), and log a warning message if the process didn't exit
// successfully (as in `exit(0)`).
static bool IsProcessDead(pid_t pid, BlockingWait aBlock) {
int info = 0;
auto status = WaitForProcess(pid, aBlock, &info);
while (aBlock == BlockingWait::Yes &&
status == base::ProcessStatus::Running) {
// It doesn't matter if this is interrupted; we just need to
// wait for some amount of time while the other process status
// event is (hopefully) handled. This is used only during an
// error case at shutdown, so a 1s wait won't be too noticeable.
sleep(1);
status = WaitForProcess(pid, aBlock, &info);
}
switch (status) {
case base::ProcessStatus::Running:
return false;
case base::ProcessStatus::Exited:
if (info != 0) {
CHROMIUM_LOG(WARNING)
<< "process " << pid << " exited with status " << info;
}
return true;
case base::ProcessStatus::Killed:
CHROMIUM_LOG(WARNING)
<< "process " << pid << " exited on signal " << info;
return true;
case base::ProcessStatus::Error:
CHROMIUM_LOG(ERROR) << "waiting for process " << pid
<< " failed with error " << info;
// Don't keep trying.
return true;
default:
DCHECK(false) << "can't happen";
return true;
}
}
// Creates a timer to kill the process after a delay, for the
// `force=true` case. The timer is bound to the I/O thread, which
// means it needs to be cancelled there (and thus that child exit
// notifications need to be handled on the I/O thread).
already_AddRefed<nsITimer> DelayedKill(pid_t aPid) {
nsCOMPtr<nsITimer> timer;
nsresult rv = NS_NewTimerWithCallback(
getter_AddRefs(timer),
[aPid](nsITimer*) {
// If the process already exited, normally it would remain as
// a zombie and the `SIGKILL` would be ignored. But if the
// fork server crashed, then the child would be reparented to
// pid 1 and cleaned up immediately, so in that case we should
// not try to signal.
if (IsProcessDead(aPid, BlockingWait::No)) {
return;
}
// In theory it's possible for the fork server to crash and
// the child process to exit and have its pid reused by a new
// process all between these two statements, but that is
// *extremely* unlikely.
if (kill(aPid, SIGKILL) != 0) {
const int err = errno;
// Bug 1944669: suppress logging if it's a forkserver child
// process that already exited. (Before bug 1658072 we
// would kill(pid, 0) first, but that doesn't change
// anything.) This can be removed with bug 1752638.
#ifdef MOZ_ENABLE_FORKSERVER
const bool forkServed = mozilla::ipc::ForkServiceChild::WasUsed();
#else
constexpr bool forkServed = false;
#endif
if (err != ESRCH || !forkServed) {
CHROMIUM_LOG(ERROR) << "failed to send SIGKILL to process " << aPid
<< strerror(err);
}
}
// If the process was still running, it will exit and the
// SIGCHLD handler will waitpid it.
},
kMaxWaitMs, nsITimer::TYPE_ONE_SHOT, "ProcessWatcher::DelayedKill",
XRE_GetAsyncIOEventTarget());
// This should happen only during shutdown, in which case we're
// about to kill the process anyway during I/O thread destruction.
if (NS_FAILED(rv)) {
CHROMIUM_LOG(WARNING) << "failed to start kill timer for process " << aPid
<< "; killing immediately";
kill(aPid, SIGKILL);
return nullptr;
}
return timer.forget();
}
bool CrashProcessIfHanging(pid_t aPid) {
if (IsProcessDead(aPid, BlockingWait::No)) {
return false;
}
// If child processes seems to be hanging on shutdown, wait for a
// reasonable time. The wait is global instead of per-process
// because the child processes should be shutting down in
// parallel, and also we're potentially racing global timeouts
// like nsTerminator. (The counter doesn't need to be atomic;
// this is always called on the I/O thread.)
static int sWaitMs = kShutdownWaitMs;
if (sWaitMs > 0) {
CHROMIUM_LOG(WARNING) << "Process " << aPid
<< " may be hanging at shutdown; will wait for up to "
<< sWaitMs << "ms";
}
// There isn't a way to do a time-limited wait that's both
// portable and doesn't require messing with signals. Instead, we
// sleep in short increments and poll the process status.
while (sWaitMs > 0) {
static constexpr int kWaitTickMs = 200;
struct timespec ts = {kWaitTickMs / 1000, (kWaitTickMs % 1000) * 1000000};
HANDLE_EINTR(nanosleep(&ts, &ts));
sWaitMs -= kWaitTickMs;
if (IsProcessDead(aPid, BlockingWait::No)) {
return false;
}
}
// We want TreeHerder to flag this log line as an error, so that
// this is more obviously a deliberate crash; "fatal error" is one
// of the strings it looks for.
CHROMIUM_LOG(ERROR)
<< "Process " << aPid
<< " hanging at shutdown; attempting crash report (fatal error).";
kill(aPid, SIGABRT);
return true;
}
// Most of the logic is here. Reponds to SIGCHLD via the self-pipe,
// and handles shutdown behavior in `WillDestroyCurrentMessageLoop`.
// There is one instance of this class; it's created the first time
// it's used and destroys itself during IPC shutdown.
class ProcessCleaner final : public MessageLoopForIO::Watcher,
public MessageLoop::DestructionObserver {
public:
// Safety: this must be called on the I/O thread.
void Register() {
MessageLoopForIO* loop = MessageLoopForIO::current();
loop->AddDestructionObserver(this);
loop->WatchFileDescriptor(gSignalPipe[0], /* persistent= */ true,
MessageLoopForIO::WATCH_READ, &mWatcher, this);
}
void OnFileCanReadWithoutBlocking(int fd) override {
DCHECK(fd == gSignalPipe[0]);
ssize_t rv;
// Drain the pipe and prune dead processes.
do {
char msg[32];
rv = HANDLE_EINTR(read(gSignalPipe[0], msg, sizeof msg));
CHECK(rv != 0);
if (rv < 0) {
DCHECK(errno == EAGAIN || errno == EWOULDBLOCK);
} else {
#ifdef DEBUG
for (size_t i = 0; i < (size_t)rv; ++i) {
DCHECK(msg[i] == 0);
}
#endif
}
} while (rv > 0);
PruneDeadProcesses();
}
void OnFileCanWriteWithoutBlocking(int fd) override {
CHROMIUM_LOG(FATAL) << "unreachable";
}
void WillDestroyCurrentMessageLoop() override {
gProcessWatcherShutdown = true;
mWatcher.StopWatchingFileDescriptor();
auto lock = gPendingChildren.Lock();
auto& children = lock.ref();
if (children) {
for (const auto& child : *children) {
// If the child still has force-termination pending, do that now.
if (child.mForce) {
// This is too late for timers to run, so no need to Cancel().
//
// FIXME (bug 1724337, approximately): This code isn't run at
// all in practice, because the parent process will already have
// exited (unless the fastShutdownStage pref is changed).
if (kill(child.mPid, SIGKILL) != 0) {
CHROMIUM_LOG(ERROR)
<< "failed to send SIGKILL to process " << child.mPid;
continue;
}
} else {
// Exception for the fake hang tests in ipc/glue/test/browser
// (See also the comment in `~ProcessChild()`.)
if (!PR_GetEnv("MOZ_TEST_CHILD_EXIT_HANG") &&
!CrashProcessIfHanging(child.mPid)) {
continue;
}
}
// If the process was just killed, it should exit immediately;
// otherwise, block until it exits on its own.
IsProcessDead(child.mPid, BlockingWait::Yes);
}
children = nullptr;
}
#ifdef MOZ_ENABLE_FORKSERVER
mozilla::ipc::ForkServiceChild::StopForkServer();
#endif
delete this;
}
private:
MessageLoopForIO::FileDescriptorWatcher mWatcher;
static void PruneDeadProcesses() {
auto lock = gPendingChildren.Lock();
auto& children = lock.ref();
if (!children || children->IsEmpty()) {
return;
}
nsTArray<PendingChild> live;
for (const auto& child : *children) {
if (IsProcessDead(child.mPid, BlockingWait::No)) {
if (child.mForce) {
child.mForce->Cancel();
}
} else {
live.AppendElement(child);
}
}
*children = std::move(live);
}
};
static void HandleSigChld(int signum) {
DCHECK(signum == SIGCHLD);
char msg = 0;
HANDLE_EINTR(write(gSignalPipe[1], &msg, 1));
// Can't log here if this fails (at least not normally; SafeSPrintf
// from security/sandbox/chromium could be used).
//
// (Note that this could fail with EAGAIN if the pipe buffer becomes
// full; this is extremely unlikely, and it doesn't matter because
// the reader will be woken up regardless and doesn't care about the
// number of signals delivered.)
}
static void ProcessWatcherInit() {
int rv;
#ifdef HAVE_PIPE2
rv = pipe2(gSignalPipe, O_NONBLOCK | O_CLOEXEC);
CHECK(rv == 0)
<< "pipe2() failed";
#else
rv = pipe(gSignalPipe);
CHECK(rv == 0)
<< "pipe() failed";
for (int fd : gSignalPipe) {
rv = fcntl(fd, F_SETFL, O_NONBLOCK);
CHECK(rv == 0)
<< "O_NONBLOCK failed";
rv = fcntl(fd, F_SETFD, FD_CLOEXEC);
CHECK(rv == 0)
<< "FD_CLOEXEC failed";
}
#endif // HAVE_PIPE2
// Currently there are no other SIGCHLD handlers; this is debug
// asserted. If the situation changes, it should be relatively
// simple to delegate; note that this ProcessWatcher doesn't
// interfere with child processes it hasn't been asked to handle.
auto oldHandler = signal(SIGCHLD, HandleSigChld);
CHECK(oldHandler != SIG_ERR);
DCHECK(oldHandler == SIG_DFL);
// Start the ProcessCleaner; registering it with the I/O thread must
// happen on the I/O thread itself. It's okay for that to happen
// asynchronously: the callback is level-triggered, so if the signal
// handler already wrote to the pipe at that point then it will be
// detected, and the signal itself is async so additional delay
// doesn't change the semantics.
XRE_GetAsyncIOEventTarget()->Dispatch(
NS_NewRunnableFunction("ProcessCleaner::Register", [] {
ProcessCleaner* pc = new ProcessCleaner();
pc->Register();
}));
}
static void EnsureProcessWatcher() {
static std::once_flag sInited;
std::call_once(sInited, ProcessWatcherInit);
}
} // namespace
mozilla::UniqueFileHandle ProcessWatcher::GetSignalPipe() {
EnsureProcessWatcher();
int fd = gSignalPipe[1];
MOZ_ASSERT(fd >= 0);
fd = dup(fd);
MOZ_ASSERT(fd >= 0);
return mozilla::UniqueFileHandle(fd);
}
/**
* Do everything possible to ensure that |process| has been reaped
* before this process exits.
*
* |force| decides how strict to be with the child's shutdown.
*
* | child exit timeout | upon parent shutdown:
* +--------------------+----------------------------------
* force=true | 2 seconds | kill(child, SIGKILL)
* force=false | infinite | waitpid(child)
*
* If a child process doesn't shut down properly, and |force=false|
* used, then the parent will wait on the child forever. So,
* |force=false| is expected to be used when an external entity can be
* responsible for terminating hung processes, e.g. automated test
* harnesses.
*/
void ProcessWatcher::EnsureProcessTerminated(base::ProcessHandle process,
bool force) {
DCHECK(process != base::GetCurrentProcId());
DCHECK(process > 0);
if (gProcessWatcherShutdown) {
// This late in shutdown, should only come from the I/O thread;
// see further comments below.
mozilla::ipc::AssertIOThread();
// This should always be true given that gProcessWatcherShutdown
// is set, but just in case something changes with MessageLoop
// shutdown:
DCHECK(!MessageLoop::current()->IsAcceptingTasks());
// This is for the fork server itself, being torn down late
// in shutdown. Generally won't be reached with force=true,
// because build types that default to it will QuickExit first.
// It's not strictly necessary to wait for child processes when
// the parent process is about to exit (pid 1 should clean them
// up).
//
// However, if called in "wait forever" mode, let's wait for it
// and log the exit status if it was abnormal:
if (!force) {
(void)IsProcessDead(process, BlockingWait::Yes);
}
return;
}
EnsureProcessWatcher();
auto lock = gPendingChildren.Lock();
auto& children = lock.ref();
// Check if the process already exited. This needs to happen under
// the `gPendingChildren` lock to prevent this sequence:
//
// A1. this non-blocking wait fails
// B1. the process exits
// B2. SIGCHLD is handled
// B3. the ProcessCleaner wakes up and drains the signal pipe
// A2. the process is added to `gPendingChildren`
//
// Holding the lock prevents B3 from occurring between A1 and A2.
if (IsProcessDead(process, BlockingWait::No)) {
return;
}
if (!children) {
children = new nsTArray<PendingChild>();
}
// Check for duplicate pids. This is safe even in corner cases with
// pid reuse: the pid can't be reused by the OS until the zombie
// process has been waited, and both the `waitpid` and the following
// removal of the `PendingChild` object occur while continually
// holding the lock, which is also held here.
for (const auto& child : *children) {
if (child.mPid == process) {
#ifdef MOZ_ENABLE_FORKSERVER
if (mozilla::ipc::ForkServiceChild::WasUsed()) {
// Ideally, this would never be reached. But, in theory it's
// possible if the fork server crashes and is restarted: the
// process will be reparented to pid 1 which will clean it up
// immediately, at which point the pid could be reused (but
// it's very unlikely for that to happen so soon). So, if
// this is reached without any mistakes by the calling code,
// in that case the old process has already terminated and
// ProcessWatcher has no more responsibility for it.
CHROMIUM_LOG(WARNING) << "EnsureProcessTerminated: duplicate process"
" ID "
<< process;
// So, we want to end up with a PendingChild for the new
// process; we can just use the old one. Ideally we'd fix the
// `mForce` value, but that would involve needing to cancel a
// timer when we aren't necessarily on the right thread, and
// in practice the `force` parameter depends only on the build
// type.
return;
}
#endif
MOZ_ASSERT(false,
"EnsureProcessTerminated must be called at most once for a "
"given process");
return;
}
}
PendingChild child{};
child.mPid = process;
if (force) {
child.mForce = DelayedKill(process);
}
children->AppendElement(std::move(child));
}