Skip to content

Commit

Permalink
[8.1.0] Ensure that the lock file still exists when AcquireLock retur…
Browse files Browse the repository at this point in the history
…ns. (#25027)

This makes it possible to use an exclusive lock to guard against
concurrent deletion: otherwise, process 1 might delete the file and
release the exclusive lock in between process 2 opening the file and
acquiring a shared lock.

In practice, process 1 will be garbage-collecting an old install base
while process 2 will be attempting to start itself on the same install
base. (Note that install base locking is yet to be implemented.)

Also remove an ancient workaround for old glibcs: OFD locks have been
available under _GNU_SOURCE on Linux since 2014.

PiperOrigin-RevId: 699142575
Change-Id: Ifc931c1f2952d97c449aaeee17e6ea43cb4c637e
  • Loading branch information
tjgq authored Jan 23, 2025
1 parent 0829739 commit fda421f
Show file tree
Hide file tree
Showing 3 changed files with 205 additions and 161 deletions.
16 changes: 10 additions & 6 deletions src/main/cpp/blaze_util_platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,12 +207,16 @@ enum class LockMode {
kExclusive,
};

// Acquires a `mode` lock on `path`, busy-waiting until it becomes available if
// `block` is true, and releasing it on exec if `batch_mode` is false.
// Crashes if the lock cannot be acquired. Returns a handle that can be
// subsequently passed to ReleaseLock as well as the time spent waiting for the
// lock, if any. The `name` argument is used to distinguish it from other locks
// in human-readable error messages.
// Acquires a `mode` lock on `path`, creating it if doesn't yet exist.
// If `block` is true, busy-wait until the lock becomes available.
// If `batch_mode` is false, release the lock on exec.
// The `path` is guaranteed to exist when this function returns; if it is
// deleted concurrently with obtaining the lock, we recreate it and try again.
// This makes it safe to delete the file under an exclusive lock.
// The `name` argument is used in human-readable error messages.
// Returns a handle that can be subsequently passed to ReleaseLock as well as
// the time spent waiting for the lock, if any.
// Crashes if an error occurs while attempting to obtain the lock.
std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
const std::string& name, const blaze_util::Path& path, LockMode mode,
bool batch_mode, bool block);
Expand Down
205 changes: 106 additions & 99 deletions src/main/cpp/blaze_util_posix.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1 // for F_OFD_SETLK on Linux
#endif

#define _WITH_DPRINTF

#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
Expand Down Expand Up @@ -608,95 +613,124 @@ void SigPrintf(const char *format, ...) {
}
}

static int setlk(int fd, struct flock* lock) {
#ifdef __linux__
// If we're building with glibc <2.20, or another libc which predates
// OFD locks, define the constant ourselves. This assumes that the libc
// and kernel definitions for struct flock are identical.
#ifndef F_OFD_SETLK
#define F_OFD_SETLK 37
#endif
#endif
#ifdef F_OFD_SETLK
// Prefer OFD locks if available. POSIX locks can be lost "accidentally"
static bool TryLock(int fd, LockMode mode, const string& name) {
struct flock lock = {};
lock.l_type = static_cast<short>( // NOLINT (short is the right type)
mode == LockMode::kShared ? F_RDLCK : F_WRLCK);
// Locking is advisory, so any range works as long as it overlaps with the
// ones requested by other processes.
lock.l_whence = SEEK_SET;
lock.l_start = 0;
lock.l_len = 1;
// Prefer OFD locks when available. POSIX locks can be lost "accidentally"
// due to any close() on the lock file, and are not reliably preserved
// across execve() on Linux, which we need for --batch mode.
if (fcntl(fd, F_OFD_SETLK, lock) == 0) return 0;
#ifdef F_OFD_SETLK
if (fcntl(fd, F_OFD_SETLK, &lock) == 0) {
return true;
}
if (errno != EINVAL) {
if (errno != EACCES && errno != EAGAIN) {
BAZEL_DIE(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR)
<< "unexpected result from F_OFD_SETLK: " << GetLastErrorString();
<< "fcntl failed for " << name << " lock: " << GetLastErrorString();
}
return -1;
return false;
}
// F_OFD_SETLK was added in Linux 3.15. Older kernels return EINVAL.
// Fall back to F_SETLK in that case.
// Fall back to POSIX locks on EINVAL.
#endif
if (fcntl(fd, F_SETLK, lock) == 0) return 0;
if (fcntl(fd, F_SETLK, &lock) == 0) {
return true;
}
if (errno != EACCES && errno != EAGAIN) {
BAZEL_DIE(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR)
<< "unexpected result from F_SETLK: " << GetLastErrorString();
<< "fctnl failed for " << name << " lock: " << GetLastErrorString();
}
return -1;
return false;
}

std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
const std::string& name, const blaze_util::Path& path, LockMode mode,
bool batch_mode, bool block) {
int flags = O_CREAT;
switch (mode) {
case LockMode::kShared:
flags |= O_RDONLY;
break;
case LockMode::kExclusive:
flags |= O_RDWR;
break;
static bool StillExists(int fd, const string& name) {
struct stat st;
if (fstat(fd, &st) < 0) {
BAZEL_DIE(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR)
<< "fstat failed for " << name << " lock: " << GetLastErrorString();
}
// Keep server from inheriting a useless fd if we are not in batch mode.
if (!batch_mode) {
flags |= O_CLOEXEC;
return st.st_nlink > 0;
}

static string ReadOwnerInformation(int fd, const string& name) {
// Assume that 4KB are sufficient to fit the owner information.
string buffer(4096, 0);
ssize_t r = pread(fd, &buffer[0], buffer.size(), 0);
if (r < 0) {
BAZEL_LOG(WARNING) << "pread failed for " << name
<< " lock: " << GetLastErrorString();
r = 0;
}
buffer.resize(r);
return buffer;
}

int fd = open(path.AsNativePath().c_str(), flags, 0644);
if (fd < 0) {
string err = GetLastErrorString();
BAZEL_DIE(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR)
<< "open failed for " << name << " lock: " << err;
static void WriteOwnerInformation(int fd) {
(void)ftruncate(fd, 0);
lseek(fd, 0, SEEK_SET);
// Locking is advisory, so it doesn't matter that this may overflow the
// locked range.
dprintf(fd, "pid=%d\nowner=client\n", getpid());
string cwd = blaze_util::GetCwd();
dprintf(fd, "cwd=%s\n", cwd.c_str());
const char* tty = ttyname(STDIN_FILENO); // NOLINT (single-threaded)
if (tty != nullptr) {
dprintf(fd, "tty=%s\n", tty);
}
}

struct flock lock = {};
lock.l_type = static_cast<short>( // NOLINT (short is the right type)
mode == LockMode::kShared ? F_RDLCK : F_WRLCK);
lock.l_whence = SEEK_SET;
lock.l_start = 0;
// This doesn't really matter now, but allows us to subdivide the lock
// later if that becomes meaningful. (Ranges beyond EOF can be locked.)
lock.l_len = 4096;

// Take the lock. If it fails, busy-wait until it becomes available unless
// --noblock_for_lock was set.
//
// We used to rely on fcntl(F_SETLKW) to lazy-wait for the lock to become
// available, which is theoretically fine, but doing so prevents us from
// determining if the PID of the server holding the lock has changed under the
// hood. There have been multiple bug reports where users (especially macOS
// ones) mention that the Blaze invocation hangs on a non-existent PID. This
// should help troubleshoot those scenarios in case there really is a bug
// somewhere.
std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
const std::string& name, const blaze_util::Path& path, LockMode mode,
bool batch_mode, bool block) {
const uint64_t start_time = GetMillisecondsMonotonic();
bool multiple_attempts = false;
string owner;
const uint64_t start_time = GetMillisecondsMonotonic();
while (setlk(fd, &lock) == -1) {
string buffer(4096, 0);
ssize_t r = pread(fd, &buffer[0], buffer.size(), 0);
if (r < 0) {
BAZEL_LOG(WARNING) << "pread() " << name << " lock: " << strerror(errno);
r = 0;

while (true) {
int flags = O_CREAT | (mode == LockMode::kShared ? O_RDONLY : O_RDWR);
// Keep server from inheriting a useless fd if we are not in batch mode.
if (!batch_mode) {
flags |= O_CLOEXEC;
}
buffer.resize(r);
if (owner != buffer) {
// Each time we learn a new lock owner, print it out.
owner = buffer;

int fd = open(path.AsNativePath().c_str(), flags, 0644);
if (fd < 0) {
BAZEL_DIE(blaze_exit_code::LOCAL_ENVIRONMENTAL_ERROR)
<< "open failed for " << name << " lock: " << GetLastErrorString();
}

// Attempt to take the lock.
if (TryLock(fd, mode, name)) {
// Check that the lock file was not concurrently deleted.
if (StillExists(fd, name)) {
// If taking an exclusive lock, identify ourselves in the lock file.
// The contents are printed for human consumption when another client
// fails to take the lock, but not parsed otherwise.
if (mode == LockMode::kExclusive) {
WriteOwnerInformation(fd);
}
// If we succeeded on the first try, report zero wait time to avoid
// unnecessary noise in the logs. We are interested in how long it took
// for other commands to complete, not how fast acquiring a lock is.
const uint64_t end_time = GetMillisecondsMonotonic();
const auto wait_time =
multiple_attempts
? std::make_optional(DurationMillis(start_time, end_time))
: std::nullopt;
return std::make_pair(static_cast<LockHandle>(fd), wait_time);
}
}

// Someone else holds the lock. Obtain the identity of the current lock
// owner and print it out.
string new_owner = ReadOwnerInformation(fd, name);
if (new_owner != owner) {
owner = new_owner;
BAZEL_LOG(USER) << "Another command holds the " << name << " lock: \n"
<< owner;
if (block) {
Expand All @@ -711,38 +745,11 @@ std::pair<LockHandle, std::optional<DurationMillis>> AcquireLock(
<< " lock is held and --noblock_for_lock was given.";
}

TrySleep(500);
multiple_attempts = true;
}
const uint64_t end_time = GetMillisecondsMonotonic();

// If we took the lock on the first try, force the reported wait time to 0 to
// avoid unnecessary noise in the logs. In this metric, we are only
// interested in knowing how long it took for other commands to complete, not
// how fast acquiring a lock is.
const auto elapsed_time =
multiple_attempts
? std::make_optional(DurationMillis(start_time, end_time))
: std::nullopt;

// If taking an exclusive lock, identify ourselves in the lockfile.
// The contents are printed for human consumption when another client
// fails to take the lock, but not parsed otherwise.
if (mode == LockMode::kExclusive) {
(void)ftruncate(fd, 0);
lseek(fd, 0, SEEK_SET);
// Arguably we should ensure this fits in the 4KB we lock. In practice no
// one will have a cwd long enough to overflow that, and nothing currently
// uses the rest of the lock file anyway.
dprintf(fd, "pid=%d\nowner=client\n", getpid());
string cwd = blaze_util::GetCwd();
dprintf(fd, "cwd=%s\n", cwd.c_str());
if (const char* tty = ttyname(STDIN_FILENO)) { // NOLINT (single-threaded)
dprintf(fd, "tty=%s\n", tty);
}
}

return std::make_pair(static_cast<LockHandle>(fd), elapsed_time);
close(fd);
TrySleep(500);
}
}

void ReleaseLock(LockHandle lock_handle) {
Expand Down Expand Up @@ -895,4 +902,4 @@ void EnsurePythonPathOption(vector<string>* options) {
// do nothing.
}

} // namespace blaze.
} // namespace blaze
Loading

0 comments on commit fda421f

Please sign in to comment.