Skip to content

Commit

Permalink
change lock status memory ordering, tag spans
Browse files Browse the repository at this point in the history
This changes the memory ordering for the lock status load to `SeqCst` from
`Relaxed` to eliminate a potential source of panics.

Panics had the frames:
```
pingora_proxy::proxy_cache::<T>::handle_lock_status (proxy_cache.rs:748)
pingora_proxy::proxy_cache::<T>::proxy_cache::{{closure}} (proxy_cache.rs:211)
pingora_proxy::HttpProxy<T>::process_request::{{closure}} (lib.rs:509)
pingora_proxy::HttpProxy<T>::process_new_http::{{closure}} (lib.rs:727)
```

which showed we were checking the status of the lock after waiting on it,
and still seeing its status as waiting. The status is returned by value, so this
is not a time-of-check to time-of-use problem; rather, it is an inconsistency in
how the lock status is managed. The change in memory ordering is mostly for the
sake of this programmer's attempts to understand what is happening.

This also completes a couple of TODOs: limiting the wait period so it does not
extend beyond the lock's start time plus its timeout, and tagging the tracing
span with the lock status.
  • Loading branch information
gumpt authored and drcaramelsyrup committed Sep 6, 2024
1 parent 5e3e774 commit e288bfe
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .bleep
Original file line number Diff line number Diff line change
@@ -1 +1 @@
30c624970d46b07efd110fcfb8dd0b6f9a099e2b
2351cdf592f9986201d754e6ee1f37f493f69abb
1 change: 1 addition & 0 deletions pingora-cache/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ lru = { workspace = true }
ahash = { workspace = true }
hex = "0.4"
httparse = { workspace = true }
strum = { version = "0.26", features = ["derive"] }

[dev-dependencies]
tokio-test = "0.4"
Expand Down
8 changes: 6 additions & 2 deletions pingora-cache/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use key::{CacheHashKey, HashBinary};
use lock::WritePermit;
use pingora_error::Result;
use pingora_http::ResponseHeader;
use rustracing::tag::Tag;
use std::time::{Duration, Instant, SystemTime};
use trace::CacheTraceCTX;

Expand Down Expand Up @@ -1047,7 +1048,7 @@ impl HttpCache {
/// Check [Self::is_cache_locked()], panic if this request doesn't have a read lock.
pub async fn cache_lock_wait(&mut self) -> LockStatus {
let inner = self.inner_mut();
let _span = inner.traces.child("cache_lock");
let mut span = inner.traces.child("cache_lock");
let lock = inner.lock.take(); // remove the lock from self
if let Some(Locked::Read(r)) = lock {
let now = Instant::now();
Expand All @@ -1059,7 +1060,10 @@ impl HttpCache {
.lock_duration
.map_or(lock_duration, |d| d + lock_duration),
);
r.lock_status() // TODO: tag the span with lock status
let status = r.lock_status();
let tag_value: &'static str = status.into();
span.set_tag(|| Tag::new("status", tag_value));
status
} else {
// should always call is_cache_locked() before this function
panic!("cache_lock_wait on wrong type of lock")
Expand Down
27 changes: 20 additions & 7 deletions pingora-cache/src/lock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,14 @@ impl CacheLock {
}
}

use log::warn;
use std::sync::atomic::{AtomicU8, Ordering};
use std::time::{Duration, Instant};
use strum::IntoStaticStr;
use tokio::sync::Semaphore;

/// Status which the read locks could possibly see.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, IntoStaticStr)]
pub enum LockStatus {
/// Waiting for the writer to populate the asset
Waiting,
Expand Down Expand Up @@ -180,7 +182,7 @@ impl LockCore {
}

fn lock_status(&self) -> LockStatus {
self.lock_status.load(Ordering::Relaxed).into()
self.lock_status.load(Ordering::SeqCst).into()
}
}

Expand All @@ -197,11 +199,22 @@ impl ReadLock {
return;
}

// TODO: should subtract now - start so that the lock don't wait beyond start + timeout
// Also need to be careful not to wake everyone up at the same time
// TODO: need to be careful not to wake everyone up at the same time
// (maybe not an issue because regular cache lock release behaves that way)
let _ = timeout(self.0.timeout, self.0.lock.acquire()).await;
// permit is returned to Semaphore right away
if let Some(duration) = self.0.timeout.checked_sub(self.0.lock_start.elapsed()) {
match timeout(duration, self.0.lock.acquire()).await {
Ok(Ok(_)) => { // permit is returned to Semaphore right away
}
Ok(Err(e)) => {
warn!("error acquiring semaphore {e:?}")
}
Err(_) => {
self.0
.lock_status
.store(LockStatus::Timeout.into(), Ordering::SeqCst);
}
}
}
}

/// Test if it is still locked
Expand All @@ -211,7 +224,7 @@ impl ReadLock {

/// Whether the lock is expired, e.g., the writer has been holding the lock for too long
pub fn expired(&self) -> bool {
// NOTE: this whether the lock is currently expired
// NOTE: this is whether the lock is currently expired
// not whether it was timed out during wait()
self.0.lock_start.elapsed() >= self.0.timeout
}
Expand Down

0 comments on commit e288bfe

Please sign in to comment.