Skip to content

Commit 958fd4d

Browse files
committed
Claude-created paginated listing
1 parent 4ea16c0 commit 958fd4d

File tree

2 files changed

+171
-10
lines changed

2 files changed

+171
-10
lines changed

obstore/src/list.rs

Lines changed: 109 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
88
use futures::stream::{BoxStream, Fuse};
99
use futures::StreamExt;
1010
use indexmap::IndexMap;
11-
use object_store::list::PaginatedListStore;
11+
use object_store::list::{PaginatedListOptions, PaginatedListStore};
1212
use object_store::path::Path;
1313
use object_store::{ListResult, ObjectMeta, ObjectStore};
1414
use pyo3::exceptions::{PyImportError, PyStopAsyncIteration, PyStopIteration, PyValueError};
@@ -24,7 +24,7 @@ use pyo3_object_store::{
2424
};
2525
use tokio::sync::Mutex;
2626

27-
enum MaybePaginatedListStore {
27+
/// Dispatch wrapper over the two listing capabilities a store may have:
/// either native server-side pagination (`PaginatedListStore`) or only the
/// plain streaming `ObjectStore` listing API.
pub(crate) enum MaybePaginatedListStore {
    // Store natively supports page-token based listing.
    SupportsPagination(Arc<dyn PaginatedListStore>),
    // Fallback: only the streaming `list`/`list_with_offset` API is available.
    NoPagination(Arc<dyn ObjectStore>),
}
@@ -452,7 +452,7 @@ impl<'py> IntoPyObject<'py> for PyListResult {
452452
#[pyo3(signature = (store, prefix=None, *, offset=None, chunk_size=50, return_arrow=false))]
453453
pub(crate) fn list(
454454
py: Python,
455-
store: PyObjectStore,
455+
store: MaybePaginatedListStore,
456456
prefix: Option<String>,
457457
offset: Option<String>,
458458
chunk_size: usize,
@@ -470,12 +470,13 @@ pub(crate) fn list(
470470
.map_err(|err| PyImportError::new_err(format!("{msg}\n\n{err}")))?;
471471
}
472472

473-
let store = store.into_inner().clone();
474-
let prefix = prefix.map(|s| s.into());
475-
let stream = if let Some(offset) = offset {
476-
store.list_with_offset(prefix.as_ref(), &offset.into())
477-
} else {
478-
store.list(prefix.as_ref())
473+
let stream = match store {
474+
MaybePaginatedListStore::SupportsPagination(paginated_store) => {
475+
create_paginated_stream(paginated_store, prefix, offset, chunk_size)
476+
}
477+
MaybePaginatedListStore::NoPagination(object_store) => {
478+
create_filtered_stream(object_store, prefix, offset)
479+
}
479480
};
480481
Ok(PyListStream::new(stream, chunk_size, return_arrow))
481482
}
@@ -526,3 +527,102 @@ async fn list_with_delimiter_materialize(
526527
let list_result = store.list_with_delimiter(prefix).await?;
527528
Ok(PyListResult::new(list_result, return_arrow))
528529
}
530+
531+
fn create_paginated_stream(
532+
store: Arc<dyn PaginatedListStore>,
533+
prefix: Option<String>,
534+
offset: Option<String>,
535+
chunk_size: usize,
536+
) -> BoxStream<'static, object_store::Result<ObjectMeta>> {
537+
// Create a stream that will fetch from the paginated store
538+
let stream = futures::stream::unfold(
539+
(store, prefix, offset, None, true),
540+
move |(store, prefix, offset, page_token, has_more)| async move {
541+
if !has_more {
542+
return None;
543+
}
544+
545+
let opts = PaginatedListOptions {
546+
offset: offset.clone(),
547+
delimiter: None,
548+
max_keys: Some(chunk_size),
549+
page_token,
550+
..Default::default()
551+
};
552+
553+
match store.list_paginated(prefix.as_deref(), opts).await {
554+
Ok(result) => {
555+
let next_has_more = result.page_token.is_some();
556+
let next_page_token = result.page_token;
557+
let objects = result.result.objects;
558+
559+
let next_state = (store, prefix, offset, next_page_token, next_has_more);
560+
Some((objects, next_state))
561+
}
562+
Err(_e) => {
563+
// TODO: propagate error
564+
// For errors, return empty list and stop
565+
Some((Vec::new(), (store, prefix, offset, None, false)))
566+
}
567+
}
568+
},
569+
)
570+
.flat_map(|objects| futures::stream::iter(objects.into_iter().map(Ok)));
571+
572+
Box::pin(stream)
573+
}
574+
575+
fn create_filtered_stream(
576+
store: Arc<dyn ObjectStore>,
577+
prefix: Option<String>,
578+
offset: Option<String>,
579+
) -> BoxStream<'static, object_store::Result<ObjectMeta>> {
580+
// For substring filtering, we need to split the prefix into:
581+
// 1. A directory prefix for efficient listing
582+
// 2. A substring filter to apply to the results
583+
let (list_prefix, substring_filter) = if let Some(prefix_str) = &prefix {
584+
if let Some((dir_prefix, substring)) = prefix_str.rsplit_once('/') {
585+
(Some(dir_prefix.to_string()), Some(substring.to_string()))
586+
} else {
587+
(None, Some(prefix_str.clone()))
588+
}
589+
} else {
590+
(None, None)
591+
};
592+
593+
let prefix_path = list_prefix.map(|s| s.into());
594+
let base_stream = if let Some(offset) = offset {
595+
store.list_with_offset(prefix_path.as_ref(), &offset.into())
596+
} else {
597+
store.list(prefix_path.as_ref())
598+
};
599+
600+
// Apply substring filtering if needed
601+
let filtered_stream = if let Some(substring) = substring_filter {
602+
Box::pin(base_stream.filter_map(move |result| {
603+
let substring = substring.clone();
604+
async move {
605+
match result {
606+
Ok(meta) => {
607+
// Extract filename from path for substring matching
608+
let path_str = meta.location.as_ref();
609+
if let Some(filename) = path_str.split('/').last() {
610+
if filename.contains(&substring) {
611+
Some(Ok(meta))
612+
} else {
613+
None
614+
}
615+
} else {
616+
Some(Ok(meta))
617+
}
618+
}
619+
Err(e) => Some(Err(e)),
620+
}
621+
}
622+
}))
623+
} else {
624+
base_stream
625+
};
626+
627+
filtered_stream
628+
}

tests/test_list.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
import tempfile
2+
from pathlib import Path
3+
14
import polars as pl
25
import pyarrow as pa
36
import pytest
47
from arro3.core import RecordBatch, Table
58

6-
from obstore.store import MemoryStore
9+
from obstore.store import LocalStore, MemoryStore
710

811

912
def test_list():
@@ -130,6 +133,64 @@ async def test_list_with_delimiter_async():
130133
assert objects["path"][1].as_py() == "a/file2.txt"
131134

132135

136+
def test_list_substring_filtering():
    store = MemoryStore()

    # Populate the store with a mix of matching and non-matching files.
    fixtures = {
        "data/file1.txt": b"foo",
        "data/test_file.txt": b"bar",
        "data/another.csv": b"baz",
        "data/test_data.json": b"qux",
        "logs/test_log.txt": b"log",
    }
    for path, body in fixtures.items():
        store.put(path, body)

    # Listing with "data/test" should return only files under data/
    # whose filename contains "test".
    listed = store.list("data/test").collect()
    paths = [entry["path"] for entry in listed]

    assert "data/test_file.txt" in paths
    assert "data/test_data.json" in paths
    assert "data/file1.txt" not in paths
    assert "data/another.csv" not in paths
    assert "logs/test_log.txt" not in paths

    # The arrow-returning variant should yield a RecordBatch with the
    # same two matches.
    batch = store.list("data/test", return_arrow=True).collect()
    assert isinstance(batch, RecordBatch)
    assert batch.num_rows == 2
162+
163+
164+
def test_list_substring_filtering_local_store():
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)
        store = LocalStore(root)

        # Lay out a data/ directory with matching and non-matching files.
        data_dir = root / "data"
        data_dir.mkdir(parents=True, exist_ok=True)
        fixtures = {
            "file1.txt": "foo",
            "test_file.txt": "bar",
            "another.csv": "baz",
            "test_data.json": "qux",
        }
        for name, text in fixtures.items():
            (data_dir / name).write_text(text)

        # Only filenames under data/ containing "test" should be listed.
        listed = store.list("data/test").collect()
        paths = [entry["path"] for entry in listed]

        assert "data/test_file.txt" in paths
        assert "data/test_data.json" in paths
        assert "data/file1.txt" not in paths
        assert "data/another.csv" not in paths
192+
193+
133194
def test_list_as_arrow_to_polars():
134195
store = MemoryStore()
135196

0 commit comments

Comments
 (0)