Skip to content

Commit 8c5a4bc

Browse files
authored
Make gaggle_ls support recursive listing (#6)
1 parent d231ba6 commit 8c5a4bc

File tree

11 files changed

+276
-52
lines changed

11 files changed

+276
-52
lines changed

.github/workflows/dist_pipeline.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ on:
44
pull_request:
55
branches:
66
- main
7+
paths-ignore:
8+
- '**.md'
9+
- 'docs/**'
10+
- '.github/**'
711
push:
812
tags:
913
- 'v*'

.github/workflows/lints.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ on:
55
pull_request:
66
branches:
77
- main
8+
paths-ignore:
9+
- '**.md'
10+
- 'docs/**'
11+
- '.github/**'
812

913
permissions:
1014
contents: read

.github/workflows/tests.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ on:
55
pull_request:
66
branches:
77
- main
8+
paths-ignore:
9+
- '**.md'
10+
- 'docs/**'
11+
- '.github/**'
812
push:
913
branches:
1014
- main

README.md

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ updates, etc.
3030
This workflow can quickly become complex, especially when working with multiple datasets or when datasets are updated
3131
frequently.
3232
Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets directly inside
33-
DuckDB that allow you to run fast analytical queries on the data.
33+
DuckDB that allows you to run fast analytical queries on the data.
3434

3535
In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets.
3636

@@ -97,24 +97,23 @@ make release
9797
-- Get extension version
9898
select gaggle_version();
9999

100-
-- List files in the dataset
100+
-- List files in the dataset (recursively)
101101
-- (Note that if the dataset is not downloaded, it will be downloaded and cached)
102102
select *
103-
from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
103+
from gaggle_ls('habedi/flickr-8k-dataset-clean', true) limit 5;
104104

105-
-- Read a Parquet file from local cache using a prepared statement
106-
-- (DuckDB doesn't allow the use of subqueries in function arguments, so we use a prepared statement)
105+
-- Read a Parquet file from the local cache using a prepared statement
107106
prepare rp as select * from read_parquet(?) limit 10;
108107
execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
109108

110109
-- Alternatively, we can use a replacement scan to read directly via `kaggle:` prefix
111110
select count(*)
112111
from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
113112

114-
-- Optionally, we check cache info
113+
-- Then, we check cache info
115114
select gaggle_cache_info();
116115

117-
-- Check if cached dataset is current (is newest version?)
116+
-- Check if cached dataset is current (is the newest version?)
118117
select gaggle_is_current('habedi/flickr-8k-dataset-clean');
119118
```
120119

docs/README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ The table below includes the information about all SQL functions exposed by Gagg
1717
| 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. |
1818
| 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. Users normally shouldn't use this function. |
1919
| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. |
20-
| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursively) in the dataset's local directory; `size` is in MB. |
20+
| 14 | `gaggle_ls(dataset_path VARCHAR[, recursive BOOLEAN])` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files in the dataset's local directory; non-recursive by default. When `recursive=true` will walk subdirectories. `path` values are returned as `owner/dataset/<relative-path>` (not an absolute filesystem path); `size` is in MB. |
2121

2222
> [!NOTE]
2323
> * The `gaggle_file_path` function will retrieve and cache the file if it is not already downloaded; set
@@ -65,14 +65,22 @@ select gaggle_info('uciml/iris') as dataset_metadata;
6565
#### Reading Data
6666

6767
```sql
68-
-- List files as a table
68+
-- List files as a table (non-recursive)
6969
select *
7070
from gaggle_ls('uciml/iris') limit 5;
7171

72+
-- List files as a table (recursive)
73+
select *
74+
from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
75+
7276
-- List files as a JSON array
7377
select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
7478
from gaggle_ls('uciml/iris');
7579

80+
-- Note: returned `path` values are in the form 'owner/dataset/...',
81+
-- which are suitable for use with replacement scans or as identifiers inside the cache;
82+
-- to get an absolute filesystem path use `gaggle_file_path(owner_dataset, relative_path)`.
83+
7684
-- Resolve a file path and read it via a prepared statement
7785
prepare rp as select * from read_parquet(?) limit 10;
7886
execute rp(gaggle_file_path('owner/dataset', 'file.parquet'));

docs/examples/e1_core_functionality.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ select gaggle_download('uciml/iris') as download_path;
3030

3131
-- Section 5: list files (JSON)
3232
select '## list files (json)';
33+
-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/<relative-path>' (not absolute filesystem paths)
3334
select to_json(
3435
list(struct_pack(name := name, size := size, path := path))
3536
) as files_json
@@ -39,6 +40,10 @@ from gaggle_ls('uciml/iris');
3940
select '## list files (table)';
4041
select * from gaggle_ls('uciml/iris') limit 5;
4142

43+
-- Recursive listing example (walk subdirectories)
44+
select '## recursive listing example';
45+
select * from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
46+
4247
-- Section 6: get dataset metadata
4348
select '## get dataset metadata';
4449
select gaggle_info('uciml/iris') as dataset_metadata;

docs/examples/e2_advanced_features.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,18 @@ execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'
1414

1515
-- Section 2: list and process multiple files
1616
select '## list and process dataset files (json and table)';
17+
-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/<relative-path>' (not absolute filesystem paths)
1718
with files as (
1819
select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
1920
from gaggle_ls('habedi/flickr-8k-dataset-clean')
2021
)
2122
select files_json from files;
2223
select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
2324

25+
-- Recursive listing (example)
26+
select '## recursive listing example for flickr dataset';
27+
select * from gaggle_ls('habedi/flickr-8k-dataset-clean', true) limit 10;
28+
2429
-- Section 2b: use replacement scan for direct reads via `kaggle:` URLs
2530
select '## Replacement scan - direct reads via `kaggle:`';
2631
-- Single file read

external/duckdb

Submodule duckdb updated 1068 files

gaggle/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "gaggle"
3-
version = "0.1.0-alpha.3"
3+
version = "0.1.0-alpha.4"
44
edition = "2021"
55
publish = false
66

0 commit comments

Comments
 (0)