Skip to content

Commit 8c5a4bc

Browse files
authored
Make gaggle_ls support recursive listing (#6)
1 parent d231ba6 commit 8c5a4bc

File tree

11 files changed

+276
-52
lines changed

11 files changed

+276
-52
lines changed

.github/workflows/dist_pipeline.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ on:
44
pull_request:
55
branches:
66
- main
7+
paths-ignore:
8+
- '**.md'
9+
- 'docs/**'
10+
- '.github/**'
711
push:
812
tags:
913
- 'v*'

.github/workflows/lints.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ on:
55
pull_request:
66
branches:
77
- main
8+
paths-ignore:
9+
- '**.md'
10+
- 'docs/**'
11+
- '.github/**'
812

913
permissions:
1014
contents: read

.github/workflows/tests.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ on:
55
pull_request:
66
branches:
77
- main
8+
paths-ignore:
9+
- '**.md'
10+
- 'docs/**'
11+
- '.github/**'
812
push:
913
branches:
1014
- main

README.md

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ updates, etc.
3030
This workflow can quickly become complex, especially when working with multiple datasets or when datasets are updated
3131
frequently.
3232
Gaggle tries to help simplify this process by hiding the complexity and letting you work with datasets directly inside
33-
DuckDB that allow you to run fast analytical queries on the data.
33+
DuckDB that allows you to run fast analytical queries on the data.
3434

3535
In essence, Gaggle makes DuckDB into a SQL-enabled frontend for Kaggle datasets.
3636

@@ -97,24 +97,23 @@ make release
9797
-- Get extension version
9898
select gaggle_version();
9999

100-
-- List files in the dataset
100+
-- List files in the dataset (recursively)
101101
-- (Note that if the dataset is not downloaded, it will be downloaded and cached)
102102
select *
103-
from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
103+
from gaggle_ls('habedi/flickr-8k-dataset-clean', true) limit 5;
104104

105-
-- Read a Parquet file from local cache using a prepared statement
106-
-- (DuckDB doesn't allow the use of subqueries in function arguments, so we use a prepared statement)
105+
-- Read a Parquet file from the local cache using a prepared statement
107106
prepare rp as select * from read_parquet(?) limit 10;
108107
execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
109108

110109
-- Alternatively, we can use a replacement scan to read directly via `kaggle:` prefix
111110
select count(*)
112111
from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
113112

114-
-- Optionally, we check cache info
113+
-- Then, we check cache info
115114
select gaggle_cache_info();
116115

117-
-- Check if cached dataset is current (is newest version?)
116+
-- Check if cached dataset is current (is the newest version?)
118117
select gaggle_is_current('habedi/flickr-8k-dataset-clean');
119118
```
120119

docs/README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ The table below includes the information about all SQL functions exposed by Gagg
1717
| 11 | `gaggle_version_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns version info: `cached_version`, `latest_version`, `is_current`, `is_cached`. |
1818
| 12 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Expands a JSON object into newline-delimited JSON rows with fields: `key`, `value`, `type`, `path`. Users normally shouldn't use this function. |
1919
| 13 | `gaggle_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves a specific file's local path inside a downloaded dataset. |
20-
| 14 | `gaggle_ls(dataset_path VARCHAR)` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files (non-recursively) in the dataset's local directory; `size` is in MB. |
20+
| 14 | `gaggle_ls(dataset_path VARCHAR[, recursive BOOLEAN])` | `TABLE(name VARCHAR, size BIGINT, path VARCHAR)` | Lists files in the dataset's local directory; non-recursive by default. When `recursive=true` will walk subdirectories. `path` values are returned as `owner/dataset/<relative-path>` (not an absolute filesystem path); `size` is in MB. |
2121

2222
> [!NOTE]
2323
> * The `gaggle_file_path` function will retrieve and cache the file if it is not already downloaded; set
@@ -65,14 +65,22 @@ select gaggle_info('uciml/iris') as dataset_metadata;
6565
#### Reading Data
6666

6767
```sql
68-
-- List files as a table
68+
-- List files as a table (non-recursive)
6969
select *
7070
from gaggle_ls('uciml/iris') limit 5;
7171

72+
-- List files as a table (recursive)
73+
select *
74+
from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
75+
7276
-- List files as a JSON array
7377
select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
7478
from gaggle_ls('uciml/iris');
7579

80+
-- Note: returned `path` values are in the form 'owner/dataset/...',
81+
-- which are suitable for use with replacement scans or as identifiers inside the cache;
82+
-- to get an absolute filesystem path use `gaggle_file_path(owner_dataset, relative_path)`.
83+
7684
-- Resolve a file path and read it via a prepared statement
7785
prepare rp as select * from read_parquet(?) limit 10;
7886
execute rp(gaggle_file_path('owner/dataset', 'file.parquet'));

docs/examples/e1_core_functionality.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ select gaggle_download('uciml/iris') as download_path;
3030

3131
-- Section 5: list files (JSON)
3232
select '## list files (json)';
33+
-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/<relative-path>' (not absolute filesystem paths)
3334
select to_json(
3435
list(struct_pack(name := name, size := size, path := path))
3536
) as files_json
@@ -39,6 +40,10 @@ from gaggle_ls('uciml/iris');
3940
select '## list files (table)';
4041
select * from gaggle_ls('uciml/iris') limit 5;
4142

43+
-- Recursive listing example (walk subdirectories)
44+
select '## recursive listing example';
45+
select * from gaggle_ls('suganthidevasagayam/social-media-post-of-postpartum-depression', true) limit 10;
46+
4247
-- Section 6: get dataset metadata
4348
select '## get dataset metadata';
4449
select gaggle_info('uciml/iris') as dataset_metadata;

docs/examples/e2_advanced_features.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,18 @@ execute rp(gaggle_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'
1414

1515
-- Section 2: list and process multiple files
1616
select '## list and process dataset files (json and table)';
17+
-- Note: `path` values returned by gaggle_ls are of the form 'owner/dataset/<relative-path>' (not absolute filesystem paths)
1718
with files as (
1819
select to_json(list(struct_pack(name := name, size := size, path := path))) as files_json
1920
from gaggle_ls('habedi/flickr-8k-dataset-clean')
2021
)
2122
select files_json from files;
2223
select * from gaggle_ls('habedi/flickr-8k-dataset-clean') limit 5;
2324

25+
-- Recursive listing (example)
26+
select '## recursive listing example for flickr dataset';
27+
select * from gaggle_ls('habedi/flickr-8k-dataset-clean', true) limit 10;
28+
2429
-- Section 2b: use replacement scan for direct reads via `kaggle:` URLs
2530
select '## Replacement scan - direct reads via `kaggle:`';
2631
-- Single file read

external/duckdb

Submodule duckdb updated 1068 files

gaggle/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "gaggle"
3-
version = "0.1.0-alpha.3"
3+
version = "0.1.0-alpha.4"
44
edition = "2021"
55
publish = false
66

0 commit comments

Comments
 (0)