From 0272fefbcac664672fd2d5fa874da6487311f023 Mon Sep 17 00:00:00 2001 From: Nico Wagner Date: Wed, 5 Feb 2025 15:17:48 +0000 Subject: [PATCH] feat(datashed): add `select` command Signed-off-by: Nico Wagner --- Cargo.lock | 242 ++++++++++++++----------- crates/datashed/src/cli.rs | 1 + crates/datashed/src/commands/mod.rs | 2 + crates/datashed/src/commands/select.rs | 182 +++++++++++++++++++ crates/datashed/src/main.rs | 1 + 5 files changed, 322 insertions(+), 106 deletions(-) create mode 100644 crates/datashed/src/commands/select.rs diff --git a/Cargo.lock b/Cargo.lock index cc01b9f..64e7419 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -88,7 +88,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -205,7 +205,7 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -231,7 +231,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", "zerocopy", @@ -381,18 +381,18 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "async-trait" -version = "0.1.85" +version = "0.1.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" +checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -477,7 +477,7 @@ checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", - "brotli-decompressor 4.0.1", + "brotli-decompressor 4.0.2", ] [[package]] @@ -492,9 +492,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.1" +version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -513,9 +513,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "bytemuck" @@ -534,7 +534,7 @@ checksum = "3fa76293b4f7bb636ab88fd78228235b5248b4d05cc589aed610f954af5d7c7a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -545,9 +545,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" [[package]] name = "bytestring" @@ -569,9 +569,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.10" +version = "1.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" +checksum = "755717a7de9ec452bf7f3f1a3099085deabd7f2962b861dae91ecd7a365903d2" dependencies = [ "jobserver", "libc", @@ -619,9 +619,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.26" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783" +checksum = "3e77c3243bd94243c03672cb5154667347c457ca271254724f9f393aee1c05ff" dependencies = [ "clap_builder", "clap_derive", @@ -629,9 +629,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.26" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" dependencies = [ "anstream", "anstyle", @@ -642,23 +642,23 @@ dependencies = [ [[package]] name = "clap_complete" -version = "4.5.42" +version = "4.5.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33a7e468e750fa4b6be660e8b5651ad47372e8fb114030b594c2d75d48c5ffd0" +checksum = "375f9d8255adeeedd51053574fd8d4ba875ea5fa558e86617b07f09f1680c8b6" dependencies = [ "clap", ] [[package]] name = "clap_derive" -version = "4.5.24" +version = "4.5.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" +checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -741,7 +741,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -781,9 +781,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -880,9 +880,9 @@ dependencies = [ [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -991,15 +991,15 @@ dependencies = [ [[package]] name = "derive_more" -version = "0.99.18" +version = "0.99.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" +checksum = "3da29a38df43d6f156149c9b43ded5e018ddff2a855cf2cfd62e8cd7d079c69f" dependencies = [ "convert_case", "proc-macro2", "quote", "rustc_version", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1054,14 +1054,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "dyn-clone" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" +checksum = "feeef44e73baff3a26d371801df019877a9866a8c493d315ab00177843314f35" [[package]] name = "either" @@ -1093,7 +1093,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1292,7 +1292,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1344,10 +1344,22 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets 0.52.6", +] + [[package]] name = "gimli" version = "0.31.1" @@ -1513,9 +1525,9 @@ checksum = "21dec9db110f5f872ed9699c3ecf50cf16f423502706ba5c72462e28d3157573" [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -1540,9 +1552,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.5.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -1748,7 +1760,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -1810,9 +1822,9 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.17.9" +version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ "console", "number_prefix", @@ -2215,7 +2227,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.48.0", ] @@ -2227,7 +2239,7 @@ checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -2255,9 +2267,9 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +checksum = "0dab59f8e050d5df8e4dd87d9206fb6f65a483e20ac9fda365ade4fab353196c" dependencies = [ "libc", "log", @@ -2433,9 +2445,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.68" +version = "0.10.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6" dependencies = [ "bitflags", "cfg-if", @@ -2454,20 +2466,20 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-sys" -version = "0.9.104" +version = "0.9.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc" dependencies = [ "cc", "libc", @@ -2566,7 +2578,7 @@ dependencies = [ [[package]] name = "pica-record" version = "0.1.0" -source = "git+https://github.com/deutsche-nationalbibliothek/pica-rs.git?branch=main#c5557c3fb62385919f55ae5ed0883f4b5467d817" +source = "git+https://github.com/deutsche-nationalbibliothek/pica-rs.git?branch=main#15709fff3234833957aa45076413929a18d65857" dependencies = [ "bstr", "flate2", @@ -2612,7 +2624,7 @@ version = "0.45.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c0af18ae021b0396c42f39396146332957ebc4d4d25d931b4fe73509948f348" dependencies = [ - "getrandom", + "getrandom 0.2.15", "polars-arrow", "polars-core", "polars-error", @@ -2641,7 +2653,7 @@ dependencies = [ "dyn-clone", "either", "ethnum", - "getrandom", + "getrandom 0.2.15", "hashbrown 0.15.2", "itoap", "lz4", @@ -3198,7 +3210,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -3263,7 +3275,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3281,7 +3293,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "libredox", "thiserror 2.0.11", ] @@ -3303,7 +3315,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3394,7 +3406,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin", "untrusted", @@ -3418,9 +3430,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.43" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ "bitflags", "errno", @@ -3431,9 +3443,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.21" +version = "0.23.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" +checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" dependencies = [ "once_cell", "rustls-pki-types", @@ -3453,9 +3465,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" [[package]] name = "rustls-webpki" @@ -3476,9 +3488,9 @@ checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "schannel" @@ -3555,14 +3567,14 @@ checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] name = "serde_json" -version = "1.0.137" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ "indexmap", "itoa", @@ -3663,7 +3675,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa2bcf6c6e164e81bc7a5d49fc6988b3d515d9e8c07457d7b74ffb9324b9cd40" dependencies = [ "ahash", - "getrandom", + "getrandom 0.2.15", "halfbrown", "once_cell", "ref-cast", @@ -3808,7 +3820,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3821,7 +3833,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3843,9 +3855,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.96" +version = "2.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" dependencies = [ "proc-macro2", "quote", @@ -3869,7 +3881,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3925,13 +3937,13 @@ checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5" [[package]] name = "tempfile" -version = "3.15.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", "fastrand", - "getrandom", + "getrandom 0.3.1", "once_cell", "rustix", "windows-sys 0.59.0", @@ -3982,7 +3994,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -3993,7 +4005,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4087,7 +4099,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4147,9 +4159,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" dependencies = [ "indexmap", "serde", @@ -4225,9 +4237,9 @@ checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-normalization" @@ -4309,11 +4321,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.12.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4" +checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" dependencies = [ - "getrandom", + "getrandom 0.3.1", ] [[package]] @@ -4361,6 +4373,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -4383,7 +4404,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-shared", ] @@ -4418,7 +4439,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4513,7 +4534,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4524,7 +4545,7 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4716,13 +4737,22 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "86e376c75f4f43f44db463cf729e0d3acbf954d13e22c51e26e4c264b4ab545f" dependencies = [ "memchr", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags", +] + [[package]] name = "write16" version = "1.0.0" @@ -4772,7 +4802,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -4794,7 +4824,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] @@ -4814,7 +4844,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", "synstructure", ] @@ -4843,7 +4873,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.96", + "syn 2.0.98", ] [[package]] diff --git a/crates/datashed/src/cli.rs b/crates/datashed/src/cli.rs index f6a5957..e170a12 100644 --- a/crates/datashed/src/cli.rs +++ b/crates/datashed/src/cli.rs @@ -34,6 +34,7 @@ pub(crate) enum Command { Lfreq(Lfreq), Rate(Rate), Restore(Restore), + Select(Select), Serve(Serve), Status(Status), Summary(Summary), diff --git a/crates/datashed/src/commands/mod.rs b/crates/datashed/src/commands/mod.rs index 51692bd..47f425e 100644 --- a/crates/datashed/src/commands/mod.rs +++ b/crates/datashed/src/commands/mod.rs @@ -9,6 +9,7 @@ pub(crate) use init::Init; pub(crate) use lfreq::Lfreq; pub(crate) use rate::Rate; pub(crate) use restore::Restore; +pub(crate) use select::Select; pub(crate) use serve::Serve; pub(crate) use status::Status; pub(crate) use summary::Summary; @@ -28,6 +29,7 @@ mod init; mod lfreq; mod rate; mod restore; +mod select; mod serve; mod status; mod summary; diff --git a/crates/datashed/src/commands/select.rs b/crates/datashed/src/commands/select.rs new file mode 100644 index 0000000..3fe3809 --- /dev/null +++ b/crates/datashed/src/commands/select.rs @@ -0,0 +1,182 @@ +use std::ffi::OsStr; +use std::fs::File; +use std::io::stdout; +use std::path::PathBuf; + +use clap::Parser; +use polars::prelude::*; +use polars::sql::SQLContext; + +use crate::prelude::*; + +/// Select data from the index. +#[derive(Debug, Default, Parser)] +pub(crate) struct Select { + /// Run verbosely. Print additional progress information to the + /// standard error stream. This option conflicts with the + /// `--quiet` option. + #[arg(short, long, conflicts_with = "quiet")] + verbose: bool, + + /// Operate quietly; do not show progress. This option conflicts + /// with the `--verbose` option. + #[arg(short, long, conflicts_with = "verbose")] + quiet: bool, + + /// Ingore the datashed's index and use `filename` instead. + #[arg(long, short = 'I')] + index: Option, + + /// Ignore documents which are *not* explicitly listed in the given + /// allow-lists. + #[arg(long = "allow-list", short = 'A')] + allow_list: Option, + + /// Ignore documents which are explicitly listed in the given + /// deny-lists. + #[arg(long = "deny-list", short = 'D')] + deny_list: Option, + + /// Whether to append to an existing file or not. + #[arg(long, short = 'a', requires = "output")] + append: bool, + + /// Write the sub-index into `filename`. By default output will be + /// written in CSV format to the standard output (`stdout`). + #[arg(short, long, value_name = "filename")] + output: Option, + + /// An optional predicate to filter the document-set. + #[arg(long = "where")] + predicate: Option, + + #[arg(long, default_value = "idn", conflicts_with_all = ["left_on", "right_on"])] + on: String, + + #[arg(long, requires = "right_on", conflicts_with = "on")] + left_on: Option, + + #[arg(long, requires = "left_on", conflicts_with = "on")] + right_on: Option, + + /// The columns names of the index to select. + #[arg(long, short, default_value = "*")] + columns: String, +} + +fn read_filter_list(path: PathBuf) -> DatashedResult { + Ok(match path.extension().and_then(OsStr::to_str) { + Some("ipc" | "arrow") => IpcReader::new(File::open(path)?) + .memory_mapped(None) + .finish()?, + _ => CsvReadOptions::default() + .with_has_header(true) + .with_infer_schema_length(Some(0)) + .try_into_reader_with_file_path(Some(path))? + .finish()?, + }) +} + +impl Select { + pub(crate) fn execute(self) -> DatashedResult<()> { + let datashed = Datashed::discover()?; + let index = if let Some(path) = self.index { + IpcReader::new(File::open(path)?) + .memory_mapped(None) + .finish()? + } else { + datashed.index()? + }; + + let columns = self.columns; + let mut ctx = SQLContext::new(); + ctx.register("df", index.lazy()); + + let mut df = if let Some(predicate) = self.predicate { + ctx.execute(&format!( + "SELECT {columns} FROM df WHERE {predicate}", + ))? + } else { + ctx.execute(&format!("SELECT {columns} FROM df"))? + }; + + let left_on: Vec<_> = self + .left_on + .unwrap_or(self.on.clone()) + .split(',') + .map(str::trim) + .map(col) + .collect(); + + let right_on: Vec<_> = self + .right_on + .unwrap_or(self.on) + .split(',') + .map(str::trim) + .map(col) + .collect(); + + if let Some(path) = self.allow_list { + df = df.join( + read_filter_list(path)?.lazy(), + left_on.clone(), + right_on.clone(), + JoinArgs::new(JoinType::Semi), + ); + } + + if let Some(path) = self.deny_list { + df = df.join( + read_filter_list(path)?.lazy(), + left_on, + right_on, + JoinArgs::new(JoinType::Anti), + ); + } + + let mut df = df.collect()?; + + if let Some(ref path) = self.output { + match path.extension().and_then(OsStr::to_str) { + Some("csv") => { + if self.append { + let existing = CsvReadOptions::default() + .with_has_header(true) + .with_infer_schema_length(Some(0)) + .try_into_reader_with_file_path(Some( + path.into(), + ))? + .finish()?; + + df = existing.vstack(&df)?; + } + + let mut writer = + CsvWriter::new(File::create(path)?); + writer.finish(&mut df)?; + } + _ => { + if self.append { + let existing = + IpcReader::new(File::open(path)?) + .memory_mapped(None) + .finish()?; + df = existing.vstack(&df)?; + } + + let mut writer = + IpcWriter::new(File::create(path)?) + .with_compression(Some( + IpcCompression::ZSTD, + )); + writer.finish(&mut df)?; + } + } + } else { + let mut writer = CsvWriter::new(stdout().lock()); + writer.finish(&mut df)?; + } + + Ok(()) + } +} diff --git a/crates/datashed/src/main.rs b/crates/datashed/src/main.rs index df04c75..24e785a 100644 --- a/crates/datashed/src/main.rs +++ b/crates/datashed/src/main.rs @@ -54,6 +54,7 @@ async fn run(args: Args) -> DatashedResult<()> { Command::Lfreq(cmd) => cmd.execute(), Command::Restore(cmd) => cmd.execute(), Command::Rate(cmd) => cmd.execute().await, + Command::Select(cmd) => cmd.execute(), Command::Serve(cmd) => cmd.execute().await, Command::Status(cmd) => cmd.execute(), Command::Summary(cmd) => cmd.execute(),